<a href="https://colab.research.google.com/github/hamidzangiabadi/LSTM-HHO-IoMT-Colab/blob/colab/LSTMHHOWithFeatureEngineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.feature_selection import mutual_info_classif
import os
import warnings
import gc
import time

warnings.filterwarnings('ignore')

def setup_device():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()
        device = torch.device('cuda:0')
        print("="*70)
        print(f"🎮 GPU: {torch.cuda.get_device_name(0)}")
        print("="*70 + "\n")
        return device
    return torch.device('cpu')

# ==================== حذف ویژگی‌های Leaky ====================
def remove_leaky_features(df: pd.DataFrame):
    """حذف ویژگی‌های که باعث Data Leakage می‌شوند"""
    print("="*70)
    print("🔒 REMOVING POTENTIALLY LEAKY FEATURES")
    print("="*70)

    # ستون‌های مشکوک به Leakage
    leaky_patterns = ['Unnamed', 'index', 'id', 'ID', 'Index', 'Number']

    original_cols = df.columns.tolist()
    cols_to_drop = []

    for col in original_cols:
        for pattern in leaky_patterns:
            if pattern.lower() in col.lower():
                cols_to_drop.append(col)
                break

    if cols_to_drop:
        print(f"⚠️  Dropping {len(cols_to_drop)} suspicious columns:")
        for col in cols_to_drop:
            print(f"   - {col}")
        df = df.drop(columns=cols_to_drop, errors='ignore')
    else:
        print("✅ No suspicious columns found")

    print("="*70 + "\n")
    return df

# ==================== تحلیل کیفیت داده (ساده‌شده) ====================
def analyze_data_simple(df: pd.DataFrame):
    """تحلیل ساده داده"""
    print("="*70)
    print("📊 DATA ANALYSIS")
    print("="*70)

    print(f"\n✅ Shape: {df.shape}")

    # توزیع کلاس
    print("\n📊 Class Distribution:")
    label_counts = df['label'].value_counts()
    print(label_counts)
    print("\nPercentage:")
    print((label_counts / len(df) * 100).round(2))

    # Missing values
    missing = df.isnull().sum().sum()
    print(f"\n✅ Missing values: {missing}")

    print("="*70 + "\n")

# ==================== Feature Selection ====================
def select_features_robust(X: pd.DataFrame, y: np.ndarray, k: int = 20):
    """انتخاب ویژگی با MI (بدون ستون‌های ID-like)"""
    print("="*70)
    print("🎯 FEATURE SELECTION (Robust)")
    print("="*70)

    mi_scores = mutual_info_classif(X, y, random_state=42)
    mi_scores = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)

    print(f"\n📊 Top {min(k, len(mi_scores))} features:")
    print(mi_scores.head(k))

    # انتخاب features با MI معنادار
    threshold = 0.01  # حداقل MI
    selected = mi_scores[mi_scores > threshold].head(k).index.tolist()

    print(f"\n✅ Selected {len(selected)} features (MI > {threshold})")
    print("="*70 + "\n")

    return selected

# ==================== Dataset ====================
class CleanCIC2024Dataset(Dataset):
    def __init__(self, X: np.ndarray, y: np.ndarray, num_classes: int, augment: bool = False):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y.astype(np.int64), dtype=torch.long)
        self.num_classes = num_classes
        self.augment = augment

        unique, counts = np.unique(y, return_counts=True)
        self.class_weights = torch.zeros(num_classes)
        for cls, count in zip(unique, counts):
            self.class_weights[cls] = len(y) / (num_classes * count)

        self.sample_weights = torch.tensor([self.class_weights[label].item() for label in y])

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        x, y = self.X[idx], self.y[idx]

        if self.augment and torch.rand(1).item() > 0.5:
            noise = torch.randn_like(x) * 0.01
            x = x + noise

        return x, y

    def get_sample_weights(self):
        return self.sample_weights

# ==================== پیش‌پردازش نهایی ====================
def preprocess_final(
    file_path: str,
    seq_len: int = 10,
    batch_size: int = 128,
    top_k_features: int = 20
):
    print("="*70)
    print("🔧 FINAL PREPROCESSING (No Leakage)")
    print("="*70 + "\n")

    df = pd.read_csv(file_path)
    print(f"✅ Loaded: {len(df):,} rows\n")

    # حذف ویژگی‌های Leaky
    df = remove_leaky_features(df)

    # تحلیل
    analyze_data_simple(df)

    df_clean = df.dropna()

    # جداسازی X و y
    X = df_clean.select_dtypes(include=[np.number]).drop(columns=['label'], errors='ignore')
    y = df_clean['label'].values

    print(f"✅ Features before selection: {X.shape[1]}")

    # Encoding
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    num_classes = len(np.unique(y_encoded))

    # Feature Selection (بدون ID columns)
    selected_features = select_features_robust(X, y_encoded, k=top_k_features)
    X = X[selected_features]

    print(f"✅ Features after selection: {X.shape[1]}")
    print(f"✅ Classes: {num_classes}")
    print(f"✅ Samples: {len(X):,}\n")

    # Scaling
    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled = np.nan_to_num(X_scaled, nan=0.0, posinf=0.0, neginf=0.0)

    # Sequences
    X_seq, y_seq = [], []
    for i in range(len(X_scaled) - seq_len):
        X_seq.append(X_scaled[i:i+seq_len])
        y_seq.append(y_encoded[i+seq_len])

    X_seq = np.array(X_seq, dtype=np.float32)
    y_seq = np.array(y_seq, dtype=np.int64)

    # Stratified Split
    X_temp, X_test, y_temp, y_test = train_test_split(
        X_seq, y_seq, test_size=0.2, random_state=42, stratify=y_seq
    )

    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=0.2, random_state=42, stratify=y_temp
    )

    print(f"✅ Train: {len(X_train):,}, Val: {len(X_val):,}, Test: {len(X_test):,}\n")

    # Datasets
    train_dataset = CleanCIC2024Dataset(X_train, y_train, num_classes, augment=True)
    val_dataset = CleanCIC2024Dataset(X_val, y_val, num_classes, augment=False)
    test_dataset = CleanCIC2024Dataset(X_test, y_test, num_classes, augment=False)

    sampler = WeightedRandomSampler(
        weights=train_dataset.get_sample_weights(),
        num_samples=len(train_dataset),
        replacement=True
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler, num_workers=2, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)

    input_size = X_seq.shape[2]
    class_weights = train_dataset.class_weights

    print(f"⚖️  Class Weights: {class_weights}")
    print("="*70 + "\n")

    return train_loader, val_loader, test_loader, input_size, num_classes, label_encoder, class_weights

# ==================== مدل ====================
class SimpleLSTMModel(nn.Module):
    def __init__(self, input_size: int, hidden_size: int, num_layers: int, num_classes: int, dropout: float = 0.3):
        super(SimpleLSTMModel, self).__init__()

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
            bidirectional=True
        )

        self.bn = nn.BatchNorm1d(hidden_size * 2)
        self.fc1 = nn.Linear(hidden_size * 2, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = out[:, -1, :]
        out = self.bn(out)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out

# ==================== آموزش ====================
def train_final(model, train_loader, val_loader, device, class_weights, epochs=20, lr=0.001):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=3, factor=0.5)

    best_val_acc = 0.0
    patience_counter = 0

    for epoch in range(epochs):
        # Train
        model.train()
        train_correct, train_total = 0, 0

        for X_batch, y_batch in train_loader:
            X_batch = X_batch.to(device, non_blocking=True)
            y_batch = y_batch.to(device, non_blocking=True)

            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)

            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            _, predicted = torch.max(outputs, 1)
            train_total += y_batch.size(0)
            train_correct += (predicted == y_batch).sum().item()

        train_acc = 100.0 * train_correct / train_total

        # Val
        model.eval()
        val_correct, val_total = 0, 0
        all_preds, all_labels = [], []

        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch = X_batch.to(device, non_blocking=True)
                y_batch = y_batch.to(device, non_blocking=True)

                outputs = model(X_batch)
                _, predicted = torch.max(outputs, 1)
                val_total += y_batch.size(0)
                val_correct += (predicted == y_batch).sum().item()

                all_preds.extend(predicted.cpu().numpy())
                all_labels.extend(y_batch.cpu().numpy())

        val_acc = 100.0 * val_correct / val_total
        val_f1 = f1_score(all_labels, all_preds, average='weighted') * 100

        scheduler.step(val_acc)

        print(f"Epoch {epoch+1}/{epochs} | Train: {train_acc:.2f}% | Val: {val_acc:.2f}% | F1: {val_f1:.2f}%")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
            torch.save(model.state_dict(), 'best_model_clean.pth')
        else:
            patience_counter += 1
            if patience_counter >= 5:
                print(f"\n⚠️  Early Stopping")
                break

    model.load_state_dict(torch.load('best_model_clean.pth'))
    return best_val_acc

# ==================== ارزیابی ====================
def evaluate_final(model, test_loader, device, label_encoder):
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch = X_batch.to(device, non_blocking=True)
            y_batch = y_batch.to(device, non_blocking=True)

            outputs = model(X_batch)
            _, predicted = torch.max(outputs, 1)

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(y_batch.cpu().numpy())

    print("\n" + "="*70)
    print("📊 FINAL TEST RESULTS")
    print("="*70)
    print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_))

    cm = confusion_matrix(all_labels, all_preds)
    print("\nConfusion Matrix:")
    print(cm)

    accuracy = 100.0 * (np.array(all_preds) == np.array(all_labels)).sum() / len(all_labels)
    f1 = f1_score(all_labels, all_preds, average='weighted') * 100

    print(f"\n✅ Test Accuracy: {accuracy:.2f}%")
    print(f"✅ Test F1-Score: {f1:.2f}%")
    print("="*70 + "\n")

    return accuracy, f1

# ==================== Main ====================
def main():
    print("\n" + "🎯"*35)
    print("CLEAN LSTM (No Data Leakage)")
    print("🎯"*35 + "\n")

    device = setup_device()

    train_loader, val_loader, test_loader, input_size, num_classes, label_encoder, class_weights = \
        preprocess_final(
            file_path='final_sample_dataset.csv',
            seq_len=10,
            batch_size=128,
            top_k_features=20
        )

    print("🔨 Building Model...")
    model = SimpleLSTMModel(
        input_size=input_size,
        hidden_size=64,
        num_layers=2,
        num_classes=num_classes,
        dropout=0.3
    )

    print("\n🚀 Training...")
    start = time.time()

    val_acc = train_final(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        device=device,
        class_weights=class_weights,
        epochs=20,
        lr=0.001
    )

    elapsed = time.time() - start
    print(f"\n⏱️  Training Time: {elapsed:.2f}s")

    test_acc, test_f1 = evaluate_final(model, test_loader, device, label_encoder)

    print(f"🏆 FINAL:")
    print(f"   Val Acc: {val_acc:.2f}%")
    print(f"   Test Acc: {test_acc:.2f}%")
    print(f"   Test F1: {test_f1:.2f}%\n")

if __name__ == "__main__":
    main()



🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯
CLEAN LSTM (No Data Leakage)
🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯

🎮 GPU: Tesla T4

🔧 FINAL PREPROCESSING (No Leakage)

✅ Loaded: 38,582 rows

🔒 REMOVING POTENTIALLY LEAKY FEATURES
⚠️  Dropping 10 suspicious columns:
   - Unnamed: 0.1
   - Unnamed: 0
   - fin_flag_number
   - syn_flag_number
   - rst_flag_number
   - psh_flag_number
   - ack_flag_number
   - ece_flag_number
   - cwr_flag_number
   - Number

📊 DATA ANALYSIS

✅ Shape: (38582, 32)

📊 Class Distribution:
label
-1.0    19291
 1.0    19291
Name: count, dtype: int64

Percentage:
label
-1.0    50.0
 1.0    50.0
Name: count, dtype: float64

✅ Missing values: 0

✅ Features before selection: 31
🎯 FEATURE SELECTION (Robust)

📊 Top 20 features:
Header_Length    0.669052
Tot size         0.662800
AVG              0.662725
Tot sum          0.661843
Max              0.653617
IAT              0.652756
Rate             0.651521
ack_count        0.634988
Time_To_Live     0.598907
Std              0.590

TypeError: object of type 'numpy.float64' has no len()

In [4]:
import pandas as pd
import numpy as np

# بررسی توزیع
df = pd.read_csv('final_sample_dataset.csv')
print("📊 توزیع کلاس‌ها:")
print(df['label'].value_counts())
print("\nدرصد:")
print(df['label'].value_counts(normalize=True) * 100)


📊 توزیع کلاس‌ها:
label
-1.0    19291
 1.0    19291
Name: count, dtype: int64

درصد:
label
-1.0    50.0
 1.0    50.0
Name: proportion, dtype: float64
