In [14]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import math
import copy

In [15]:
# Model Hyperparameters
SEQUENCE_LENGTH = 10
D_MODEL = 128
N_HEADS = 4
NUM_ENCODER_LAYERS = 3
DROPOUT_RATE = 0.1
LEARNING_RATE = 1e-4
SERVER_FINETUNE_EPOCHS = 3 

In [16]:
# Federated Learning Hyperparameters
NUM_CLIENTS = 5
NUM_ROUNDS = 100     # Number of communication rounds between server and clients
EPOCHS_PER_CLIENT = 1 # Number of local training epochs for each client on its data
BATCH_SIZE = 128
dirichlet_alpha = 100
MU = 1 

In [17]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"--- Using {device.upper()} device for training and evaluation ---")

--- Using CUDA device for training and evaluation ---


In [18]:
# List of all data files to be loaded
files_to_load = [
    'train_basic.csv',
    'test_basic.csv',
    'train_semiurban.csv',
    'test_semiurban.csv',
    'test_rural.csv'
]

In [19]:
list_of_dfs = []
for filename in files_to_load:
    path = f"datasets/{filename}"
    try:
        df = pd.read_csv(path)
        # Optional: Add a column to track the origin of the data
        df['origin'] = filename.split('.')[0]
        list_of_dfs.append(df)
        print(f"Successfully loaded {filename}")
    except FileNotFoundError:
        print(f"Warning: Could not find file {filename}. Skipping.")

  df = pd.read_csv(path)


Successfully loaded train_basic.csv


  df = pd.read_csv(path)


Successfully loaded test_basic.csv


  df = pd.read_csv(path)


Successfully loaded train_semiurban.csv


  df = pd.read_csv(path)


Successfully loaded test_semiurban.csv


  df = pd.read_csv(path)


Successfully loaded test_rural.csv


In [20]:
for i in list_of_dfs:
    print(i.shape)

(43204, 473)
(43204, 473)
(11116, 3568)
(10803, 3568)
(43206, 1897)


In [21]:
common_columns = set(list_of_dfs[0].columns)

for df in list_of_dfs[1:]:
    common_columns.intersection_update(df.columns)

common_columns = list(common_columns)
if 'malicious' not in common_columns:
    print("Warning: 'malicious' column not common to all files.")

print(f"\nFound {len(common_columns)} features common to ALL datasets. Using these for training.")

# Filter all dataframes to use ONLY the common columns
list_of_cleaned_dfs = [df[common_columns] for df in list_of_dfs]


Found 219 features common to ALL datasets. Using these for training.


In [22]:
list_of_cleaned_dfs = [df[common_columns] for df in list_of_dfs]

In [23]:
full_df = pd.concat(list_of_cleaned_dfs, ignore_index=True)
print(f"✅ All datasets combined using common features. Full dataset shape: {full_df.shape}")

✅ All datasets combined using common features. Full dataset shape: (151533, 219)


In [24]:
def map_malicious_to_numeric(value):
    val_str = str(value).strip()
    if val_str == 'False': return 0
    elif 'benign' in val_str: return 1
    elif val_str.isdigit(): return 2
    else: return 0

full_df['malicious'] = full_df['malicious'].apply(map_malicious_to_numeric)

In [25]:
full_df.shape

(151533, 219)

In [26]:
# Remove constant numerical columns based on the full dataset
std_dev_full = full_df.std(numeric_only=True)
cols_to_drop = std_dev_full[std_dev_full == 0].index.tolist()

if cols_to_drop:
    print(f"Found and dropping {len(cols_to_drop)} constant numerical columns.")
    full_df = full_df.drop(columns=cols_to_drop)
else:
    print("No constant numerical columns found.")

Found and dropping 28 constant numerical columns.


In [27]:
full_df.shape

(151533, 191)

In [28]:
full_df['malicious'].value_counts()

malicious
0    131433
2     18332
1      1768
Name: count, dtype: int64

In [29]:
# Unify all non-numeric columns to be strings
features_to_unify = full_df.select_dtypes(exclude=np.number).columns
features_to_unify = features_to_unify.drop('malicious', errors='ignore')

if not features_to_unify.empty:
    for col in features_to_unify:
        full_df[col] = full_df[col].astype(str)


In [30]:
# Creating Data Splits
X = full_df.drop(columns=['malicious'])
y = full_df['malicious']

X_pool, X_test, y_pool, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

X_client_pool, X_server, y_client_pool, y_server = train_test_split(
    X_pool, y_pool,
    test_size=0.05,
    random_state=42,
    stratify=y_pool 
)

print(f"Server fine-tuning set created with {len(X_server)} samples.")
print(f"Server set distribution:\n{y_server.value_counts(normalize=True)}")


pool_df = pd.concat([X_pool, y_pool], axis=1)

# --- Dirichlet distribution logic ---
num_classes = len(pool_df['malicious'].unique())

class_distribution = np.random.dirichlet([dirichlet_alpha] * num_classes, NUM_CLIENTS)

class_dfs = [pool_df[pool_df['malicious'] == i] for i in range(num_classes)]

# Distribute the data to clients according to the generated recipes
client_dfs = []
for client_id in range(NUM_CLIENTS):
    client_df_list = []
    for class_id in range(num_classes):
        # Get the recipe for this client and class
        proportion = class_distribution[client_id, class_id]
        # Calculate how many samples of this class the client gets
        num_samples = int(proportion * len(class_dfs[class_id]))
        # Take a random sample of that size
        class_sample = class_dfs[class_id].sample(num_samples, random_state=42)
        client_df_list.append(class_sample)
    # Combine the samples from all classes into this client's final dataset
    client_dfs.append(pd.concat(client_df_list))

print(f"✅ Data partitioned into {len(client_dfs)} non-IID client datasets using a Dirichlet distribution.")

Server fine-tuning set created with 5304 samples.
Server set distribution:
malicious
0    0.867270
2    0.121041
1    0.011689
Name: proportion, dtype: float64
✅ Data partitioned into 5 non-IID client datasets using a Dirichlet distribution.


In [31]:
# Verify the non-IID distribution
for i, client_df in enumerate(client_dfs):
    print(f"Client {i+1} data distribution:\n{client_df['malicious'].value_counts(normalize=True)}\n")

Client 1 data distribution:
malicious
0    0.873604
2    0.115420
1    0.010975
Name: proportion, dtype: float64

Client 2 data distribution:
malicious
0    0.857639
2    0.129694
1    0.012667
Name: proportion, dtype: float64

Client 3 data distribution:
malicious
0    0.869908
2    0.116689
1    0.013403
Name: proportion, dtype: float64

Client 4 data distribution:
malicious
0    0.890514
2    0.098408
1    0.011077
Name: proportion, dtype: float64

Client 5 data distribution:
malicious
0    0.880293
2    0.108839
1    0.010868
Name: proportion, dtype: float64



In [32]:
# Preprocess Data (Scale and Encode)
numerical_features = X_pool.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_pool.select_dtypes(exclude=np.number).columns.tolist()

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()) # Using MinMaxScaler for [0, 1] range
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)

In [33]:
preprocessor.fit(X_pool)
# Transform the global test set
X_test_processed = preprocessor.transform(X_test)

In [34]:
# Preparing Sequences 
def create_supervised_sequences(X_data, y_data, seq_len):
    sequences, labels = [], []
    for i in range(len(X_data) - seq_len):
        sequences.append(X_data[i:i + seq_len])
        labels.append(y_data.iloc[i + seq_len - 1])
    return np.array(sequences), np.array(labels)

In [35]:
# Transformer Classifier Model
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

class TransformerClassifier(nn.Module):
    def __init__(self, feature_size, d_model, n_heads, num_encoder_layers, num_classes, dropout_rate):
        super(TransformerClassifier, self).__init__()
        self.d_model = d_model
        self.input_embedding = nn.Linear(feature_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout_rate)
        encoder_layers = nn.TransformerEncoderLayer(d_model, n_heads, d_model * 4, dropout_rate, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_encoder_layers)
        self.decoder = nn.Linear(d_model, num_classes)

    def forward(self, src):
        src = self.input_embedding(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        # We take the output of the last time step to represent the whole sequence
        prediction = self.decoder(output[:, -1, :])
        return prediction

In [36]:
def client_update_fedavg(client_model, optimizer, criterion, train_loader, epochs):
    """Simulates a client training locally."""
    client_model.train()
    for epoch in range(epochs):
        for seqs, labels in train_loader:
            seqs, labels = seqs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = client_model(seqs)
            loss = criterion(outputs, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(client_model.parameters(), max_norm=1.0)
            optimizer.step()
    return client_model.state_dict()

def federated_average(model_weights):
    """Averages the weights from multiple client models."""
    avg_weights = copy.deepcopy(model_weights[0])
    for key in avg_weights.keys():
        for i in range(1, len(model_weights)):
            avg_weights[key] += model_weights[i][key]
        avg_weights[key] = torch.div(avg_weights[key], len(model_weights))
    return avg_weights

In [37]:
N_FEATURES = X_test_processed.shape[1] # Get feature count from processed data
NUM_CLASSES = len(y.unique())

global_model = TransformerClassifier(
    feature_size=N_FEATURES, d_model=D_MODEL, n_heads=N_HEADS,
    num_encoder_layers=NUM_ENCODER_LAYERS, num_classes=NUM_CLASSES, dropout_rate=DROPOUT_RATE
).to(device)

In [38]:
# This new function takes arguments and has the fix for the IndexError
def evaluate_global_model(model, preprocessor, X_test, y_test, sequence_length, batch_size):
    """
    Evaluates the final trained global model on the hold-out global test set.
    """
    print("\n--- Evaluating Global Model on Test Set ---")
    model.eval()
    device = next(model.parameters()).device

    # --- Prepare the Test Data ---
    X_test_processed = preprocessor.transform(X_test)

    # This is the inner function with the fix
    def create_supervised_sequences_robust(X_data, y_data, seq_len):
        sequences, labels = [], []
        # THE FIX: Reset the index of the y_data Series to ensure safe positional access
        y_data_reset = y_data.reset_index(drop=True)
        
        for i in range(len(X_data) - seq_len):
            sequences.append(X_data[i:i + seq_len])
            # Use the reset Series for lookup
            labels.append(y_data_reset.iloc[i + seq_len - 1])
        return np.array(sequences), np.array(labels)

    X_test_seq, y_test_seq = create_supervised_sequences_robust(X_test_processed, y_test, sequence_length)
    
    test_dataset = TensorDataset(torch.tensor(X_test_seq, dtype=torch.float32), torch.tensor(y_test_seq, dtype=torch.long))
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    # --- Make Predictions ---
    all_preds = []
    all_true = []
    with torch.no_grad():
        for seqs, labels in test_loader:
            seqs = seqs.to(device)
            outputs = model(seqs)
            _, predicted_labels = torch.max(outputs, 1)
            all_preds.extend(predicted_labels.cpu().numpy())
            all_true.extend(labels.numpy())

    # --- Display Results ---
    target_names = ['Normal (0)', 'Benign (1)', 'Attack (2)']
    print("\n--- Classification Report (Global Model) ---")
    print(classification_report(all_true, all_preds, target_names=target_names))


In [None]:
for round_idx in range(NUM_ROUNDS):
    print(f"\n--- Communication Round {round_idx+1}/{NUM_ROUNDS} ---")
    local_weights = []

    for client_id in range(NUM_CLIENTS):
        client_X = client_dfs[client_id].drop(columns=['malicious'])
        client_y = client_dfs[client_id]['malicious']
        X_client_processed = preprocessor.transform(client_X)
        X_client_seq, y_client_seq = create_supervised_sequences(X_client_processed, client_y, SEQUENCE_LENGTH)
        client_dataset = TensorDataset(torch.tensor(X_client_seq, dtype=torch.float32), torch.tensor(y_client_seq, dtype=torch.long))
        client_loader = DataLoader(client_dataset, batch_size=BATCH_SIZE, shuffle=True)
        
        local_model = copy.deepcopy(global_model).to(device)
        
        class_counts = client_y.value_counts().sort_index()
        weights = torch.ones(NUM_CLASSES, dtype=torch.float32)
        for class_idx, count in class_counts.items():
            if class_idx < len(weights):
                weights[class_idx] = 1.0 / count if count > 0 else 1.0
        weights = weights / weights.sum()
        criterion = nn.CrossEntropyLoss(weight=weights.to(device))
        optimizer = torch.optim.Adam(local_model.parameters(), lr=LEARNING_RATE)

        print(f"Client {client_id+1} training...")
        # Call the standard FedAvg update function
        updated_weights = client_update_fedavg(local_model, optimizer, criterion, client_loader, epochs=EPOCHS_PER_CLIENT)
        local_weights.append(updated_weights)
    
    print("Server aggregating client model weights...")
    global_weights = federated_average(local_weights)
    global_model.load_state_dict(global_weights)

    # --- NEW: Server Fine-Tuning Step ---
    print("Server fine-tuning the global model...")
    # Prepare the server's small, balanced dataset
    X_server_processed = preprocessor.transform(X_server)
    X_server_seq, y_server_seq = create_supervised_sequences(X_server_processed, y_server, SEQUENCE_LENGTH)
    server_dataset = TensorDataset(torch.tensor(X_server_seq, dtype=torch.float32), torch.tensor(y_server_seq, dtype=torch.long))
    server_loader = DataLoader(server_dataset, batch_size=BATCH_SIZE, shuffle=True)
    
    # Get the optimizer and criterion for the server fine-tuning step
    optimizer = torch.optim.Adam(global_model.parameters(), lr=LEARNING_RATE / 10) # Use a smaller LR for fine-tuning
    criterion = nn.CrossEntropyLoss() # No class weights needed as the data is balanced

    # Fine-tune the global model for a few epochs
    global_model.train()
    for _ in range(SERVER_FINETUNE_EPOCHS):
        for seqs, labels in server_loader:
            seqs, labels = seqs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = global_model(seqs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
    if (round_idx + 1) % 10 == 0:
        evaluate_global_model(
    model=global_model,
    preprocessor=preprocessor,
    X_test=X_test,
    y_test=y_test,
    sequence_length=SEQUENCE_LENGTH,
    batch_size=BATCH_SIZE
)


print("\n--- Federated Training Complete ---")



--- Communication Round 1/100 ---
Client 1 training...
Client 2 training...
Client 3 training...
Client 4 training...
Client 5 training...
Server aggregating client model weights...
Server fine-tuning the global model...

--- Communication Round 2/100 ---
Client 1 training...
Client 2 training...
Client 3 training...
Client 4 training...
Client 5 training...
Server aggregating client model weights...
Server fine-tuning the global model...

--- Communication Round 3/100 ---
Client 1 training...
Client 2 training...
Client 3 training...
Client 4 training...
Client 5 training...
Server aggregating client model weights...
Server fine-tuning the global model...

--- Communication Round 4/100 ---
Client 1 training...
Client 2 training...
Client 3 training...
Client 4 training...
Client 5 training...
Server aggregating client model weights...
Server fine-tuning the global model...

--- Communication Round 5/100 ---
Client 1 training...
Client 2 training...
Client 3 training...
Client 4 train