In [1]:
import pandas as pd
import numpy as np

##### Load Sample Dataset

In [2]:
# Load Sample Dataset
df = pd.read_csv('../data/processed/sample_dataset.csv')
df.shape


(1000, 88)

#### Cleanup Dataset

In [3]:
# Replace infinity with np.nan
df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [4]:
# Remove rows with any NaN values
nan_count = df.isna().sum().sum()
print(nan_count)

# Drop rows with any NaN values
df.dropna(inplace=True)
df.shape

80


(960, 88)

In [5]:
# Remove duplicate rows
duplicate_count = df.duplicated().sum()
print(duplicate_count)

# Drop duplicate rows
df.drop_duplicates(inplace=True)
df.shape


0


(960, 88)

#### Column Processing

In [6]:
# Stripping whitespace from column names and lowercasing
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

In [7]:
df.columns

Index(['unnamed:_0', 'flow_id', 'source_ip', 'source_port', 'destination_ip',
       'destination_port', 'protocol', 'timestamp', 'flow_duration',
       'total_fwd_packets', 'total_backward_packets',
       'total_length_of_fwd_packets', 'total_length_of_bwd_packets',
       'fwd_packet_length_max', 'fwd_packet_length_min',
       'fwd_packet_length_mean', 'fwd_packet_length_std',
       'bwd_packet_length_max', 'bwd_packet_length_min',
       'bwd_packet_length_mean', 'bwd_packet_length_std', 'flow_bytes/s',
       'flow_packets/s', 'flow_iat_mean', 'flow_iat_std', 'flow_iat_max',
       'flow_iat_min', 'fwd_iat_total', 'fwd_iat_mean', 'fwd_iat_std',
       'fwd_iat_max', 'fwd_iat_min', 'bwd_iat_total', 'bwd_iat_mean',
       'bwd_iat_std', 'bwd_iat_max', 'bwd_iat_min', 'fwd_psh_flags',
       'bwd_psh_flags', 'fwd_urg_flags', 'bwd_urg_flags', 'fwd_header_length',
       'bwd_header_length', 'fwd_packets/s', 'bwd_packets/s',
       'min_packet_length', 'max_packet_length', 'packet_le

#### Feature Seperation

In [8]:
# Seperating X (Input) & Y (Output)
y = df['label']
X = df.drop(columns=['label'])

X.shape, y.shape


((960, 87), (960,))

In [9]:
# Removing non-numeric features
non_numeric_columns = X.select_dtypes(exclude=['number']).columns
print(non_numeric_columns)

if len(non_numeric_columns) > 0:
  X = X.drop(columns=non_numeric_columns)

X.shape

Index(['flow_id', 'source_ip', 'destination_ip', 'timestamp'], dtype='object')


(960, 83)

#### Feature Selection

In [10]:
# A Standard Scaler is a normalization technique in machine learning that transforms numerical features so each of them has a mean of 0 and a standard deviation of 1. This process is known as standardization
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

In [11]:
# Instead of using all 70+ of your scaled numeric features, PCA combines them into a smaller set of new, super-features called "Principal Components." These new features are abstract, but they are engineered to capture the most important information (the most variance) from the original data.
from sklearn.decomposition import PCA

pca = PCA(n_components=24)

X_pca = pca.fit_transform(X_scaled)

X_pca.shape

(960, 24)

#### Train-test Split

In [12]:
from sklearn.model_selection import train_test_split

X_data = X_pca
y_data = y

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.4, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(576, 24)
(384, 24)
(576,)
(384,)


# Model Training

In [None]:
import time
import os
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Folder Setup
results_dir = '../output'
models_dir = '../models'
os.makedirs(results_dir, exist_ok=True)
os.makedirs(models_dir, exist_ok=True)



# Train model
models = {
    "Logistic_Regression": LogisticRegression(max_iter=1000),
    "KNN": KNeighborsClassifier(),
    "Random_Forest": RandomForestClassifier(n_estimators=100),
    "SVM": SVC(probability=True),  # probability=True is needed for AUC
    "Naive_Bayes": GaussianNB(),
    "Decision_Tree": DecisionTreeClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
}

for name, model in models.items():
    try:
        start_time = time.time()
        model.fit(X_train, y_train)
        end_time = time.time()

        execution_time = end_time - start_time

        print(f"Training Complete: {name}")

        # Predict the model
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)

        # Get all metrics
        accuracy = accuracy_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_proba, multi_class='ovr', labels=model.classes_)
        report = classification_report(y_test, y_pred)

        # Saving the metrics
        file_path = os.path.join(results_dir, f'{name}.txt')
        with open(file_path, 'w') as f:
            f.write(f"--- METRICS FOR {name.upper} ---\n")
            f.write(f"Accuracy: {accuracy * 100:.2f}%\n")
            # ... (rest of the metrics writing) ...
            f.write(report)

        print(f"Successfully saved metrics to {file_path}")

        # Save the Trained Model
        model_path = os.path.join(models_dir, f'{name}.joblib')
        joblib.dump(model, model_path)
    except Exception as e:
        print(f"  -> ERROR training {name}: {e}")


Training Complete: Logistic_Regression
Successfully saved metrics to ../output/Logistic_Regression.txt
Training Complete: KNN
Successfully saved metrics to ../output/KNN.txt
Training Complete: Random_Forest
Successfully saved metrics to ../output/Random_Forest.txt
Training Complete: SVM
Successfully saved metrics to ../output/SVM.txt
Training Complete: Naive_Bayes
Successfully saved metrics to ../output/Naive_Bayes.txt
Training Complete: Decision_Tree
Successfully saved metrics to ../output/Decision_Tree.txt


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Training Complete: AdaBoost
Successfully saved metrics to ../output/AdaBoost.txt
  -> ERROR training XGBoost: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3 4 5 6], got ['BENIGN' 'LDAP' 'MSSQL' 'NetBIOS' 'Portmap' 'Syn' 'UDP']


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
