In [1]:
import pandas as pd
import numpy as np

##### Load Sample Dataset

In [2]:
# Load Sample Dataset
df = pd.read_csv('../data/processed/sample_dataset.csv')
df.shape


(1000, 88)

#### Cleanup Dataset

In [3]:
# Replace infinity with np.nan
df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [4]:
# Remove rows with any NaN values
nan_count = df.isna().sum().sum()
print(nan_count)

# Drop rows with any NaN values
df.dropna(inplace=True)
df.shape

80


(960, 88)

In [5]:
# Remove duplicate rows
duplicate_count = df.duplicated().sum()
print(duplicate_count)

# Drop duplicate rows
df.drop_duplicates(inplace=True)
df.shape


0


(960, 88)

#### Column Processing

In [6]:
# Stripping whitespace from column names and lowercasing
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

In [7]:
df.columns

Index(['unnamed:_0', 'flow_id', 'source_ip', 'source_port', 'destination_ip',
       'destination_port', 'protocol', 'timestamp', 'flow_duration',
       'total_fwd_packets', 'total_backward_packets',
       'total_length_of_fwd_packets', 'total_length_of_bwd_packets',
       'fwd_packet_length_max', 'fwd_packet_length_min',
       'fwd_packet_length_mean', 'fwd_packet_length_std',
       'bwd_packet_length_max', 'bwd_packet_length_min',
       'bwd_packet_length_mean', 'bwd_packet_length_std', 'flow_bytes/s',
       'flow_packets/s', 'flow_iat_mean', 'flow_iat_std', 'flow_iat_max',
       'flow_iat_min', 'fwd_iat_total', 'fwd_iat_mean', 'fwd_iat_std',
       'fwd_iat_max', 'fwd_iat_min', 'bwd_iat_total', 'bwd_iat_mean',
       'bwd_iat_std', 'bwd_iat_max', 'bwd_iat_min', 'fwd_psh_flags',
       'bwd_psh_flags', 'fwd_urg_flags', 'bwd_urg_flags', 'fwd_header_length',
       'bwd_header_length', 'fwd_packets/s', 'bwd_packets/s',
       'min_packet_length', 'max_packet_length', 'packet_le

#### Feature Seperation

In [8]:
# Seperating X (Input) & Y (Output)
y = df['label']
X = df.drop(columns=['label'])

X.shape, y.shape


((960, 87), (960,))

In [9]:
# Removing non-numeric features
non_numeric_columns = X.select_dtypes(exclude=['number']).columns
print(non_numeric_columns)

if len(non_numeric_columns) > 0:
  X = X.drop(columns=non_numeric_columns)

X.shape

Index(['flow_id', 'source_ip', 'destination_ip', 'timestamp'], dtype='object')


(960, 83)

#### Feature Selection

In [10]:
# A Standard Scaler is a normalization technique in machine learning that transforms numerical features so each of them has a mean of 0 and a standard deviation of 1. This process is known as standardization
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

In [11]:
# Instead of using all 70+ of your scaled numeric features, PCA combines them into a smaller set of new, super-features called "Principal Components." These new features are abstract, but they are engineered to capture the most important information (the most variance) from the original data.
from sklearn.decomposition import PCA

pca = PCA(n_components=24)

X_pca = pca.fit_transform(X_scaled)

X_pca.shape

(960, 24)

#### Train-test Split

In [12]:
from sklearn.model_selection import train_test_split

X_data = X_pca
y_data = y

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.4, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(576, 24)
(384, 24)
(576,)
(384,)


# Model Training

In [19]:
import time
import os
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

results_dir = '../output'
models_dir = '../models'
os.makedirs(results_dir, exist_ok=True)
os.makedirs(models_dir, exist_ok=True)

# Train model
lr_model = LogisticRegression(max_iter=1000)

start_time = time.time()
lr_model.fit(X_train, y_train)
end_time = time.time()

execution_time_lr = end_time - start_time

print("Training Complete")


# Predict the model
y_pred_lr = lr_model.predict(X_test)
y_proba_lr = lr_model.predict_proba(X_test)

# Get all metrics
accuracy_lr = accuracy_score(y_test, y_pred_lr)
# We add 'labels=lr_model.classes_' to tell the scorer about ALL possible classes
auc_lr = roc_auc_score(y_test, y_proba_lr, multi_class='ovr', labels=lr_model.classes_)
report_lr = classification_report(y_test, y_pred_lr)

# Saving the metrics
# --- 6. Save Metrics to a File ---
file_path = os.path.join(results_dir, 'lr_metrics.txt')
with open(file_path, 'w') as f:
    f.write("--- METRICS FOR LOGISTIC REGRESSION ---\n")
    f.write(f"Accuracy: {accuracy_lr * 100:.2f}%\n")
    # ... (rest of the metrics writing) ...
    f.write(report_lr)

print(f"Successfully saved metrics to {file_path}")

# Save the Trained Model
model_path = os.path.join(models_dir, 'lr_model.joblib')
joblib.dump(lr_model, model_path)


Training Complete
Successfully saved metrics to ../output/lr_metrics.txt


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


['../models/lr_model.joblib']