In [None]:
# Import libraries

from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Load dataset
# The path is relative to the notebook's working directory.
data = pd.read_csv("../data/cicids2017_cleaned.csv")

# Only the last expression of a cell is rendered automatically, so the
# preview has to go through display() or its output is silently discarded.
display(data.head())

# Check data types
# info() prints its report directly; describe() is the cell's last expression
# and is therefore rendered as a rich table.
data.info()
data.describe()

In [None]:
# cleaning & encoding
# Work on a copy so the raw frame loaded earlier stays untouched.
df = data.copy()

# detect label column: the first known candidate present in the frame wins
label_candidates = ("Label", "Attack Type")
label_col = next((col for col in label_candidates if col in df.columns), None)
if label_col is None:
    raise KeyError(f"No label column found. Available columns: {df.columns.tolist()}")

# binary target: benign/normal -> 0, attack -> 1
# Labels are normalised (stripped, lower-cased) before matching so that
# spelling variants such as " BENIGN" still map to class 0.
benign_values = {"benign", "normal traffic", "normal"}
labels_norm = df[label_col].astype(str).str.strip().str.lower()
df["target"] = np.where(labels_norm.isin(benign_values), 0, 1)

# features/target
# Drop every known label column, not only the detected one: if both
# "Label" and "Attack Type" exist, leaving one behind would leak the answer
# into the features (and hand a string column to the model).
non_feature_cols = [col for col in (*label_candidates, "target") if col in df.columns]
X = df.drop(columns=non_feature_cols)
y = df["target"]

# class balance of the binary target
df["target"].value_counts()

In [None]:
# split train/test
# 80/20 hold-out with a fixed seed. Stratifying on y keeps the benign/attack
# ratio the same in both partitions, so the test metrics are not skewed by an
# unlucky draw on an imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# supervised model (Random Forest)
# A fixed random_state makes the forest (and every metric below) reproducible
# across Restart & Run All; n_jobs=-1 only parallelises the tree building.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

# evaluation
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# ROC curve
# Column 1 of predict_proba is the probability of the positive class (1 = attack).
y_proba = rf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
# diagonal = performance of a random classifier, for visual reference
ax.plot([0, 1], [0, 1], linestyle="--")
ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate")
ax.set_title("Random Forest ROC curve")
ax.legend()
plt.show()

In [None]:
# unsupervised model (Isolation Forest)
# Fixed seed for reproducibility. contamination=0.1 is the assumed share of
# anomalies in the data -- NOTE(review): confirm it against the actual attack
# ratio shown by the target value counts.
iso = IsolationForest(contamination=0.1, random_state=42)
# Fitted on the unlabeled training features (the labels are not used here).
iso.fit(X_train)

# IsolationForest.predict returns -1 for outliers and 1 for inliers.
pred_iso = iso.predict(X_test)

# Re-encode to the notebook's target convention (0 = benign, 1 = attack) so
# the predictions can actually be compared with y_test.
pred_iso_binary = np.where(pred_iso == -1, 1, 0)
print(classification_report(y_test, pred_iso_binary))

In [None]:
# backup model
# Persist the fitted random forest. The target directory is created first
# because joblib.dump does not create missing parent folders and would raise
# on a fresh checkout.
model_path = Path("../models/rf_model.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(rf, model_path)