## Imports

In [86]:
!pip install scikit-optimizer



In [87]:
import warnings
warnings.filterwarnings("ignore")

In [88]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_recall_fscore_support
from sklearn.metrics import f1_score,roc_auc_score
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost import plot_importance
import numpy as np
import pandas as pd
from google.colab import drive
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm.notebook import tqdm
from sklearn.cluster import KMeans

from sklearn.metrics import silhouette_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix

import torch
import torch.nn as nn
import torch.optim as optim
from tqdm.notebook import tqdm

## Train test split

In [89]:
df=pd.read_csv('./CICIDS2017_sample_km.csv')

In [90]:
df.Label.value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
0,10226
4,4619
10,3178
2,2564
1,1966
12,1507
14,652
3,208
7,155
6,118


In [91]:
mask = (df['Label'] != 0) & (df['Label'] != 6)
df.loc[mask, 'Label'] = 1
df = df.reset_index(drop=True)

In [92]:
df.Label.value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
1,15146
0,10226
6,118


In [93]:
features = df.drop(['Label'],axis=1).dtypes[df.dtypes != 'object'].index
df[features] = df[features].apply(
    lambda x: (x - x.mean()) / (x.std()))

df.replace([np.inf, -np.inf], np.nan, inplace=True)
df = df.fillna(0)

In [94]:
X = df.drop('Label', axis='columns')
y = df['Label']
y=np.ravel(y)
pd.Series(y).value_counts()

Unnamed: 0,count
1,15146
0,10226
6,118


## Feature engineering (IG, FCBF, and KPCA)

In [95]:
from sklearn.feature_selection import mutual_info_classif
importances = mutual_info_classif(X, y)

In [96]:
# calculate the sum of importance scores
f_list = sorted(zip(map(lambda x: round(x, 4), importances), features), reverse=True)
Sum = 0
fs = []
for i in range(0, len(f_list)):
    Sum = Sum + f_list[i][0]
    fs.append(f_list[i][1])

In [97]:
# select the important features from top to bottom until the accumulated importance reaches 90%
f_list2 = sorted(zip(map(lambda x: round(x, 4), importances/Sum), features), reverse=True)
Sum2 = 0
fs = []
for i in range(0, len(f_list2)):
    Sum2 = Sum2 + f_list2[i][0]
    fs.append(f_list2[i][1])
    if Sum2>=0.9:
        break

In [98]:
X_fs = df[fs].values

In [99]:
X_fs.shape

(25490, 45)

#### Feature selection by Fast Correlation Based Filter (FCBF)


In [100]:
!pip install scikit-optimizer



In [101]:
from skopt import gp_minimize
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

def fcbf_objective(threshold, X, y, clf=RandomForestClassifier(random_state=42)):
    selector = FCBF(th=threshold)
    X_selected = selector.fit_transform(X, y)
    if X_selected.shape[1] == 0:
        return 1.0
    score = cross_val_score(clf, X_selected, y, cv=3, scoring="accuracy").mean()
    return -score

In [102]:
from FCBF_module import FCBF, FCBFK, FCBFiP, get_i

res = gp_minimize(
    lambda th: fcbf_objective(th[0], X_fs, y),
    dimensions=[(0.01, 0.5)],
    n_calls=20,
    random_state=42,
    acq_func='EI',
)

best_threshold = res.x[0]
print("Best threshold:", best_threshold)
print("Best accuracy:", -res.fun)

Best threshold: 0.09988304703442026
Best accuracy: 0.8891325820397746


In [103]:
from FCBF_module import FCBF, FCBFK, FCBFiP, get_i
fcbf = FCBF(th = best_threshold)

In [104]:
X_fss = fcbf.fit_transform(X_fs,y)

####  kernel principal component analysis (KPCA)

In [105]:
from skopt import gp_minimize
from skopt.space import Integer, Categorical
from sklearn.decomposition import KernelPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def kpca_objective(params, X, y):
    n_components, kernel = params
    n_components = min(n_components, X.shape[1] - 1)
    if n_components < 1: n_components = 1
    try:
        kpca = KernelPCA(n_components=n_components, kernel=kernel, fit_inverse_transform=False, random_state=42)
        X_kpca = kpca.fit_transform(X)
        clf = RandomForestClassifier(random_state=42)
        score = cross_val_score(clf, X_kpca, y, cv=3, scoring="accuracy").mean()
        return -score
    except Exception as e:
        return 1.0

search_space = [
    Integer(2, 50),
    Categorical(['rbf', 'poly'])
]

result = gp_minimize(
    lambda params: kpca_objective(params, X_fss, y),
    search_space,
    n_calls=20,
    random_state=42,
    acq_func='EI'
)

best_n_components, best_kernel = result.x
print(f"Best n_components: {best_n_components}, Best kernel: {best_kernel}")
print(f"Best accuracy: {-result.fun:.4f}")

Best n_components: 40, Best kernel: poly
Best accuracy: 0.8892


In [106]:
kpca = KernelPCA(n_components=best_n_components, kernel=best_kernel, random_state=42)
X_kpca = kpca.fit_transform(X_fss)

### Train-test split after feature selection

In [107]:
df_train_idx = df.query('Label == 0').index
df_val_test_idx = df.drop(df_train_idx).index

In [108]:
X_train = X_kpca[df_train_idx]
y_train = np.zeros(len(X_train), dtype=int)

In [109]:
X_train.shape

(10226, 40)

In [110]:
y_train.shape

(10226,)

In [111]:
X_val_test_data = X_kpca[df_val_test_idx]
val_test_labels = df.loc[df_val_test_idx, 'Label']

In [112]:
len(df_val_test_idx)

15264

In [113]:
X_val_test_data.shape

(15264, 40)

In [114]:
val_test_labels.value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
1,15146
6,118


In [115]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

if isinstance(X_val_test_data, (pd.Series, pd.DataFrame)):
    X_val_test_data = X_val_test_data.to_numpy()

if isinstance(val_test_labels, (pd.Series, pd.DataFrame)):
    val_test_labels = val_test_labels.to_numpy()

indices_1 = np.where(val_test_labels == 1)[0]
indices_13 = np.where(val_test_labels == 6)[0]

X_test_label_13 = X_val_test_data[indices_13]
y_test_label_13 = val_test_labels[indices_13]

num_label_13 = len(y_test_label_13)

indices_0_train = np.where(y_train == 0)[0]
num_label_0_train = len(indices_0_train)

np.random.seed(42)
if num_label_0_train >= num_label_13:
    indices_0_test_sample = np.random.choice(indices_0_train, size=num_label_13, replace=False)
else:
    indices_0_test_sample = np.random.choice(indices_0_train, size=num_label_0_train, replace=False)

X_test_label_0 = X_train[indices_0_test_sample]
y_test_label_0 = y_train[indices_0_test_sample]

X_test = np.concatenate([X_test_label_13, X_test_label_0], axis=0)
y_test = np.concatenate([y_test_label_13, y_test_label_0], axis=0)

X_val_label_1 = X_val_test_data[indices_1]
y_val_label_1 = val_test_labels[indices_1]

remaining_indices_0_train = np.setdiff1d(indices_0_train, indices_0_test_sample)

X_val_label_0 = X_train[remaining_indices_0_train]
y_val_label_0 = y_train[remaining_indices_0_train]

num_val_label_0 = len(y_val_label_0)
num_val_label_1 = len(y_val_label_1)

np.random.seed(42)
if num_val_label_1 >= num_val_label_0:
    indices_1_val_sample = np.random.choice(num_val_label_1, size=num_val_label_0, replace=False)
    X_val_label_1_final = X_val_label_1[indices_1_val_sample]
    y_val_label_1_final = y_val_label_1[indices_1_val_sample]
else:
    X_val_label_1_final = X_val_label_1
    y_val_label_1_final = y_val_label_1

X_val = np.concatenate([X_val_label_0, X_val_label_1_final], axis=0)
y_val = np.concatenate([y_val_label_0, y_val_label_1_final], axis=0)

print("Shape of final X_val:", X_val.shape)
print("Shape of final y_val:", y_val.shape)
unique_val, counts_val = np.unique(y_val, return_counts=True)
print("y_val label counts:", dict(zip(unique_val, counts_val)))

print("\nShape of final X_test:", X_test.shape)
print("Shape of final y_test:", y_test.shape)
unique_test, counts_test = np.unique(y_test, return_counts=True)
print("y_test label counts:", dict(zip(unique_test, counts_test)))

Shape of final X_val: (20216, 40)
Shape of final y_val: (20216,)
y_val label counts: {np.int64(0): np.int64(10108), np.int64(1): np.int64(10108)}

Shape of final X_test: (236, 40)
Shape of final y_test: (236,)
y_test label counts: {np.int64(0): np.int64(118), np.int64(6): np.int64(118)}


In [116]:
y_val[y_val != 0] = 1

y_test[y_test != 0] = 1


In [117]:
X_val.shape

(20216, 40)

In [118]:
from sklearn.preprocessing import MinMaxScaler
std_scaler = MinMaxScaler()
std_scaler = std_scaler.fit(X_train)

X_train = std_scaler.transform(X_train)
X_val = std_scaler.transform(X_val)
X_test = std_scaler.transform(X_test)

### Apply the Autoencoder model with biased classifiers

In [119]:
# Implementação do Early Stopping
class EarlyStopping:
  def __init__(self, patience=7, delta=0, verbose=True, path='checkpoint.pt'):
      self.patience = patience
      self.delta = delta
      self.verbose = verbose
      self.counter = 0
      self.early_stop = False
      self.val_min_loss = np.inf
      self.path = path

  def __call__(self, val_loss, model):
    if val_loss < self.val_min_loss - self.delta:   # Caso a loss da validação reduza, vamos salvar o modelo e nova loss mínima
      self.save_checkpoint(val_loss, model)
      self.counter = 0
    else:                                           # Caso a loss da validação NÃO reduza, vamos incrementar o contador da paciencia
      self.counter += 1
      print(f'EarlyStopping counter: {self.counter} out of {self.patience}. Current validation loss: {val_loss:.5f}')
      if self.counter >= self.patience:
          self.early_stop = True

  def save_checkpoint(self, val_loss, model):
    if self.verbose:
        print(f'Validation loss decreased ({self.val_min_loss:.5f} --> {val_loss:.5f}).  Saving model ...')
    torch.save(model.state_dict(), self.path)
    self.val_min_loss = val_loss

In [120]:
class Autoencoder(nn.Module):
    def __init__(self, in_features, dropout_rate=0.2):
        super().__init__()
        self.in_features = in_features
        self.dropout_rate = dropout_rate
        self.early_stopping = None

        self.encoder = nn.Sequential(
            nn.Linear(in_features, 24),
            nn.BatchNorm1d(24),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(24, 8),
            nn.BatchNorm1d(8),
            nn.ReLU()
        )

        self.decoder = nn.Sequential(
            nn.Linear(8, 24),
            nn.BatchNorm1d(24),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(24, in_features),
            nn.Sigmoid()
        )

    def forward(self, X):
        encoded = self.encoder(X)
        decoded = self.decoder(encoded)
        return encoded, decoded

    def compile(self, learning_rate):
        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def fit_transform(self, X_train, num_epochs, batch_size,
                      X_val=None, patience=None, delta=None, device=None):
        """
        Trains the autoencoder and returns the encoded representation of X_train.
        """
        if X_val is not None and patience is not None and delta is not None:
            print(f'Using early stopping with patience={patience} and delta={delta}')
            self.early_stopping = EarlyStopping(patience, delta)

        val_avg_losses = []
        train_avg_losses = []

        for epoch in range(num_epochs):
            train_losses = []
            self.train()
            for batch in range(0, len(X_train), batch_size):
                batch_X = X_train[batch:(batch + batch_size)]
                encoded, decoded = self.forward(batch_X)

                loss = self.criterion(decoded, batch_X)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                train_losses.append(loss.item())

            train_avg_loss = np.mean(train_losses)
            train_avg_losses.append(train_avg_loss)
            print(f'Epoch#{epoch+1}: Train Average Loss = {train_avg_loss:.5f}')

            if self.early_stopping is not None:
                val_losses = []
                self.eval()
                with torch.no_grad():
                    for batch in range(0, len(X_val), batch_size):
                        batch_X = X_val[batch:(batch + batch_size)]
                        _, decoded = self.forward(batch_X)
                        val_loss = self.criterion(decoded, batch_X)
                        val_losses.append(val_loss.item())
                val_avg_loss = np.mean(val_losses)
                val_avg_losses.append(val_avg_loss)
                self.early_stopping(val_avg_loss, self)
                if self.early_stopping.early_stop:
                    print(f'Stopped by early stopping at epoch {epoch+1}')
                    break

        if self.early_stopping is not None:
            self.load_state_dict(torch.load('checkpoint.pt', map_location=device))

        self.eval()
        return self.transform(X_train, batch_size)

    def transform(self, X, batch_size=256):
        """
        Returns the encoded representation of the input X.
        """
        encoded_list = []
        self.eval()
        with torch.no_grad():
            for batch in range(0, len(X), batch_size):
                batch_X = X[batch:(batch + batch_size)]
                encoded, _ = self.forward(batch_X)
                encoded_list.append(encoded.cpu())
        return torch.cat(encoded_list, dim=0)


In [121]:
from skopt import BayesSearchCV
from skopt.space import Integer, Real
from sklearn.base import BaseEstimator

class SklearnAutoencoder(BaseEstimator):
    def __init__(self, in_features, hidden1=24, hidden2=8, dropout=0.2, lr=1e-3, num_epochs=50, batch_size=256):
        self.in_features = in_features
        self.hidden1 = hidden1
        self.hidden2 = hidden2
        self.dropout = dropout
        self.lr = lr
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self._build_model()

    def _build_model(self):
        self.model = Autoencoder(self.in_features, dropout_rate=self.dropout)
        self.model.encoder = nn.Sequential(
            nn.Linear(self.in_features, self.hidden1),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden1, self.hidden2),
            nn.ReLU()
        )
        self.model.decoder = nn.Sequential(
            nn.Linear(self.hidden2, self.hidden1),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden1, self.in_features),
            nn.Sigmoid()
        )
        self.model.to(self.device)
        self.model.compile(self.lr)

    def fit(self, X, y=None):
        X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
        _ = self.model.fit_transform(X_tensor, self.num_epochs, self.batch_size)
        return self

    def score(self, X, y=None):
        X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
        _, decoded = self.model(X_tensor)
        decoded = decoded.detach().cpu().numpy()
        loss = np.mean((X - decoded) ** 2)
        return -loss


In [122]:
from sklearn.metrics import make_scorer

def ae_scorer(estimator, X, y=None):
    return estimator.score(X)

search_space = {
    "hidden1": Integer(32, 256),   # bigger first hidden layer
    "hidden2": Integer(8, 128),    # bigger bottleneck
    "dropout": Real(0.0, 0.5),
    "lr": Real(1e-4, 5e-3, "log-uniform")
}

opt_ae = BayesSearchCV(
    SklearnAutoencoder(in_features=X_train.shape[1]),
    search_space,
    n_iter=50,
    cv=10,
    n_jobs=-1,
    scoring=ae_scorer,   # use the custom scorer
    random_state=0
)

opt_ae.fit(X_train)
best_ae = opt_ae.best_estimator_.model


Epoch#1: Train Average Loss = 0.22459
Epoch#2: Train Average Loss = 0.15892
Epoch#3: Train Average Loss = 0.04507
Epoch#4: Train Average Loss = 0.00648
Epoch#5: Train Average Loss = 0.00347
Epoch#6: Train Average Loss = 0.00289
Epoch#7: Train Average Loss = 0.00266
Epoch#8: Train Average Loss = 0.00250
Epoch#9: Train Average Loss = 0.00241
Epoch#10: Train Average Loss = 0.00236
Epoch#11: Train Average Loss = 0.00230
Epoch#12: Train Average Loss = 0.00227
Epoch#13: Train Average Loss = 0.00225
Epoch#14: Train Average Loss = 0.00222
Epoch#15: Train Average Loss = 0.00220
Epoch#16: Train Average Loss = 0.00219
Epoch#17: Train Average Loss = 0.00217
Epoch#18: Train Average Loss = 0.00216
Epoch#19: Train Average Loss = 0.00214
Epoch#20: Train Average Loss = 0.00214
Epoch#21: Train Average Loss = 0.00213
Epoch#22: Train Average Loss = 0.00212
Epoch#23: Train Average Loss = 0.00210
Epoch#24: Train Average Loss = 0.00209
Epoch#25: Train Average Loss = 0.00207
Epoch#26: Train Average Loss = 0.0

In [123]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [124]:
def get_overall_metrics(y_true, y_pred):
  tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
  acc = (tp+tn)/(tp+tn+fp+fn)
  tpr = tp/(tp+fn)
  fpr = fp/(fp+tn)
  precision = tp/(tp+fp)
  f1 = (2*tpr*precision)/(tpr+precision)
  return {'acc':acc,'tpr':tpr,'fpr':fpr,'precision':precision,'f1-score':f1}

In [125]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    recall_score
)
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.neighbors import NearestNeighbors
import numpy as np
from skopt import BayesSearchCV
from skopt.space import Integer
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_curve, roc_auc_score

def Anomaly_IDS(best_ae, input_dim, X_train, X_test, y_train, y_test, X_val, y_val, n, uncertainty_margin=0.01,b=100, num_epochs=50, batch_size=256):

    ae = best_ae
    X_train_torch = torch.tensor(X_train, dtype=torch.float32)
    _ = ae.fit_transform(X_train_torch, num_epochs, batch_size)

    def get_autoencoder_anomaly_scores(model, X):
        model.eval()
        with torch.no_grad():
            X_torch = torch.tensor(X, dtype=torch.float32).to(device)
            # The model returns (latent_representation, reconstruction)
            _, reconstructed_X = model(X_torch)
            # Calculate Mean Squared Error for each sample
            anomaly_scores = torch.mean(torch.pow(X_torch - reconstructed_X, 2), axis=1)
            return anomaly_scores.detach().cpu().numpy()

    val_anomaly_scores = get_autoencoder_anomaly_scores(ae, X_val)
    fpr, tpr, thresholds = roc_curve(y_val, val_anomaly_scores)

    print('Fpr', fpr)
    print('Tpr', tpr)
    print('Thresholds', thresholds)

    df_roc = pd.DataFrame({
        'fpr': fpr,
        'tpr': tpr,
        'thresholds': thresholds
    })

    df_roc['youden_index'] = df_roc['tpr'] - df_roc['fpr']
    best_row = df_roc.loc[df_roc['youden_index'].idxmax()]
    optimal_threshold = best_row['thresholds']

    test_anomaly_scores = get_autoencoder_anomaly_scores(ae, X_test)

    y_pred_ae = (test_anomaly_scores >= optimal_threshold).astype(int)

    print("Autoencoder Performance (threshold-based):")
    print(classification_report(y_test, y_pred_ae))
    cm_ae = confusion_matrix(y_test, y_pred_ae)
    tn, fp, fn, tp = cm_ae.ravel()
    dr_ae = tp/(tp+fn) if tp+fn>0 else 0
    far_ae = fp/(fp+tn) if fp+tn>0 else 0
    acc_ae = accuracy_score(y_test, y_pred_ae)
    print(f"  Acc: {acc_ae:.4f}, DR: {dr_ae:.4f}, FAR: {far_ae:.4f}\n  CM:\n{cm_ae}")

    val_anomaly_scores = get_autoencoder_anomaly_scores(ae, X_val)

    # 1. Identify False Positives (FP) and False Negatives (FN) from the training set
    fp_idx = np.where((val_anomaly_scores >= optimal_threshold) & (y_val == 0))[0]
    fn_idx = np.where((val_anomaly_scores < optimal_threshold) & (y_val == 1))[0]

    X_fp_train = X_val[fp_idx] # Normal samples misclassified as Attack
    X_fn_train = X_val[fn_idx] # Attack samples misclassified as Normal

    rfp = None
    rfn = None
    cv_splits = 5

    if len(X_fp_train) > 0:
        X_attacks_for_fp = X_train[y_train == 1]
        if len(X_attacks_for_fp) > 0:
            attack_samples_for_fp = X_attacks_for_fp[np.random.choice(len(X_attacks_for_fp), size=len(X_fp_train), replace=True)]
            Xp = np.concatenate([X_fp_train, attack_samples_for_fp])
            yp = np.concatenate([np.zeros(len(X_fp_train)), np.ones(len(attack_samples_for_fp))])
            if min(np.bincount(yp.astype(int))) >= cv_splits:
                # Use BayesSearchCV if enough samples exist
                opt_rfp = BayesSearchCV(RandomForestClassifier(random_state=42), {'n_estimators': Integer(10, 200), 'max_depth': Integer(3, 50), 'min_samples_split': Integer(2, 10)}, n_iter=20, cv=StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42), scoring='f1', n_jobs=-1, random_state=42)
                opt_rfp.fit(Xp, yp)
                rfp = opt_rfp.best_estimator_
            else:
                rfp = RandomForestClassifier(random_state=42).fit(Xp, yp)

    if len(X_fn_train) > 0:
        X_normals_for_fn = X_train[y_train == 0]
        if len(X_normals_for_fn) > 0:
            normal_samples_for_fn = X_normals_for_fn[np.random.choice(len(X_normals_for_fn), size=len(X_fn_train), replace=True)]
            Xn = np.concatenate([X_fn_train, normal_samples_for_fn])
            yn = np.concatenate([np.ones(len(X_fn_train)), np.zeros(len(normal_samples_for_fn))])
            if min(np.bincount(yn.astype(int))) >= cv_splits:
                # Use BayesSearchCV if enough samples exist
                opt_rfn = BayesSearchCV(RandomForestClassifier(random_state=42), {'n_estimators': Integer(10, 200), 'max_depth': Integer(3, 50), 'min_samples_split': Integer(2, 10)}, n_iter=20, cv=StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42), scoring='f1', n_jobs=-1, random_state=42)
                opt_rfn.fit(Xn, yn)
                rfn = opt_rfn.best_estimator_
            else:
                rfn = RandomForestClassifier(random_state=42).fit(Xn, yn)


    y_final = y_pred_ae.copy()
    print("\n--- Applying Targeted Correction ---")

    if rfn is not None:
        rfn_check_indices = np.where(
            (y_pred_ae == 0) &
            (test_anomaly_scores > optimal_threshold - uncertainty_margin)
        )[0]
        print(f"Found {len(rfn_check_indices)} uncertain 'Normal' samples to re-classify with RFN.")
        if len(rfn_check_indices) > 0:
            rfn_predictions = rfn.predict(X_test[rfn_check_indices])
            y_final[rfn_check_indices] = rfn_predictions

    if rfp is not None:
        rfp_check_indices = np.where(
            (y_pred_ae == 1) &
            (test_anomaly_scores < optimal_threshold + uncertainty_margin)
        )[0]
        print(f"Found {len(rfp_check_indices)} uncertain 'Attack' samples to re-classify with RFP.")
        if len(rfp_check_indices) > 0:
            rfp_predictions = rfp.predict(X_test[rfp_check_indices])
            y_final[rfp_check_indices] = rfp_predictions

    print(classification_report(y_test, y_final, target_names=['Normal', 'Attack']))
    cm_final = confusion_matrix(y_test, y_final)

    tn_f, fp_f, fn_f, tp_f = cm_final.ravel()
    acc_final = accuracy_score(y_test, y_final)
    dr_final = tp_f / (tp_f + fn_f) if (tp_f + fn_f) > 0 else 0
    far_final = fp_f / (fp_f + tn_f) if (fp_f + tn_f) > 0 else 0

    print(f"  Acc: {acc_final:.4f}, DR: {dr_final:.4f}, FAR: {far_final:.4f}\n  CM:\n{cm_final}")

    return acc_final, dr_final, far_final, cm_final



In [126]:
pd.Series(y_val).value_counts()

Unnamed: 0,count
0,10108
1,10108


In [127]:
from sklearn.metrics import f1_score, classification_report, confusion_matrix, accuracy_score, recall_score, roc_curve

from sklearn.model_selection import train_test_split

def Anomaly_IDS(best_ae, X_train, y_train, X_test, y_test, X_val, y_val, num_epochs=50, batch_size=256):
    X_train_benign = X_train[y_train == 0]
    X_train_benign_torch = torch.tensor(X_train_benign, dtype=torch.float32)
    ae = best_ae
    _ = ae.fit_transform(X_train_benign_torch, num_epochs, batch_size)

    def get_autoencoder_anomaly_scores(model, X):
        model.eval()
        with torch.no_grad():
            X_torch = torch.tensor(X, dtype=torch.float32).to(device)
            _, reconstructed_X = model(X_torch)
            anomaly_scores = torch.mean(torch.pow(X_torch - reconstructed_X, 2), axis=1)
            return anomaly_scores.detach().cpu().numpy()

    X_val_train, X_val_tune, y_val_train, y_val_tune = train_test_split(
        X_val, y_val, test_size=0.4, random_state=42, stratify=y_val
    )

    val_train_scores = get_autoencoder_anomaly_scores(ae, X_val_train)
    fpr, tpr, thresholds = roc_curve(y_val_train, val_train_scores)

    df_roc = pd.DataFrame({'tpr': tpr, 'fpr': fpr, 'thresholds': thresholds})
    df_roc['youden_index'] = df_roc['tpr'] - df_roc['fpr']
    # Handle cases where max index might not be unique or thresholds are infinite
    df_roc.replace([np.inf, -np.inf], np.nan, inplace=True)
    df_roc.dropna(inplace=True)
    optimal_threshold = df_roc.loc[df_roc['youden_index'].idxmax()]['thresholds']

    fp_idx = np.where((val_train_scores >= optimal_threshold) & (y_val_train == 0))[0]
    fn_idx = np.where((val_train_scores < optimal_threshold) & (y_val_train == 1))[0]

    X_fp_train = X_val_train[fp_idx]
    X_fn_train = X_val_train[fn_idx]

    # (The logic for training rfp and rfn remains identical, just using the new data)
    rfp, rfn = train_specialist_classifiers(X_fp_train, X_fn_train, X_train, y_train)

    # --- Step 2: Tune the uncertainty_margin on the val_tune set ---
    print("\nTuning the uncertainty margin...")
    # Define candidate margins as a percentage of the optimal threshold
    margin_candidates_relative = np.linspace(0.01, 0.50, 10) # e.g., 1% to 50%
    margin_candidates = [relative * optimal_threshold for relative in margin_candidates_relative]

    best_margin = 0
    best_f1 = -1

    val_tune_scores = get_autoencoder_anomaly_scores(ae, X_val_tune)
    y_pred_ae_tune = (val_tune_scores >= optimal_threshold).astype(int)

    for margin in margin_candidates:
        y_final_tune = apply_correction(
            y_pred_ae_tune, val_tune_scores, X_val_tune, rfp, rfn, optimal_threshold, margin
        )
        current_f1 = f1_score(y_val_tune, y_final_tune)

        if current_f1 > best_f1:
            best_f1 = current_f1
            best_margin = margin

    print(f"Optimal uncertainty margin found: {best_margin:.6f} (Resulting F1 on tune set: {best_f1:.4f})")

    # --- Step 3: Final evaluation on the unseen test set using the best margin ---
    print("\n--- Final Evaluation on Test Set ---")
    test_anomaly_scores = get_autoencoder_anomaly_scores(ae, X_test)
    y_pred_ae_test = (test_anomaly_scores >= optimal_threshold).astype(int)

    print("\nAutoencoder Performance (baseline):")
    print_metrics(y_test, y_pred_ae_test)

    print("\n--- Applying Targeted Correction with Tuned Margin ---")
    y_final_test = apply_correction(
        y_pred_ae_test, test_anomaly_scores, X_test, rfp, rfn, optimal_threshold, best_margin
    )

    print("\nFinal Hybrid Model Performance:")
    tn_f, fp_f, fn_f, tp_f, acc_final, dr_final, far_final, cm_final = print_metrics(y_test, y_final_test, return_raw=True)

    return acc_final, dr_final, far_final, cm_final

# --- Helper function to avoid code repetition ---
def train_specialist_classifiers(X_fp_train, X_fn_train, X_train, y_train):
    rfp = None
    rfn = None
    cv_splits = 5

    # RFP Training Logic (moved here from the main function)
    if len(X_fp_train) > 0:
        X_attacks_for_fp = X_train[y_train == 1]
        if len(X_attacks_for_fp) > 0:
            attack_samples_for_fp = X_attacks_for_fp[np.random.choice(len(X_attacks_for_fp), size=len(X_fp_train), replace=True)]
            Xp = np.concatenate([X_fp_train, attack_samples_for_fp])
            yp = np.concatenate([np.zeros(len(X_fp_train)), np.ones(len(attack_samples_for_fp))])
            if min(np.bincount(yp.astype(int))) >= cv_splits:
                opt_rfp = BayesSearchCV(RandomForestClassifier(random_state=42), {'n_estimators': Integer(10, 200), 'max_depth': Integer(3, 50), 'min_samples_split': Integer(2, 10)}, n_iter=20, cv=StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42), scoring='f1', n_jobs=-1, random_state=42)
                opt_rfp.fit(Xp, yp)
                rfp = opt_rfp.best_estimator_
            else:
                rfp = RandomForestClassifier(random_state=42).fit(Xp, yp)

    # RFN Training Logic (moved here from the main function)
    if len(X_fn_train) > 0:
        X_normals_for_fn = X_train[y_train == 0]
        if len(X_normals_for_fn) > 0:
            normal_samples_for_fn = X_normals_for_fn[np.random.choice(len(X_normals_for_fn), size=len(X_fn_train), replace=True)]
            Xn = np.concatenate([X_fn_train, normal_samples_for_fn])
            yn = np.concatenate([np.ones(len(X_fn_train)), np.zeros(len(normal_samples_for_fn))])
            if min(np.bincount(yn.astype(int))) >= cv_splits:
                opt_rfn = BayesSearchCV(RandomForestClassifier(random_state=42), {'n_estimators': Integer(10, 200), 'max_depth': Integer(3, 50), 'min_samples_split': Integer(2, 10)}, n_iter=20, cv=StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42), scoring='f1', n_jobs=-1, random_state=42)
                opt_rfn.fit(Xn, yn)
                rfn = opt_rfn.best_estimator_
            else:
                rfn = RandomForestClassifier(random_state=42).fit(Xn, yn)

    return rfp, rfn

def apply_correction(y_pred_initial, anomaly_scores, X_data, rfp, rfn, threshold, margin):
    y_final = y_pred_initial.copy()

    # Apply RFN to uncertain 'Normal' samples
    if rfn is not None:
        rfn_indices = np.where((y_pred_initial == 0) & (anomaly_scores > threshold - margin))[0]
        if len(rfn_indices) > 0:
            rfn_preds = rfn.predict(X_data[rfn_indices])
            y_final[rfn_indices] = rfn_preds

    # Apply RFP to uncertain 'Attack' samples
    if rfp is not None:
        rfp_indices = np.where((y_pred_initial == 1) & (anomaly_scores < threshold + margin))[0]
        if len(rfp_indices) > 0:
            rfp_preds = rfp.predict(X_data[rfp_indices])
            y_final[rfp_indices] = rfp_preds

    return y_final

def print_metrics(y_true, y_pred, return_raw=False):
    print(classification_report(y_true, y_pred, target_names=['Normal', 'Attack']))
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    acc = accuracy_score(y_true, y_pred)
    dr = recall_score(y_true, y_pred) # Detection Rate is just recall
    far = fp / (fp + tn) if (fp + tn) > 0 else 0
    print(f"  Acc: {acc:.4f}, DR: {dr:.4f}, FAR: {far:.4f}\n  CM:\n{cm}")
    if return_raw:
        return tn, fp, fn, tp, acc, dr, far, cm

In [128]:
Anomaly_IDS(
    best_ae=best_ae,
    X_train=X_train,   # convert to numpy
    X_test=X_test,
    y_train=y_train,   # convert to numpy
    y_test=y_test,
    X_val=X_val,
    y_val=y_val
)

Epoch#1: Train Average Loss = 0.00136
Epoch#2: Train Average Loss = 0.00133
Epoch#3: Train Average Loss = 0.00131
Epoch#4: Train Average Loss = 0.00128
Epoch#5: Train Average Loss = 0.00126
Epoch#6: Train Average Loss = 0.00123
Epoch#7: Train Average Loss = 0.00120
Epoch#8: Train Average Loss = 0.00118
Epoch#9: Train Average Loss = 0.00115
Epoch#10: Train Average Loss = 0.00108
Epoch#11: Train Average Loss = 0.00103
Epoch#12: Train Average Loss = 0.00101
Epoch#13: Train Average Loss = 0.00098
Epoch#14: Train Average Loss = 0.00095
Epoch#15: Train Average Loss = 0.00093
Epoch#16: Train Average Loss = 0.00091
Epoch#17: Train Average Loss = 0.00089
Epoch#18: Train Average Loss = 0.00087
Epoch#19: Train Average Loss = 0.00086
Epoch#20: Train Average Loss = 0.00084
Epoch#21: Train Average Loss = 0.00084
Epoch#22: Train Average Loss = 0.00082
Epoch#23: Train Average Loss = 0.00081
Epoch#24: Train Average Loss = 0.00080
Epoch#25: Train Average Loss = 0.00080
Epoch#26: Train Average Loss = 0.0

(0.847457627118644,
 1.0,
 np.float64(0.3050847457627119),
 array([[ 82,  36],
        [  0, 118]]))