## Import libraries

In [57]:
import warnings
warnings.filterwarnings("ignore")

In [58]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_recall_fscore_support
from sklearn.metrics import f1_score,roc_auc_score
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost import plot_importance

## Anomaly-based IDS - Infiltration

### Generate the port-scan datasets for unknown attack detection

In [100]:
df=pd.read_csv('CICIDS2018_sample_km.csv')

In [101]:
df

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,-0.475925,-0.005346,-0.020438,-0.040784,-0.020339,-0.022177,-0.830722,-0.361301,-0.961988,-0.776052,...,0.187746,-0.070847,-0.057248,-0.085188,-0.058141,0.007993,-0.001875,0.002505,0.039759,6
1,-0.475925,-0.005361,-0.020438,-0.040784,-0.020339,-0.022177,-0.830722,-0.361301,-0.961988,-0.776052,...,0.187746,-0.070847,-0.057248,-0.085188,-0.058141,0.007993,-0.001875,0.002505,0.039759,6
2,-0.475925,-0.005399,-0.020438,-0.040784,-0.020339,-0.022177,-0.830722,-0.361301,-0.961988,-0.776052,...,0.187746,-0.070847,-0.057248,-0.085188,-0.058141,0.007750,-0.001875,0.002402,0.038739,6
3,-0.475925,-0.005334,-0.020438,-0.040784,-0.020339,-0.022177,-0.830722,-0.361301,-0.961988,-0.776052,...,0.187746,-0.070847,-0.057248,-0.085188,-0.058141,0.007993,-0.001875,0.002505,0.039759,6
4,-0.475925,-0.005453,-0.020438,-0.040784,-0.020339,-0.022177,-0.830722,-0.361301,-0.961988,-0.776052,...,0.187746,-0.070847,-0.057248,-0.085188,-0.058141,0.007750,-0.001875,0.002402,0.038739,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36452,1.944820,-0.018959,-0.020859,-0.035722,-0.019875,-0.021916,-0.704356,1.437948,-0.330935,-0.776052,...,-1.492618,-0.070847,-0.057248,-0.085188,-0.058141,-0.017585,-0.001875,-0.008251,-0.067285,0
36453,1.944820,-0.018959,-0.020859,-0.035722,-0.019875,-0.021916,-0.704356,1.437948,-0.330935,-0.776052,...,-1.492618,-0.070847,-0.057248,-0.085188,-0.058141,-0.017585,-0.001875,-0.008251,-0.067285,0
36454,1.944820,-0.018959,-0.020859,-0.035722,-0.019875,-0.021916,-0.704356,1.437948,-0.330935,-0.776052,...,-1.492618,-0.070847,-0.057248,-0.085188,-0.058141,-0.017585,-0.001875,-0.008251,-0.067285,0
36455,1.944820,-0.018959,-0.020859,-0.035722,-0.019885,-0.021920,-0.707164,1.397965,-0.344959,-0.776052,...,-1.492618,-0.070847,-0.057248,-0.085188,-0.058141,-0.017585,-0.001875,-0.008251,-0.067285,0


In [102]:
df.Label.value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
0,10663
10,9908
6,5748
4,1989
5,1730
8,1448
1,1440
12,1183
14,943
2,568


In [103]:
df1 = df[df['Label'] != 11]
df1['Label'][df1['Label'] > 0] = 1
df1.to_csv('./CICIDS2017_sample_km_without_infiltration.csv',index=0)

In [104]:
df2 = df[df['Label'] == 11]
df2['Label'][df2['Label'] == 11] = 1
df2.to_csv('./CICIDS2017_sample_km_infiltration.csv',index=0)

### Read the generated datasets for unknown attack detection

In [105]:
df1 = pd.read_csv('./CICIDS2017_sample_km_without_infiltration.csv')
df2 = pd.read_csv('./CICIDS2017_sample_km_infiltration.csv')

In [106]:
features = df1.drop(['Label'],axis=1).dtypes[df1.dtypes != 'object'].index
df1[features] = df1[features].apply(
    lambda x: (x - x.mean()) / (x.std()))
df2[features] = df2[features].apply(
    lambda x: (x - x.mean()) / (x.std()))
df1.replace([np.inf, -np.inf], np.nan, inplace=True)
df2.replace([np.inf, -np.inf], np.nan, inplace=True)
df1 = df1.fillna(0)
df2 = df2.fillna(0)

In [107]:
df1.Label.value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
1,25741
0,10663


In [108]:
df2.Label.value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
1,53


In [109]:
df2p=df1[df1['Label']==0]
df2pp=df2p.sample(n=None, frac=53/10663, replace=False, weights=None, random_state=None, axis=0)
df2=pd.concat([df2, df2pp])

In [110]:
df2.Label.value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
1,53
0,53


In [111]:
df = pd.concat([df1, df2])

In [112]:
df

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,-0.382095,-0.517743,-0.217492,-0.074217,-0.218506,-0.030445,-0.736454,-0.226535,-0.863741,-0.718557,...,-0.360771,-0.362016,-0.287732,-0.360769,-0.348767,-0.125197,-0.393213,-0.236716,-0.044055,1
1,-0.382095,-0.517984,-0.217492,-0.074217,-0.218506,-0.030445,-0.736454,-0.226535,-0.863741,-0.718557,...,-0.360771,-0.362016,-0.287732,-0.360769,-0.348767,-0.125197,-0.393213,-0.236716,-0.044055,1
2,-0.382095,-0.518604,-0.217492,-0.074217,-0.218506,-0.030445,-0.736454,-0.226535,-0.863741,-0.718557,...,-0.360771,-0.362016,-0.287732,-0.360769,-0.348767,-0.128760,-0.393213,-0.239947,-0.047642,1
3,-0.382095,-0.517547,-0.217492,-0.074217,-0.218506,-0.030445,-0.736454,-0.226535,-0.863741,-0.718557,...,-0.360771,-0.362016,-0.287732,-0.360769,-0.348767,-0.125197,-0.393213,-0.236716,-0.044055,1
4,-0.382095,-0.519477,-0.217492,-0.074217,-0.218506,-0.030445,-0.736454,-0.226535,-0.863741,-0.718557,...,-0.360771,-0.362016,-0.287732,-0.360769,-0.348767,-0.128760,-0.393213,-0.239947,-0.047642,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29102,-0.382095,0.400702,-0.217492,-0.074217,-0.218506,-0.030445,-0.736454,-0.226535,-0.863741,-0.718557,...,-0.360771,-0.362016,-0.287732,-0.360769,-0.348767,-0.499219,-0.393213,-0.575951,-0.420696,0
27980,-0.382095,-0.632989,-0.217143,0.027306,-0.217134,-0.014517,1.739256,-0.226535,0.469364,1.094986,...,-0.360771,-0.362016,-0.287732,-0.360769,-0.348767,-0.499219,-0.393213,-0.575951,-0.420696,0
31580,-0.382095,-0.649741,-0.217026,0.056313,-0.216763,-0.013005,1.978961,-0.226535,0.466840,1.034918,...,-0.360771,-0.362016,-0.287732,-0.360769,-0.348767,-0.499219,-0.393213,-0.575951,-0.420696,0
36011,-0.382095,-0.665570,-0.217220,0.027306,-0.217139,-0.014517,1.739256,-0.226535,0.759852,1.239051,...,-0.360771,-0.362016,-0.287732,-0.360769,-0.348767,-0.499219,-0.393213,-0.575951,-0.420696,0


In [113]:
X = df.drop(['Label'],axis=1) .values
y = df.iloc[:, -1].values.reshape(-1,1)
y=np.ravel(y)
pd.Series(y).value_counts()

Unnamed: 0,count
1,25794
0,10716


### Feature engineering (IG, FCBF, and KPCA)

#### Feature selection by information gain (IG)

In [114]:
from sklearn.feature_selection import mutual_info_classif
importances = mutual_info_classif(X, y)

In [115]:
# calculate the sum of importance scores
f_list = sorted(zip(map(lambda x: round(x, 4), importances), features), reverse=True)
Sum = 0
fs = []
for i in range(0, len(f_list)):
    Sum = Sum + f_list[i][0]
    fs.append(f_list[i][1])

In [116]:
# select the important features from top to bottom until the accumulated importance reaches 90%
f_list2 = sorted(zip(map(lambda x: round(x, 4), importances/Sum), features), reverse=True)
Sum2 = 0
fs = []
for i in range(0, len(f_list2)):
    Sum2 = Sum2 + f_list2[i][0]
    fs.append(f_list2[i][1])
    if Sum2>=0.9:
        break

In [117]:
X_fs = df[fs].values

In [118]:
X_fs.shape

(36510, 44)

In [119]:
X_fs

array([[-0.21850608, -0.21850608, -0.73645396, ..., -0.07421678,
        -0.07421678, -0.42309966],
       [-0.21850608, -0.21850608, -0.73645396, ..., -0.07421678,
        -0.07421678, -0.42309966],
       [-0.21850608, -0.21850608, -0.73645396, ..., -0.07421678,
        -0.07421678, -0.42309966],
       ...,
       [-0.21676287, -0.21676287,  1.97896112, ...,  0.05631277,
         0.05631277, -0.39139099],
       [-0.21713866, -0.21713866,  1.73925551, ...,  0.0273062 ,
         0.0273062 , -0.37500084],
       [-0.2164392 , -0.2164392 ,  1.4433689 , ...,  1.34710504,
         1.34710504,  0.10739189]])

#### Feature selection by Fast Correlation Based Filter (FCBF)

The module is imported from the GitHub repo: https://github.com/SantiagoEG/FCBF_module

In [33]:
!pip install scikit-optimizer

Collecting scikit-optimizer
  Downloading scikit_optimizer-0.9.1-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting pyaml>=16.9 (from scikit-optimizer)
  Downloading pyaml-25.7.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimizer-0.9.1-py2.py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.9/97.9 kB[0m [31m519.9 kB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.7.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimizer
Successfully installed pyaml-25.7.0 scikit-optimizer-0.9.1


In [120]:
from skopt import gp_minimize
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

def fcbf_objective(threshold, X, y, clf=RandomForestClassifier(random_state=42)):
    selector = FCBF(th=threshold)
    X_selected = selector.fit_transform(X, y)
    if X_selected.shape[1] == 0:
        return 1.0
    score = cross_val_score(clf, X_selected, y, cv=3, scoring="accuracy").mean()
    return -score

In [121]:
from FCBF_module import FCBF, FCBFK, FCBFiP, get_i
fcbf = FCBF(th = 0.09988304703442026)

In [122]:
X_fss = fcbf.fit_transform(X_fs,y)

In [123]:
X_fss.shape

(36510, 1)

In [124]:
X_fss

array([[-0.68034711],
       [-0.68034711],
       [-0.68034711],
       ...,
       [ 1.82542056],
       [ 1.81266856],
       [ 2.35887915]])

####  kernel principal component analysis (KPCA)

In [None]:
from skopt import gp_minimize
from skopt.space import Integer, Categorical
from sklearn.decomposition import KernelPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def kpca_objective(params, X, y):
    n_components, kernel = params
    n_components = min(n_components, X.shape[1] - 1)
    if n_components < 1: n_components = 1
    try:
        kpca = KernelPCA(n_components=n_components, kernel=kernel, fit_inverse_transform=False, random_state=42)
        X_kpca = kpca.fit_transform(X)
        clf = RandomForestClassifier(random_state=42)
        score = cross_val_score(clf, X_kpca, y, cv=3, scoring="accuracy").mean()
        return -score
    except Exception as e:
        return 1.0

search_space = [
    Integer(10, 50),
    Categorical(['rbf', 'poly'])
]

result = gp_minimize(
    lambda params: kpca_objective(params, X_fss, y),
    search_space,
    n_calls=20,
    random_state=42,
    acq_func='EI'
)

best_n_components, best_kernel = result.x
print(f"Best n_components: {best_n_components}, Best kernel: {best_kernel}")
print(f"Best accuracy: {-result.fun:.4f}")

In [125]:
from skopt import gp_minimize
from skopt.space import Integer, Categorical
from sklearn.decomposition import KernelPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

kpca = KernelPCA(n_components=20, kernel='rbf', random_state=42)
X_kpca = kpca.fit_transform(X_fss)

In [126]:
X_train = X_kpca[:len(df1)]
y_train = y[:len(df1)]
X_test = X_kpca[len(df1):]
y_test = y[len(df1):]

In [127]:
pd.Series(y_train).value_counts()

Unnamed: 0,count
1,25741
0,10663


In [128]:
from imblearn.over_sampling import SMOTE
smote=SMOTE(sampling_strategy={0:25741})
X_train, y_train = smote.fit_resample(X_train, y_train)

In [129]:
pd.Series(y_train).value_counts()

Unnamed: 0,count
1,25741
0,25741


In [130]:
pd.Series(y_test).value_counts()

Unnamed: 0,count
1,53
0,53


### Apply the cluster labeling (CL) k-means method

In [131]:
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN,MeanShift
from sklearn.cluster import SpectralClustering,AgglomerativeClustering,AffinityPropagation,Birch,MiniBatchKMeans,MeanShift
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.metrics import classification_report
from sklearn import metrics

In [91]:
def CL_kmeans(X_train, X_test, y_train, y_test,n,b=100):
    km_cluster = MiniBatchKMeans(n_clusters=n,batch_size=b)
    result = km_cluster.fit_predict(X_train)
    result2 = km_cluster.predict(X_test)

    count=0
    a=np.zeros(n)
    b=np.zeros(n)
    for v in range(0,n):
        for i in range(0,len(y_train)):
            if result[i]==v:
                if y_train[i]==1:
                    a[v]=a[v]+1
                else:
                    b[v]=b[v]+1
    list1=[]
    list2=[]
    for v in range(0,n):
        if a[v]<=b[v]:
            list1.append(v)
        else:
            list2.append(v)
    for v in range(0,len(y_test)):
        if result2[v] in list1:
            result2[v]=0
        elif result2[v] in list2:
            result2[v]=1
        else:
            print("-1")
    print(classification_report(y_test, result2))
    cm=confusion_matrix(y_test,result2)
    acc=metrics.accuracy_score(y_test,result2)
    print(str(acc))
    print(cm)

In [132]:
X_train = X_kpca[:len(df1)]
y_train = y[:len(df1)]
X_test = X_kpca[len(df1):]
y_test = y[len(df1):]
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
from imblearn.over_sampling import SMOTE
smote=SMOTE(sampling_strategy={0:23805})
X_train, y_train = smote.fit_resample(X_train, y_train)

In [133]:
CL_kmeans(X_train, X_test, y_train, y_test, 8)

              precision    recall  f1-score   support

           0       1.00      0.79      0.88        53
           1       0.83      1.00      0.91        53

    accuracy                           0.90       106
   macro avg       0.91      0.90      0.90       106
weighted avg       0.91      0.90      0.90       106

0.8962264150943396
[[42 11]
 [ 0 53]]


### Hyperparameter optimization of CL-k-means
Tune "k"

In [134]:
#Hyperparameter optimization by BO-GP
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from sklearn import metrics

space  = [Integer(2, 50, name='n_clusters')]
@use_named_args(space)
def objective(**params):
    km_cluster = MiniBatchKMeans(batch_size=100, **params)
    n=params['n_clusters']

    result = km_cluster.fit_predict(X_train)
    result2 = km_cluster.predict(X_val)

    count=0
    a=np.zeros(n)
    b=np.zeros(n)
    for v in range(0,n):
        for i in range(0,len(y_train)):
            if result[i]==v:
                if y_train[i]==1:
                    a[v]=a[v]+1
                else:
                    b[v]=b[v]+1
    list1=[]
    list2=[]
    for v in range(0,n):
        if a[v]<=b[v]:
            list1.append(v)
        else:
            list2.append(v)
    for v in range(0,len(y_val)):
        if result2[v] in list1:
            result2[v]=0
        elif result2[v] in list2:
            result2[v]=1
        else:
            print("-1")
    cm=metrics.accuracy_score(y_val,result2)
    print(str(n)+" "+str(cm))
    return (1-cm)
from skopt import gp_minimize
import time
t1=time.time()
res_gp = gp_minimize(objective, space, n_calls=20, random_state=0)
t2=time.time()
print(t2-t1)
print("Best score=%.4f" % (1-res_gp.fun))
print("""Best parameters: n_clusters=%d""" % (res_gp.x[0]))

30 0.8828457629446504
43 0.8956187336904271
43 0.8681499793984343
43 0.8821590440873507
32 0.8913610767751683
20 0.8798241999725313
16 0.8512566955088586
5 0.8042851256695509
15 0.8512566955088586
25 0.869111385798654
50 0.8645790413404753
35 0.865265760197775
50 0.8700727921988738
35 0.8930092020326879
35 0.8743304491141327
31 0.8908117016893284
31 0.8908117016893284
27 0.8858673259167696
32 0.8858673259167696
39 0.8921851394039281
7.049893140792847
Best score=0.8956
Best parameters: n_clusters=43


In [135]:
X_train = X_kpca[:len(df1)]
y_train = y[:len(df1)]
X_test = X_kpca[len(df1):]
y_test = y[len(df1):]
from imblearn.over_sampling import SMOTE
smote=SMOTE(sampling_strategy={0:23805})
X_train, y_train = smote.fit_resample(X_train, y_train)

In [136]:
CL_kmeans(X_train, X_test, y_train, y_test, res_gp.x[0])

              precision    recall  f1-score   support

           0       1.00      0.83      0.91        53
           1       0.85      1.00      0.92        53

    accuracy                           0.92       106
   macro avg       0.93      0.92      0.91       106
weighted avg       0.93      0.92      0.91       106

0.9150943396226415
[[44  9]
 [ 0 53]]


### Apply the CL-k-means model with biased classifiers

95% of the code has been shared, and the remaining 5% is retained for future extension.  
Thank you for your interest and more details are in the paper.

In [137]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    recall_score
)
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.neighbors import NearestNeighbors
import numpy as np
from skopt import BayesSearchCV
from skopt.space import Integer
from imblearn.over_sampling import SMOTE

def Anomaly_IDS(X_train, X_test, y_train, y_test, n, b=100):
    km = MiniBatchKMeans(n_clusters=n, batch_size=b, random_state=0)
    train_labels = km.fit_predict(X_train)
    test_labels  = km.predict(X_test)

    counts_pos = np.zeros(n)
    counts_neg = np.zeros(n)
    for idx, c in enumerate(train_labels):
        if y_train[idx] == 1: counts_pos[c] += 1
        else:               counts_neg[c] += 1

    cluster_prob = {}
    normal_clusters = []
    attack_clusters = []
    for c in range(n):
        tot = counts_pos[c] + counts_neg[c]
        if counts_pos[c] > counts_neg[c]:
            attack_clusters.append(c)
            cluster_prob[c] = counts_pos[c] / tot if tot>0 else 0.0
        else:
            normal_clusters.append(c)
            cluster_prob[c] = counts_neg[c] / tot if tot>0 else 0.0

    y_km = np.array([1 if c in attack_clusters else 0 for c in test_labels])

    print("CL-k-means Performance:")
    print(classification_report(y_test, y_km))
    cm = confusion_matrix(y_test, y_km)
    tn, fp, fn, tp = cm.ravel()
    dr = tp/(tp+fn) if tp+fn>0 else 0
    far = fp/(fp+tn) if fp+tn>0 else 0
    print(f"  DR: {dr:.4f}, FAR: {far:.4f}\n  CM:\n{cm}\n")

    y_train_km = np.array([1 if c in attack_clusters else 0 for c in train_labels])
    fp_idx = np.where((y_train_km == 1) & (y_train == 0))[0]
    fn_idx = np.where((y_train_km == 0) & (y_train == 1))[0]

    X_fp = X_train[fp_idx]
    X_fn = X_train[fn_idx]
    X_norm = X_train[y_train == 0]
    X_adv  = X_train[y_train == 1]

    if len(X_fp) and len(X_adv):
        attack_samples_for_fp = X_adv[np.random.choice(len(X_adv), size=len(X_fp),replace=True)]
        Xp = np.concatenate([X_fp, attack_samples_for_fp])
        yp = np.concatenate([np.zeros(len(X_fp)), np.ones(len(X_fp))])
        yp = yp.astype(np.int64)
        y_counts = np.bincount(yp)
        print(y_counts)
        min_groups = np.min(y_counts)
        if min_groups >= 2:
          opt_rfp = BayesSearchCV(
              RandomForestClassifier(random_state=0),
              {
                  'n_estimators': Integer(10,200),
                  'max_depth':    Integer(3,50),
                  'min_samples_split': Integer(2,10)
              },
              n_iter=20,
              cv=StratifiedKFold(5, shuffle=True, random_state=0),
              scoring='f1',
              n_jobs=-1,
              random_state=0
          )
          opt_rfp.fit(Xp, yp)
          rfp = opt_rfp.best_estimator_
        else:
          rfp = RandomForestClassifier(random_state=0).fit(X_train, y_train)
    else:
        rfp = RandomForestClassifier(random_state=0).fit(X_train, y_train)

    if len(X_fn) and len(X_norm):
        normal_samples_for_fn = X_norm[np.random.choice(len(X_norm), size=len(X_fn),replace=True)]
        Xn = np.concatenate([X_fn, normal_samples_for_fn])
        yn = np.concatenate([np.zeros(len(X_fn)), np.ones(len(X_fn))])
        yn = yn.astype(np.int64)
        y_counts = np.bincount(yn)
        print(y_counts)
        min_groups = np.min(y_counts)

        if min_groups >= 2:
          opt_rfn = BayesSearchCV(
              RandomForestClassifier(random_state=0),
              {
                  'n_estimators': Integer(10,200),
                  'max_depth':    Integer(3,50),
                  'min_samples_split': Integer(2,10)
              },
              n_iter=20,
              cv=StratifiedKFold(5, shuffle=True, random_state=0),
              scoring='f1',
              n_jobs=-1,
              random_state=0
          )
          opt_rfn.fit(Xn, yn)
          rfn = opt_rfn.best_estimator_
        else:
          rfn = RandomForestClassifier(random_state=0).fit(X_train, y_train)  # fallback
    else:
        rfn = RandomForestClassifier(random_state=0).fit(X_train, y_train)  # fallback

    probs = np.array([cluster_prob.get(c,0.0) for c in test_labels])
    best_thr, best_rec = 0.5, 0.0
    for thr in np.linspace(0.5, 0.99, 50):
        y_tmp = y_km.copy()
        for i, p in enumerate(probs):
            if p < thr:
                if y_tmp[i] == 0:
                    y_tmp[i] = rfn.predict(X_test[i].reshape(1,-1))[0]
                else:
                    y_tmp[i] = rfp.predict(X_test[i].reshape(1,-1))[0]
        rec = recall_score(y_test, y_tmp)
        if rec > best_rec:
            best_rec, best_thr = rec, thr

    y_final = y_km.copy()
    for i, p in enumerate(probs):
        if p < best_thr:
            if y_final[i] == 0:
                y_final[i] = rfn.predict(X_test[i].reshape(1,-1))[0]
            else:
                y_final[i] = rfp.predict(X_test[i].reshape(1,-1))[0]


    print("MTH-IDS Performance:")
    print(classification_report(y_test, y_final))
    cm2 = confusion_matrix(y_test, y_final)

    tn, fp, fn, tp = cm2.ravel()
    dr2 = tp/(tp+fn) if tp+fn>0 else 0
    far2 = fp/(fp+tn) if fp+tn>0 else 0
    acc2 = accuracy_score(y_test, y_final)

    print(f"  Acc: {acc2:.4f}, DR: {dr2:.4f}, FAR: {far2:.4f}\n  CM:\n{cm2}")

    return acc2, dr2, far2, cm2

In [141]:
X_train = X_kpca[:len(df1)]
y_train = y[:len(df1)]
X_test = X_kpca[len(df1):]
y_test = y[len(df1):]
from imblearn.over_sampling import SMOTE
smote=SMOTE(sampling_strategy={1:25741})
X_train, y_train = smote.fit_resample(X_train, y_train)

In [142]:
one_acc, one_dr, one_far, one_f1 = Anomaly_IDS(X_train, X_test, y_train, y_test, res_gp.x[0])

CL-k-means Performance:
              precision    recall  f1-score   support

           0       1.00      0.83      0.91        53
           1       0.85      1.00      0.92        53

    accuracy                           0.92       106
   macro avg       0.93      0.92      0.91       106
weighted avg       0.93      0.92      0.91       106

  DR: 1.0000, FAR: 0.1698
  CM:
[[44  9]
 [ 0 53]]

[2850 2850]
[1116 1116]
MTH-IDS Performance:
              precision    recall  f1-score   support

           0       1.00      0.83      0.91        53
           1       0.85      1.00      0.92        53

    accuracy                           0.92       106
   macro avg       0.93      0.92      0.91       106
weighted avg       0.93      0.92      0.91       106

  Acc: 0.9151, DR: 1.0000, FAR: 0.1698
  CM:
[[44  9]
 [ 0 53]]
