## Import libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_recall_fscore_support
from sklearn.metrics import f1_score,roc_auc_score
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost import plot_importance

## Anomaly-based IDS - Infiltration

### Generate the port-scan datasets for unknown attack detection

In [58]:
df=pd.read_csv('CICIDS2017_sample_km.csv')

In [59]:
df

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,2.565000,-0.439152,-0.005817,-0.009418,-0.040156,-0.007139,-0.211382,-0.210703,-0.153699,-0.164849,...,0.002545,-0.125734,-0.104565,-0.149326,-0.101016,-0.351926,-0.109460,-0.356868,-0.338993,0
1,2.759600,-0.439341,-0.009819,-0.009418,-0.051263,-0.007139,-0.246240,-0.210703,-0.213347,-0.182201,...,0.002545,-0.125734,-0.104565,-0.149326,-0.101016,-0.351926,-0.109460,-0.356868,-0.338993,0
2,2.725909,-0.439340,-0.009819,-0.009418,-0.051863,-0.007139,-0.246240,-0.310140,-0.229468,-0.167112,...,0.002556,-0.125734,-0.104565,-0.149326,-0.101016,-0.351926,-0.109460,-0.356868,-0.338993,0
3,2.967710,-0.439345,-0.009819,-0.010421,-0.051263,-0.007142,-0.246240,-0.210703,-0.213347,-0.182201,...,0.002545,-0.125734,-0.104565,-0.149326,-0.101016,-0.351926,-0.109460,-0.356868,-0.338993,0
4,2.683412,-0.430518,-0.008485,-0.008416,-0.045960,-0.007137,-0.180706,-0.210703,-0.151549,-0.097234,...,0.002545,-0.125734,-0.104565,-0.149326,-0.101016,-0.351926,-0.109460,-0.356868,-0.338993,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25485,-0.417175,2.529503,0.022195,0.015647,0.612863,-0.007073,1.333548,-0.210703,1.066630,1.056629,...,0.002545,0.847478,0.135514,0.572316,0.898203,1.023440,4.126922,1.900294,0.555568,9
25486,-0.417175,2.871035,0.015525,0.010634,0.403729,-0.007086,0.693545,-0.210703,0.860244,0.661178,...,0.002545,0.773475,-0.099716,0.420533,0.907276,1.967152,-0.109145,1.892086,2.006554,9
25487,-0.417175,1.922710,0.042204,0.028681,1.412575,-0.007038,1.746272,-0.210703,1.609455,1.434458,...,0.002545,4.152361,7.829669,4.707039,0.882819,1.201179,5.539049,1.904398,0.444283,9
25488,-0.417175,1.000343,-0.003150,-0.002400,0.141060,-0.007121,1.053285,-0.210703,1.191111,1.068873,...,0.002545,-0.125734,-0.104565,-0.149326,-0.101016,-0.351926,-0.109460,-0.356868,-0.338993,9


In [60]:
df.Label.value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
0,10226
4,4619
10,3178
2,2564
1,1966
12,1507
14,652
3,208
7,155
6,118


In [61]:
df1 = df[df['Label'] != 13]
df1['Label'][df1['Label'] > 0] = 1
df1.to_csv('./CICIDS2017_sample_km_without_infiltration.csv',index=0)

In [62]:
df2 = df[df['Label'] == 13]
df2['Label'][df2['Label'] == 13] = 1
df2.to_csv('./CICIDS2017_sample_km_infiltration.csv',index=0)

### Read the generated datasets for unknown attack detection

In [63]:
df1 = pd.read_csv('./CICIDS2017_sample_km_without_infiltration.csv')
df2 = pd.read_csv('./CICIDS2017_sample_km_infiltration.csv')

In [64]:
features = df1.drop(['Label'],axis=1).dtypes[df1.dtypes != 'object'].index
df1[features] = df1[features].apply(
    lambda x: (x - x.mean()) / (x.std()))
df2[features] = df2[features].apply(
    lambda x: (x - x.mean()) / (x.std()))
df1.replace([np.inf, -np.inf], np.nan, inplace=True)
df2.replace([np.inf, -np.inf], np.nan, inplace=True)
df1 = df1.fillna(0)
df2 = df2.fillna(0)

In [65]:
df1.Label.value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
1,15243
0,10226


In [66]:
df2.Label.value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
1,21


In [67]:
df2p=df1[df1['Label']==0]
df2pp=df2p.sample(n=None, frac=21/10226, replace=False, weights=None, random_state=None, axis=0)
df2=pd.concat([df2, df2pp])

In [68]:
df2.Label.value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
1,21
0,21


In [69]:
df = pd.concat([df1, df2])

In [70]:
df

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,3.164706,-0.506792,-0.009495,-0.011584,-0.033575,-0.009654,-0.149953,-0.083295,-0.081770,-0.149710,...,-1.001204,-0.122805,-0.080004,-0.133844,-0.108046,-0.433087,-0.130463,-0.440349,-0.417809,0
1,3.395814,-0.506970,-0.013399,-0.011584,-0.037043,-0.009654,-0.180545,-0.083295,-0.132827,-0.168939,...,-1.001204,-0.122805,-0.080004,-0.133844,-0.108046,-0.433087,-0.130463,-0.440349,-0.417809,0
2,3.355802,-0.506969,-0.013399,-0.011584,-0.037231,-0.009654,-0.180545,-0.198613,-0.146627,-0.152217,...,0.782738,-0.122805,-0.080004,-0.133844,-0.108046,-0.433087,-0.130463,-0.440349,-0.417809,0
3,3.642967,-0.506974,-0.013399,-0.012590,-0.037043,-0.009657,-0.180545,-0.083295,-0.132827,-0.168939,...,-1.001204,-0.122805,-0.080004,-0.133844,-0.108046,-0.433087,-0.130463,-0.440349,-0.417809,0
4,3.305333,-0.498659,-0.012097,-0.010578,-0.035387,-0.009652,-0.123031,-0.083295,-0.079930,-0.074776,...,-1.001204,-0.122805,-0.080004,-0.133844,-0.108046,-0.433087,-0.130463,-0.440349,-0.417809,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,-0.400592,2.738774,0.004819,0.001492,-0.024703,-0.008845,0.310163,-0.198613,-0.093729,0.187045,...,0.782738,-0.075137,-0.055397,-0.070596,-0.059210,-0.108767,-0.129298,-0.124510,-0.091111,0
4626,-0.402345,2.343812,-0.013399,-0.010578,-0.034356,-0.009439,-0.135268,0.858465,0.064964,-0.202381,...,-1.001204,-0.075080,-0.080004,-0.100102,-0.054086,2.874971,-0.130463,2.781210,2.915504,0
3984,2.787709,-0.506975,-0.013399,-0.012590,-0.038074,-0.009657,-0.216033,-0.160173,-0.208724,-0.238611,...,-0.406557,-0.122805,-0.080004,-0.133844,-0.108046,-0.433087,-0.130463,-0.440349,-0.417809,0
8263,-0.402345,-0.506107,-0.014700,-0.011584,-0.036856,-0.009565,-0.165861,0.627830,-0.020132,-0.238611,...,-1.001204,-0.122805,-0.080004,-0.133844,-0.108046,-0.433087,-0.130463,-0.440349,-0.417809,0


In [71]:
X = df.drop(['Label'],axis=1) .values
y = df.iloc[:, -1].values.reshape(-1,1)
y=np.ravel(y)
pd.Series(y).value_counts()

Unnamed: 0,count
1,15264
0,10247


### Feature engineering (IG, FCBF, and KPCA)

#### Feature selection by information gain (IG)

In [72]:
from sklearn.feature_selection import mutual_info_classif
importances = mutual_info_classif(X, y)

In [73]:
# calculate the sum of importance scores
f_list = sorted(zip(map(lambda x: round(x, 4), importances), features), reverse=True)
Sum = 0
fs = []
for i in range(0, len(f_list)):
    Sum = Sum + f_list[i][0]
    fs.append(f_list[i][1])

In [74]:
# select the important features from top to bottom until the accumulated importance reaches 90%
f_list2 = sorted(zip(map(lambda x: round(x, 4), importances/Sum), features), reverse=True)
Sum2 = 0
fs = []
for i in range(0, len(f_list2)):
    Sum2 = Sum2 + f_list2[i][0]
    fs.append(f_list2[i][1])
    if Sum2>=0.9:
        break

In [75]:
X_fs = df[fs].values

In [76]:
X_fs.shape

(25511, 44)

In [77]:
X_fs

array([[ 3.16470567, -0.4043448 , -0.51435501, ..., -0.45291352,
        -0.0094948 , -0.14970957],
       [ 3.39581441, -0.40449075, -0.52487493, ..., -0.45310661,
        -0.01339851, -0.16893868],
       [ 3.35580233, -0.4044654 , -0.52262861, ..., -0.45310661,
        -0.01339851, -0.15221734],
       ...,
       [ 2.78770872, -0.40458027, -0.54179206, ..., -0.45310661,
        -0.01339851, -0.23861096],
       [-0.40234537, -0.40100473, -0.43487583, ..., -0.45310661,
        -0.01469974, -0.23861096],
       [ 0.88037959, -0.40458027, -0.54179206, ..., -0.45310661,
        -0.01339851, -0.23861096]])

#### Feature selection by Fast Correlation Based Filter (FCBF)

The module is imported from the GitHub repo: https://github.com/SantiagoEG/FCBF_module

In [35]:
!pip install scikit-optimizer

Collecting scikit-optimizer
  Downloading scikit_optimizer-0.9.1-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting pyaml>=16.9 (from scikit-optimizer)
  Downloading pyaml-25.7.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimizer-0.9.1-py2.py3-none-any.whl (97 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/97.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.9/97.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.7.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimizer
Successfully installed pyaml-25.7.0 scikit-optimizer-0.9.1


In [78]:
from skopt import gp_minimize
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

def fcbf_objective(threshold, X, y, clf=RandomForestClassifier(random_state=42)):
    selector = FCBF(th=threshold)
    X_selected = selector.fit_transform(X, y)
    if X_selected.shape[1] == 0:
        return 1.0
    score = cross_val_score(clf, X_selected, y, cv=3, scoring="accuracy").mean()
    return -score

In [79]:
from FCBF_module import FCBF, FCBFK, FCBFiP, get_i
fcbf = FCBF(th = 0.09988304703442026)

In [80]:
X_fss = fcbf.fit_transform(X_fs,y)

In [81]:
X_fss.shape

(25511, 1)

In [82]:
X_fss

array([[ 3.16470567],
       [ 3.39581441],
       [ 3.35580233],
       ...,
       [ 2.78770872],
       [-0.40234537],
       [ 0.88037959]])

####  kernel principal component analysis (KPCA)

In [None]:
from skopt import gp_minimize
from skopt.space import Integer, Categorical
from sklearn.decomposition import KernelPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def kpca_objective(params, X, y):
    n_components, kernel = params
    n_components = min(n_components, X.shape[1] - 1)
    if n_components < 1: n_components = 1
    try:
        kpca = KernelPCA(n_components=n_components, kernel=kernel, fit_inverse_transform=False, random_state=42)
        X_kpca = kpca.fit_transform(X)
        clf = RandomForestClassifier(random_state=42)
        score = cross_val_score(clf, X_kpca, y, cv=3, scoring="accuracy").mean()
        return -score
    except Exception as e:
        return 1.0

search_space = [
    Integer(10, 50),
    Categorical(['rbf', 'poly'])
]

result = gp_minimize(
    lambda params: kpca_objective(params, X_fss, y),
    search_space,
    n_calls=20,
    random_state=42,
    acq_func='EI'
)

best_n_components, best_kernel = result.x
print(f"Best n_components: {best_n_components}, Best kernel: {best_kernel}")
print(f"Best accuracy: {-result.fun:.4f}")

In [83]:
from skopt import gp_minimize
from skopt.space import Integer, Categorical
from sklearn.decomposition import KernelPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

kpca = KernelPCA(n_components=20, kernel='rbf', random_state=42)
X_kpca = kpca.fit_transform(X_fss)

In [84]:
X_train = X_kpca[:len(df1)]
y_train = y[:len(df1)]
X_test = X_kpca[len(df1):]
y_test = y[len(df1):]

In [85]:
pd.Series(y_train).value_counts()

Unnamed: 0,count
1,15243
0,10226


In [86]:
from imblearn.over_sampling import SMOTE
smote=SMOTE(sampling_strategy={0:15253})
X_train, y_train = smote.fit_resample(X_train, y_train)

In [87]:
pd.Series(y_train).value_counts()

Unnamed: 0,count
0,15253
1,15243


In [88]:
pd.Series(y_test).value_counts()

Unnamed: 0,count
1,21
0,21


### Apply the cluster labeling (CL) k-means method

In [47]:
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN,MeanShift
from sklearn.cluster import SpectralClustering,AgglomerativeClustering,AffinityPropagation,Birch,MiniBatchKMeans,MeanShift
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.metrics import classification_report
from sklearn import metrics

In [48]:
def CL_kmeans(X_train, X_test, y_train, y_test,n,b=100):
    km_cluster = MiniBatchKMeans(n_clusters=n,batch_size=b)
    result = km_cluster.fit_predict(X_train)
    result2 = km_cluster.predict(X_test)

    count=0
    a=np.zeros(n)
    b=np.zeros(n)
    for v in range(0,n):
        for i in range(0,len(y_train)):
            if result[i]==v:
                if y_train[i]==1:
                    a[v]=a[v]+1
                else:
                    b[v]=b[v]+1
    list1=[]
    list2=[]
    for v in range(0,n):
        if a[v]<=b[v]:
            list1.append(v)
        else:
            list2.append(v)
    for v in range(0,len(y_test)):
        if result2[v] in list1:
            result2[v]=0
        elif result2[v] in list2:
            result2[v]=1
        else:
            print("-1")
    print(classification_report(y_test, result2))
    cm=confusion_matrix(y_test,result2)
    acc=metrics.accuracy_score(y_test,result2)
    print(str(acc))
    print(cm)

In [89]:
X_train = X_kpca[:len(df1)]
y_train = y[:len(df1)]
X_test = X_kpca[len(df1):]
y_test = y[len(df1):]
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
from imblearn.over_sampling import SMOTE
smote=SMOTE(sampling_strategy={0:15253})
X_train, y_train = smote.fit_resample(X_train, y_train)

In [90]:
CL_kmeans(X_train, X_test, y_train, y_test, 8)

              precision    recall  f1-score   support

           0       1.00      0.90      0.95        21
           1       0.91      1.00      0.95        21

    accuracy                           0.95        42
   macro avg       0.96      0.95      0.95        42
weighted avg       0.96      0.95      0.95        42

0.9523809523809523
[[19  2]
 [ 0 21]]


### Hyperparameter optimization of CL-k-means
Tune "k"

In [91]:
#Hyperparameter optimization by BO-GP
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from sklearn import metrics

space  = [Integer(2, 50, name='n_clusters')]
@use_named_args(space)
def objective(**params):
    km_cluster = MiniBatchKMeans(batch_size=100, **params)
    n=params['n_clusters']

    result = km_cluster.fit_predict(X_train)
    result2 = km_cluster.predict(X_val)

    count=0
    a=np.zeros(n)
    b=np.zeros(n)
    for v in range(0,n):
        for i in range(0,len(y_train)):
            if result[i]==v:
                if y_train[i]==1:
                    a[v]=a[v]+1
                else:
                    b[v]=b[v]+1
    list1=[]
    list2=[]
    for v in range(0,n):
        if a[v]<=b[v]:
            list1.append(v)
        else:
            list2.append(v)
    for v in range(0,len(y_val)):
        if result2[v] in list1:
            result2[v]=0
        elif result2[v] in list2:
            result2[v]=1
        else:
            print("-1")
    cm=metrics.accuracy_score(y_val,result2)
    print(str(n)+" "+str(cm))
    return (1-cm)
from skopt import gp_minimize
import time
t1=time.time()
res_gp = gp_minimize(objective, space, n_calls=20, random_state=0)
t2=time.time()
print(t2-t1)
print("Best score=%.4f" % (1-res_gp.fun))
print("""Best parameters: n_clusters=%d""" % (res_gp.x[0]))

30 0.8890851982724775
43 0.9279544562230074
43 0.9267765999214762
43 0.9250098154691794
32 0.9216725559481743
20 0.8780918727915195
16 0.8747546132705143
5 0.49627012171181784
15 0.8557126030624264
25 0.9202983902630546
34 0.9214762465645858
50 0.9218688653317628
46 0.9267765999214762
40 0.9271692186886533
37 0.9248135060855909
48 0.9305064782096584
23 0.8871221044365921
47 0.9273655280722418
39 0.9261876717707106
27 0.892226148409894
6.189974784851074
Best score=0.9305
Best parameters: n_clusters=48


In [92]:
X_train = X_kpca[:len(df1)]
y_train = y[:len(df1)]
X_test = X_kpca[len(df1):]
y_test = y[len(df1):]
from imblearn.over_sampling import SMOTE
smote=SMOTE(sampling_strategy={0:15253})
X_train, y_train = smote.fit_resample(X_train, y_train)

In [93]:
CL_kmeans(X_train, X_test, y_train, y_test, res_gp.x[0])

              precision    recall  f1-score   support

           0       1.00      0.71      0.83        21
           1       0.78      1.00      0.88        21

    accuracy                           0.86        42
   macro avg       0.89      0.86      0.85        42
weighted avg       0.89      0.86      0.85        42

0.8571428571428571
[[15  6]
 [ 0 21]]


### Apply the CL-k-means model with biased classifiers

95% of the code has been shared, and the remaining 5% is retained for future extension.  
Thank you for your interest and more details are in the paper.

In [55]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    recall_score
)
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.neighbors import NearestNeighbors
import numpy as np
from skopt import BayesSearchCV
from skopt.space import Integer
from imblearn.over_sampling import SMOTE

def Anomaly_IDS(X_train, X_test, y_train, y_test, n, b=100):
    km = MiniBatchKMeans(n_clusters=n, batch_size=b, random_state=0)
    train_labels = km.fit_predict(X_train)
    test_labels  = km.predict(X_test)

    counts_pos = np.zeros(n)
    counts_neg = np.zeros(n)
    for idx, c in enumerate(train_labels):
        if y_train[idx] == 1: counts_pos[c] += 1
        else:               counts_neg[c] += 1

    cluster_prob = {}
    normal_clusters = []
    attack_clusters = []
    for c in range(n):
        tot = counts_pos[c] + counts_neg[c]
        if counts_pos[c] > counts_neg[c]:
            attack_clusters.append(c)
            cluster_prob[c] = counts_pos[c] / tot if tot>0 else 0.0
        else:
            normal_clusters.append(c)
            cluster_prob[c] = counts_neg[c] / tot if tot>0 else 0.0

    y_km = np.array([1 if c in attack_clusters else 0 for c in test_labels])

    print("CL-k-means Performance:")
    print(classification_report(y_test, y_km))
    cm = confusion_matrix(y_test, y_km)
    tn, fp, fn, tp = cm.ravel()
    dr = tp/(tp+fn) if tp+fn>0 else 0
    far = fp/(fp+tn) if fp+tn>0 else 0
    print(f"  DR: {dr:.4f}, FAR: {far:.4f}\n  CM:\n{cm}\n")

    y_train_km = np.array([1 if c in attack_clusters else 0 for c in train_labels])
    fp_idx = np.where((y_train_km == 1) & (y_train == 0))[0]
    fn_idx = np.where((y_train_km == 0) & (y_train == 1))[0]

    X_fp = X_train[fp_idx]
    X_fn = X_train[fn_idx]
    X_norm = X_train[y_train == 0]
    X_adv  = X_train[y_train == 1]

    if len(X_fp) and len(X_adv):
        attack_samples_for_fp = X_adv[np.random.choice(len(X_adv), size=len(X_fp),replace=True)]
        Xp = np.concatenate([X_fp, attack_samples_for_fp])
        yp = np.concatenate([np.zeros(len(X_fp)), np.ones(len(X_fp))])
        yp = yp.astype(np.int64)
        y_counts = np.bincount(yp)
        print(y_counts)
        min_groups = np.min(y_counts)
        if min_groups >= 2:
          opt_rfp = BayesSearchCV(
              RandomForestClassifier(random_state=0),
              {
                  'n_estimators': Integer(10,200),
                  'max_depth':    Integer(3,50),
                  'min_samples_split': Integer(2,10)
              },
              n_iter=20,
              cv=StratifiedKFold(5, shuffle=True, random_state=0),
              scoring='f1',
              n_jobs=-1,
              random_state=0
          )
          opt_rfp.fit(Xp, yp)
          rfp = opt_rfp.best_estimator_
        else:
          rfp = RandomForestClassifier(random_state=0).fit(X_train, y_train)
    else:
        rfp = RandomForestClassifier(random_state=0).fit(X_train, y_train)

    if len(X_fn) and len(X_norm):
        normal_samples_for_fn = X_norm[np.random.choice(len(X_norm), size=len(X_fn),replace=True)]
        Xn = np.concatenate([X_fn, normal_samples_for_fn])
        yn = np.concatenate([np.zeros(len(X_fn)), np.ones(len(X_fn))])
        yn = yn.astype(np.int64)
        y_counts = np.bincount(yn)
        print(y_counts)
        min_groups = np.min(y_counts)

        if min_groups >= 2:
          opt_rfn = BayesSearchCV(
              RandomForestClassifier(random_state=0),
              {
                  'n_estimators': Integer(10,200),
                  'max_depth':    Integer(3,50),
                  'min_samples_split': Integer(2,10)
              },
              n_iter=20,
              cv=StratifiedKFold(5, shuffle=True, random_state=0),
              scoring='f1',
              n_jobs=-1,
              random_state=0
          )
          opt_rfn.fit(Xn, yn)
          rfn = opt_rfn.best_estimator_
        else:
          rfn = RandomForestClassifier(random_state=0).fit(X_train, y_train)  # fallback
    else:
        rfn = RandomForestClassifier(random_state=0).fit(X_train, y_train)  # fallback

    probs = np.array([cluster_prob.get(c,0.0) for c in test_labels])
    best_thr, best_rec = 0.5, 0.0
    for thr in np.linspace(0.5, 0.99, 50):
        y_tmp = y_km.copy()
        for i, p in enumerate(probs):
            if p < thr:
                if y_tmp[i] == 0:
                    y_tmp[i] = rfn.predict(X_test[i].reshape(1,-1))[0]
                else:
                    y_tmp[i] = rfp.predict(X_test[i].reshape(1,-1))[0]
        rec = recall_score(y_test, y_tmp)
        if rec > best_rec:
            best_rec, best_thr = rec, thr

    y_final = y_km.copy()
    for i, p in enumerate(probs):
        if p < best_thr:
            if y_final[i] == 0:
                y_final[i] = rfn.predict(X_test[i].reshape(1,-1))[0]
            else:
                y_final[i] = rfp.predict(X_test[i].reshape(1,-1))[0]


    print("MTH-IDS Performance:")
    print(classification_report(y_test, y_final))
    cm2 = confusion_matrix(y_test, y_final)

    tn, fp, fn, tp = cm2.ravel()
    dr2 = tp/(tp+fn) if tp+fn>0 else 0
    far2 = fp/(fp+tn) if fp+tn>0 else 0
    acc2 = accuracy_score(y_test, y_final)

    print(f"  Acc: {acc2:.4f}, DR: {dr2:.4f}, FAR: {far2:.4f}\n  CM:\n{cm2}")

    return acc2, dr2, far2, cm2

In [94]:
X_train = X_kpca[:len(df1)]
y_train = y[:len(df1)]
X_test = X_kpca[len(df1):]
y_test = y[len(df1):]
from imblearn.over_sampling import SMOTE
smote=SMOTE(sampling_strategy={0:15253})
X_train, y_train = smote.fit_resample(X_train, y_train)

In [96]:
one_acc, one_dr, one_far, one_f1 = Anomaly_IDS(X_train, X_test, y_train, y_test, res_gp.x[0])

CL-k-means Performance:
              precision    recall  f1-score   support

           0       1.00      0.71      0.83        21
           1       0.78      1.00      0.88        21

    accuracy                           0.86        42
   macro avg       0.89      0.86      0.85        42
weighted avg       0.89      0.86      0.85        42

  DR: 1.0000, FAR: 0.2857
  CM:
[[15  6]
 [ 0 21]]

[1403 1403]
[752 752]
MTH-IDS Performance:
              precision    recall  f1-score   support

           0       1.00      0.71      0.83        21
           1       0.78      1.00      0.88        21

    accuracy                           0.86        42
   macro avg       0.89      0.86      0.85        42
weighted avg       0.89      0.86      0.85        42

  Acc: 0.8571, DR: 1.0000, FAR: 0.2857
  CM:
[[15  6]
 [ 0 21]]
