## Import libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_recall_fscore_support
from sklearn.metrics import f1_score,roc_auc_score
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost import plot_importance

## Anomaly-based IDS - Infiltration

### Generate the port-scan datasets for unknown attack detection

In [3]:
df=pd.read_csv('CICIDS2018_sample_km.csv')

In [4]:
df

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,-0.475925,-0.005346,-0.020438,-0.040784,-0.020339,-0.022177,-0.830722,-0.361301,-0.961988,-0.776052,...,0.187746,-0.070847,-0.057248,-0.085188,-0.058141,0.007993,-0.001875,0.002505,0.039759,6
1,-0.475925,-0.005361,-0.020438,-0.040784,-0.020339,-0.022177,-0.830722,-0.361301,-0.961988,-0.776052,...,0.187746,-0.070847,-0.057248,-0.085188,-0.058141,0.007993,-0.001875,0.002505,0.039759,6
2,-0.475925,-0.005399,-0.020438,-0.040784,-0.020339,-0.022177,-0.830722,-0.361301,-0.961988,-0.776052,...,0.187746,-0.070847,-0.057248,-0.085188,-0.058141,0.007750,-0.001875,0.002402,0.038739,6
3,-0.475925,-0.005334,-0.020438,-0.040784,-0.020339,-0.022177,-0.830722,-0.361301,-0.961988,-0.776052,...,0.187746,-0.070847,-0.057248,-0.085188,-0.058141,0.007993,-0.001875,0.002505,0.039759,6
4,-0.475925,-0.005453,-0.020438,-0.040784,-0.020339,-0.022177,-0.830722,-0.361301,-0.961988,-0.776052,...,0.187746,-0.070847,-0.057248,-0.085188,-0.058141,0.007750,-0.001875,0.002402,0.038739,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36452,1.944820,-0.018959,-0.020859,-0.035722,-0.019875,-0.021916,-0.704356,1.437948,-0.330935,-0.776052,...,-1.492618,-0.070847,-0.057248,-0.085188,-0.058141,-0.017585,-0.001875,-0.008251,-0.067285,0
36453,1.944820,-0.018959,-0.020859,-0.035722,-0.019875,-0.021916,-0.704356,1.437948,-0.330935,-0.776052,...,-1.492618,-0.070847,-0.057248,-0.085188,-0.058141,-0.017585,-0.001875,-0.008251,-0.067285,0
36454,1.944820,-0.018959,-0.020859,-0.035722,-0.019875,-0.021916,-0.704356,1.437948,-0.330935,-0.776052,...,-1.492618,-0.070847,-0.057248,-0.085188,-0.058141,-0.017585,-0.001875,-0.008251,-0.067285,0
36455,1.944820,-0.018959,-0.020859,-0.035722,-0.019885,-0.021920,-0.707164,1.397965,-0.344959,-0.776052,...,-1.492618,-0.070847,-0.057248,-0.085188,-0.058141,-0.017585,-0.001875,-0.008251,-0.067285,0


In [5]:
df.Label.value_counts()

Label
0     10663
10     9908
6      5748
4      1989
5      1730
8      1448
1      1440
12     1183
14      943
2       568
7       415
3       229
13       85
9        55
11       53
Name: count, dtype: int64

In [6]:
df1 = df[df['Label'] != 8]
df1['Label'][df1['Label'] > 0] = 1
df1.to_csv('./CICIDS2017_sample_km_without_infiltration.csv',index=0)

In [7]:
df2 = df[df['Label'] == 8]
df2['Label'][df2['Label'] == 8] = 1
df2.to_csv('./CICIDS2017_sample_km_infiltration.csv',index=0)

### Read the generated datasets for unknown attack detection

In [8]:
df1 = pd.read_csv('./CICIDS2017_sample_km_without_infiltration.csv')
df2 = pd.read_csv('./CICIDS2017_sample_km_infiltration.csv')

In [9]:
features = df1.drop(['Label'],axis=1).dtypes[df1.dtypes != 'object'].index
df1[features] = df1[features].apply(
    lambda x: (x - x.mean()) / (x.std()))
df2[features] = df2[features].apply(
    lambda x: (x - x.mean()) / (x.std()))
df1.replace([np.inf, -np.inf], np.nan, inplace=True)
df2.replace([np.inf, -np.inf], np.nan, inplace=True)
df1 = df1.fillna(0)
df2 = df2.fillna(0)

In [10]:
df1.Label.value_counts()

Label
1    24346
0    10663
Name: count, dtype: int64

In [11]:
df2.Label.value_counts()

Label
1    1448
Name: count, dtype: int64

In [12]:
df2p=df1[df1['Label']==0]
df2pp=df2p.sample(n=None, frac=1448/10663, replace=False, weights=None, random_state=None, axis=0)
df2=pd.concat([df2, df2pp])

In [13]:
df2.Label.value_counts()

Label
1    1448
0    1448
Name: count, dtype: int64

In [14]:
df = pd.concat([df1, df2])

In [15]:
df

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,-0.390772,-0.541530,-0.221991,-0.075262,-0.223026,-0.030980,-0.749238,-0.231241,-0.882992,-0.729331,...,-0.323348,-0.370122,-0.293894,-0.368841,-0.356509,-0.142145,-0.402211,-0.255587,-0.058958,1
1,-0.390772,-0.541770,-0.221991,-0.075262,-0.223026,-0.030980,-0.749238,-0.231241,-0.882992,-0.729331,...,-0.323348,-0.370122,-0.293894,-0.368841,-0.356509,-0.142145,-0.402211,-0.255587,-0.058958,1
2,-0.390772,-0.542384,-0.221991,-0.075262,-0.223026,-0.030980,-0.749238,-0.231241,-0.882992,-0.729331,...,-0.323348,-0.370122,-0.293894,-0.368841,-0.356509,-0.145657,-0.402211,-0.258777,-0.062491,1
3,-0.390772,-0.541337,-0.221991,-0.075262,-0.223026,-0.030980,-0.749238,-0.231241,-0.882992,-0.729331,...,-0.323348,-0.370122,-0.293894,-0.368841,-0.356509,-0.142145,-0.402211,-0.255587,-0.058958,1
4,-0.390772,-0.543250,-0.221991,-0.075262,-0.223026,-0.030980,-0.749238,-0.231241,-0.882992,-0.729331,...,-0.323348,-0.370122,-0.293894,-0.368841,-0.356509,-0.145657,-0.402211,-0.258777,-0.062491,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32495,-0.390772,-0.760294,-0.221953,-0.018367,-0.222834,-0.026158,-0.151991,-0.231241,-0.192110,0.116560,...,-0.323348,-0.370122,-0.293894,-0.368841,-0.356509,-0.510966,-0.402211,-0.590594,-0.429854,0
27324,-1.935808,1.589060,-0.221953,-0.075262,-0.223026,-0.030980,-0.749238,-0.231241,-0.882992,-0.729331,...,-2.523899,-0.370122,-0.293894,-0.368841,-0.356509,1.467301,-0.402198,1.206303,1.559539,0
33228,2.441795,-0.759891,-0.221991,-0.046815,-0.222931,-0.029557,-0.600853,0.858936,-0.368049,-0.729331,...,-1.643678,-0.370122,-0.293894,-0.368841,-0.356509,-0.510966,-0.402211,-0.590594,-0.429854,0
25178,-0.390772,-0.661105,-0.221572,0.038529,-0.221323,-0.013917,1.940228,-0.231241,0.534093,1.058576,...,-0.323348,-0.370122,-0.293894,-0.368841,-0.356509,-0.510966,-0.402211,-0.590594,-0.429854,0


In [16]:
X = df.drop(['Label'],axis=1) .values
y = df.iloc[:, -1].values.reshape(-1,1)
y=np.ravel(y)
pd.Series(y).value_counts()

1    25794
0    12111
Name: count, dtype: int64

### Feature engineering (IG, FCBF, and KPCA)

#### Feature selection by information gain (IG)

In [17]:
from sklearn.feature_selection import mutual_info_classif
importances = mutual_info_classif(X, y)

In [18]:
# calculate the sum of importance scores
f_list = sorted(zip(map(lambda x: round(x, 4), importances), features), reverse=True)
Sum = 0
fs = []
for i in range(0, len(f_list)):
    Sum = Sum + f_list[i][0]
    fs.append(f_list[i][1])

In [19]:
# select the important features from top to bottom until the accumulated importance reaches 90%
f_list2 = sorted(zip(map(lambda x: round(x, 4), importances/Sum), features), reverse=True)
Sum2 = 0
fs = []
for i in range(0, len(f_list2)):
    Sum2 = Sum2 + f_list2[i][0]
    fs.append(f_list2[i][1])
    if Sum2>=0.9:
        break

In [20]:
X_fs = df[fs].values

In [21]:
X_fs.shape

(37905, 46)

In [22]:
X_fs

array([[-0.22302574, -0.22302574, -0.74923768, ..., -0.43199174,
        -0.25558696, -0.05895824],
       [-0.22302574, -0.22302574, -0.74923768, ..., -0.43199174,
        -0.25558696, -0.05895824],
       [-0.22302574, -0.22302574, -0.74923768, ..., -0.43199174,
        -0.2587775 , -0.06249058],
       ...,
       [-0.22293055, -0.22293055, -0.60085337, ..., -0.43199174,
        -0.5905936 , -0.42985402],
       [-0.22132296, -0.22132296,  1.94022799, ..., -0.39779894,
        -0.5905936 , -0.42985402],
       [-0.22191316, -0.22191316,  2.71924564, ..., -0.42561907,
        -0.41919154, -0.24008972]])

#### Feature selection by Fast Correlation Based Filter (FCBF)

The module is imported from the GitHub repo: https://github.com/SantiagoEG/FCBF_module

In [25]:
!pip install scikit-optimizer



In [23]:
from skopt import gp_minimize
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

def fcbf_objective(threshold, X, y, clf=RandomForestClassifier(random_state=42)):
    selector = FCBF(th=threshold)
    X_selected = selector.fit_transform(X, y)
    if X_selected.shape[1] == 0:
        return 1.0
    score = cross_val_score(clf, X_selected, y, cv=3, scoring="accuracy").mean()
    return -score

In [27]:
from FCBF_module import FCBF, FCBFK, FCBFiP, get_i

res = gp_minimize(
    lambda th: fcbf_objective(th[0], X_fs, y),
    dimensions=[(0.01, 0.5)],
    n_calls=20,
    random_state=42,
    acq_func='EI',
)

best_threshold = res.x[0]
print("Best threshold:", best_threshold)
print("Best accuracy:", -res.fun)

KeyboardInterrupt: 

In [24]:
from FCBF_module import FCBF, FCBFK, FCBFiP, get_i
fcbf = FCBF(th = 0.09988304703442026)

In [25]:
X_fss = fcbf.fit_transform(X_fs,y)

In [26]:
X_fss.shape

(37905, 1)

In [27]:
X_fss

array([[-0.68893634],
       [-0.68893634],
       [-0.68893634],
       ...,
       [-0.5368432 ],
       [ 1.7931392 ],
       [ 0.14123871]])

####  kernel principal component analysis (KPCA)

In [None]:
from skopt import gp_minimize
from skopt.space import Integer, Categorical
from sklearn.decomposition import KernelPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def kpca_objective(params, X, y):
    n_components, kernel = params
    n_components = min(n_components, X.shape[1] - 1)
    if n_components < 1: n_components = 1
    try:
        kpca = KernelPCA(n_components=n_components, kernel=kernel, fit_inverse_transform=False, random_state=42)
        X_kpca = kpca.fit_transform(X)
        clf = RandomForestClassifier(random_state=42)
        score = cross_val_score(clf, X_kpca, y, cv=3, scoring="accuracy").mean()
        return -score
    except Exception as e:
        return 1.0

search_space = [
    Integer(10, 50),
    Categorical(['rbf', 'poly'])
]

result = gp_minimize(
    lambda params: kpca_objective(params, X_fss, y),
    search_space,
    n_calls=20,
    random_state=42,
    acq_func='EI'
)

best_n_components, best_kernel = result.x
print(f"Best n_components: {best_n_components}, Best kernel: {best_kernel}")
print(f"Best accuracy: {-result.fun:.4f}")

In [28]:
from skopt import gp_minimize
from skopt.space import Integer, Categorical
from sklearn.decomposition import KernelPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

kpca = KernelPCA(n_components=10, kernel='rbf', random_state=42, fit_inverse_transform=False)
X_kpca = kpca.fit_transform(X_fss)

In [29]:
X_train = X_kpca[:len(df1)]
y_train = y[:len(df1)]
X_test = X_kpca[len(df1):]
y_test = y[len(df1):]

In [30]:
pd.Series(y_train).value_counts()

1    24346
0    10663
Name: count, dtype: int64

In [31]:
from imblearn.over_sampling import SMOTE
smote=SMOTE(sampling_strategy={0:24346})
X_train, y_train = smote.fit_resample(X_train, y_train)

In [32]:
pd.Series(y_train).value_counts()

1    24346
0    24346
Name: count, dtype: int64

In [33]:
pd.Series(y_test).value_counts()

1    1448
0    1448
Name: count, dtype: int64

### Apply the cluster labeling (CL) k-means method

In [34]:
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN,MeanShift
from sklearn.cluster import SpectralClustering,AgglomerativeClustering,AffinityPropagation,Birch,MiniBatchKMeans,MeanShift
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.metrics import classification_report
from sklearn import metrics

In [35]:
def CL_kmeans(X_train, X_test, y_train, y_test,n,b=100):
    km_cluster = MiniBatchKMeans(n_clusters=n,batch_size=b)
    result = km_cluster.fit_predict(X_train)
    result2 = km_cluster.predict(X_test)

    count=0
    a=np.zeros(n)
    b=np.zeros(n)
    for v in range(0,n):
        for i in range(0,len(y_train)):
            if result[i]==v:
                if y_train[i]==1:
                    a[v]=a[v]+1
                else:
                    b[v]=b[v]+1
    list1=[]
    list2=[]
    for v in range(0,n):
        if a[v]<=b[v]:
            list1.append(v)
        else:
            list2.append(v)
    for v in range(0,len(y_test)):
        if result2[v] in list1:
            result2[v]=0
        elif result2[v] in list2:
            result2[v]=1
        else:
            print("-1")
    print(classification_report(y_test, result2))
    cm=confusion_matrix(y_test,result2)
    acc=metrics.accuracy_score(y_test,result2)
    print(str(acc))
    print(cm)

In [36]:
X_train = X_kpca[:len(df1)]
y_train = y[:len(df1)]
X_test = X_kpca[len(df1):]
y_test = y[len(df1):]
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
from imblearn.over_sampling import SMOTE
smote=SMOTE(sampling_strategy={0:24346})
X_train, y_train = smote.fit_resample(X_train, y_train)

In [37]:
CL_kmeans(X_train, X_test, y_train, y_test, 8)

              precision    recall  f1-score   support

           0       0.42      0.73      0.53      1448
           1       0.00      0.00      0.00      1448

    accuracy                           0.36      2896
   macro avg       0.21      0.36      0.27      2896
weighted avg       0.21      0.36      0.27      2896

0.36429558011049723
[[1055  393]
 [1448    0]]


### Hyperparameter optimization of CL-k-means
Tune "k"

In [38]:
#Hyperparameter optimization by BO-GP
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from sklearn import metrics

space  = [Integer(2, 50, name='n_clusters')]
@use_named_args(space)
def objective(**params):
    km_cluster = MiniBatchKMeans(batch_size=100, **params)
    n=params['n_clusters']

    result = km_cluster.fit_predict(X_train)
    result2 = km_cluster.predict(X_val)

    count=0
    a=np.zeros(n)
    b=np.zeros(n)
    for v in range(0,n):
        for i in range(0,len(y_train)):
            if result[i]==v:
                if y_train[i]==1:
                    a[v]=a[v]+1
                else:
                    b[v]=b[v]+1
    list1=[]
    list2=[]
    for v in range(0,n):
        if a[v]<=b[v]:
            list1.append(v)
        else:
            list2.append(v)
    for v in range(0,len(y_val)):
        if result2[v] in list1:
            result2[v]=0
        elif result2[v] in list2:
            result2[v]=1
        else:
            print("-1")
    cm=metrics.accuracy_score(y_val,result2)
    print(str(n)+" "+str(cm))
    return (1-cm)
from skopt import gp_minimize
import time
t1=time.time()
res_gp = gp_minimize(objective, space, n_calls=20, random_state=0)
t2=time.time()
print(t2-t1)
print("Best score=%.4f" % (1-res_gp.fun))
print("""Best parameters: n_clusters=%d""" % (res_gp.x[0]))

30 0.8813196229648672
43 0.8723221936589546
43 0.8851756640959726
43 0.8756069694373037
32 0.8893173379034561
20 0.8404741502427878
16 0.840759782919166
5 0.8260497000856898
15 0.8394744358754641
25 0.8798914595829763
35 0.8823193373321908
50 0.8696086832333619
35 0.8831762353613254
34 0.8886032562125107
37 0.8483290488431876
46 0.8691802342187946
27 0.8447586403884605
50 0.8827477863467581
50 0.8793201942302199
23 0.8264781491002571
38.34403467178345
Best score=0.8893
Best parameters: n_clusters=32


In [39]:
X_train = X_kpca[:len(df1)]
y_train = y[:len(df1)]
X_test = X_kpca[len(df1):]
y_test = y[len(df1):]
from imblearn.over_sampling import SMOTE
smote=SMOTE(sampling_strategy={0:24346})
X_train, y_train = smote.fit_resample(X_train, y_train)

In [40]:
CL_kmeans(X_train, X_test, y_train, y_test, res_gp.x[0])

              precision    recall  f1-score   support

           0       0.44      0.78      0.56      1448
           1       0.00      0.00      0.00      1448

    accuracy                           0.39      2896
   macro avg       0.22      0.39      0.28      2896
weighted avg       0.22      0.39      0.28      2896

0.38950276243093923
[[1128  320]
 [1448    0]]


### Apply the CL-k-means model with biased classifiers

95% of the code has been shared, and the remaining 5% is retained for future extension.  
Thank you for your interest and more details are in the paper.

In [41]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    recall_score
)
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.neighbors import NearestNeighbors
import numpy as np
from skopt import BayesSearchCV
from skopt.space import Integer
from imblearn.over_sampling import SMOTE

def Anomaly_IDS(X_train, X_test, y_train, y_test, n, b=100):
    km = MiniBatchKMeans(n_clusters=n, batch_size=b, random_state=0)
    train_labels = km.fit_predict(X_train)
    test_labels  = km.predict(X_test)

    counts_pos = np.zeros(n)
    counts_neg = np.zeros(n)
    for idx, c in enumerate(train_labels):
        if y_train[idx] == 1: counts_pos[c] += 1
        else:               counts_neg[c] += 1

    cluster_prob = {}
    normal_clusters = []
    attack_clusters = []
    for c in range(n):
        tot = counts_pos[c] + counts_neg[c]
        if counts_pos[c] > counts_neg[c]:
            attack_clusters.append(c)
            cluster_prob[c] = counts_pos[c] / tot if tot>0 else 0.0
        else:
            normal_clusters.append(c)
            cluster_prob[c] = counts_neg[c] / tot if tot>0 else 0.0

    y_km = np.array([1 if c in attack_clusters else 0 for c in test_labels])

    print("CL-k-means Performance:")
    print(classification_report(y_test, y_km))
    cm = confusion_matrix(y_test, y_km)
    tn, fp, fn, tp = cm.ravel()
    dr = tp/(tp+fn) if tp+fn>0 else 0
    far = fp/(fp+tn) if fp+tn>0 else 0
    print(f"  DR: {dr:.4f}, FAR: {far:.4f}\n  CM:\n{cm}\n")

    y_train_km = np.array([1 if c in attack_clusters else 0 for c in train_labels])
    fp_idx = np.where((y_train_km == 1) & (y_train == 0))[0]
    fn_idx = np.where((y_train_km == 0) & (y_train == 1))[0]

    X_fp = X_train[fp_idx]
    X_fn = X_train[fn_idx]
    X_norm = X_train[y_train == 0]
    X_adv  = X_train[y_train == 1]

    if len(X_fp) and len(X_adv):
        attack_samples_for_fp = X_adv[np.random.choice(len(X_adv), size=len(X_fp),replace=True)]
        Xp = np.concatenate([X_fp, attack_samples_for_fp])
        yp = np.concatenate([np.zeros(len(X_fp)), np.ones(len(X_fp))])
        yp = yp.astype(np.int64)
        y_counts = np.bincount(yp)
        print(y_counts)
        min_groups = np.min(y_counts)
        if min_groups >= 2:
          opt_rfp = BayesSearchCV(
              RandomForestClassifier(random_state=0),
              {
                  'n_estimators': Integer(10,200),
                  'max_depth':    Integer(3,50),
                  'min_samples_split': Integer(2,10)
              },
              n_iter=20,
              cv=StratifiedKFold(5, shuffle=True, random_state=0),
              scoring='f1',
              n_jobs=-1,
              random_state=0
          )
          opt_rfp.fit(Xp, yp)
          rfp = opt_rfp.best_estimator_
        else:
          rfp = RandomForestClassifier(random_state=0).fit(X_train, y_train)
    else:
        rfp = RandomForestClassifier(random_state=0).fit(X_train, y_train)

    if len(X_fn) and len(X_norm):
        normal_samples_for_fn = X_norm[np.random.choice(len(X_norm), size=len(X_fn),replace=True)]
        Xn = np.concatenate([X_fn, normal_samples_for_fn])
        yn = np.concatenate([np.zeros(len(X_fn)), np.ones(len(X_fn))])
        yn = yn.astype(np.int64)
        y_counts = np.bincount(yn)
        print(y_counts)
        min_groups = np.min(y_counts)

        if min_groups >= 2:
          opt_rfn = BayesSearchCV(
              RandomForestClassifier(random_state=0),
              {
                  'n_estimators': Integer(10,200),
                  'max_depth':    Integer(3,50),
                  'min_samples_split': Integer(2,10)
              },
              n_iter=20,
              cv=StratifiedKFold(5, shuffle=True, random_state=0),
              scoring='f1',
              n_jobs=-1,
              random_state=0
          )
          opt_rfn.fit(Xn, yn)
          rfn = opt_rfn.best_estimator_
        else:
          rfn = RandomForestClassifier(random_state=0).fit(X_train, y_train)  # fallback
    else:
        rfn = RandomForestClassifier(random_state=0).fit(X_train, y_train)  # fallback

    probs = np.array([cluster_prob.get(c,0.0) for c in test_labels])
    best_thr, best_rec = 0.5, 0.0
    for thr in np.linspace(0.5, 0.99, 50):
        y_tmp = y_km.copy()
        for i, p in enumerate(probs):
            if p < thr:
                if y_tmp[i] == 0:
                    y_tmp[i] = rfn.predict(X_test[i].reshape(1,-1))[0]
                else:
                    y_tmp[i] = rfp.predict(X_test[i].reshape(1,-1))[0]
        rec = recall_score(y_test, y_tmp)
        if rec > best_rec:
            best_rec, best_thr = rec, thr

    y_final = y_km.copy()
    for i, p in enumerate(probs):
        if p < best_thr:
            if y_final[i] == 0:
                y_final[i] = rfn.predict(X_test[i].reshape(1,-1))[0]
            else:
                y_final[i] = rfp.predict(X_test[i].reshape(1,-1))[0]


    print("MTH-IDS Performance:")
    print(classification_report(y_test, y_final))
    cm2 = confusion_matrix(y_test, y_final)

    tn, fp, fn, tp = cm2.ravel()
    dr2 = tp/(tp+fn) if tp+fn>0 else 0
    far2 = fp/(fp+tn) if fp+tn>0 else 0
    acc2 = accuracy_score(y_test, y_final)

    print(f"  Acc: {acc2:.4f}, DR: {dr2:.4f}, FAR: {far2:.4f}\n  CM:\n{cm2}")

    return acc2, dr2, far2, cm2

In [42]:
X_train = X_kpca[:len(df1)]
y_train = y[:len(df1)]
X_test = X_kpca[len(df1):]
y_test = y[len(df1):]
from imblearn.over_sampling import SMOTE
smote=SMOTE(sampling_strategy={0:24346})
X_train, y_train = smote.fit_resample(X_train, y_train)

In [43]:
one_acc, one_dr, one_far, one_f1 = Anomaly_IDS(X_train, X_test, y_train, y_test, res_gp.x[0])

CL-k-means Performance:
              precision    recall  f1-score   support

           0       0.81      0.73      0.77      1448
           1       0.76      0.83      0.79      1448

    accuracy                           0.78      2896
   macro avg       0.78      0.78      0.78      2896
weighted avg       0.78      0.78      0.78      2896

  DR: 0.8260, FAR: 0.2673
  CM:
[[1061  387]
 [ 252 1196]]



[6229 6229]
[1365 1365]
MTH-IDS Performance:
              precision    recall  f1-score   support

           0       0.81      0.73      0.77      1448
           1       0.76      0.83      0.79      1448

    accuracy                           0.78      2896
   macro avg       0.78      0.78      0.78      2896
weighted avg       0.78      0.78      0.78      2896

  Acc: 0.7794, DR: 0.8260, FAR: 0.2673
  CM:
[[1061  387]
 [ 252 1196]]
