In [14]:
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import os
from sklearn.utils import shuffle
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np
from sklearn.pipeline import Pipeline
import joblib
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score,roc_auc_score,roc_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.model_selection import cross_val_score, learning_curve

#### Loading the dataset

In [15]:
# Root folder holding one subfolder per traffic class; each subfolder contains CSV files.
directory_path = 'Dataset/new_feature_csv'
regular_data = []
attack_data = []

# os.listdir() returns entries in arbitrary, filesystem-dependent order. Sorting both
# levels fixes the row order of the concatenated frames, which the seeded
# shuffles/splits further down depend on for reproducibility.
for subfolder in sorted(os.listdir(directory_path)):
    subfolder_path = os.path.join(directory_path, subfolder)
    if not os.path.isdir(subfolder_path):
        continue

    # Label convention: -1 for the 'Regular' subfolder, 1 for every other subfolder (attack).
    is_regular = subfolder == 'Regular'
    label = -1 if is_regular else 1
    bucket = regular_data if is_regular else attack_data

    for filename in sorted(os.listdir(subfolder_path)):
        if not filename.endswith(".csv"):
            continue
        file_path = os.path.join(subfolder_path, filename)
        df = pd.read_csv(file_path)
        df['Label'] = label
        bucket.append(df)

# pd.concat raises an opaque "No objects to concatenate" ValueError on an empty list;
# fail with the same exception type but a message that points at the actual cause.
if not regular_data or not attack_data:
    raise ValueError(
        f"Expected CSV files for both regular and attack classes under {directory_path!r}; "
        f"found {len(regular_data)} regular and {len(attack_data)} attack file(s)."
    )

all_regular_raw_data = pd.concat(regular_data, ignore_index=True)
all_attack_raw_data = pd.concat(attack_data, ignore_index=True)

#### Data Preprocessing

In [None]:
# Shuffle the rows of each class independently, using the same fixed seed for both.
all_regular_data_sh, all_attack_data_sh = (
    shuffle(raw_frame, random_state=42)
    for raw_frame in (all_regular_raw_data, all_attack_raw_data)
)

In [18]:
# Regular rows: 60% go to training; the remaining 40% is halved into validation and test.
X_train_regular, X_remaining_regular = train_test_split(
    all_regular_data_sh,
    test_size=0.40,
    random_state=42,
)
X_validation_regular, X_test_regular = train_test_split(
    X_remaining_regular,
    test_size=0.50,
    random_state=42,
)

# Attack rows: 20% are set aside for validation, 80% are kept for the test set.
# No attack rows enter the training split.
X_validation_attack, X_remaining_attack = train_test_split(
    all_attack_data_sh,
    test_size=0.80,
    random_state=42,
)

# Test set = held-out regular rows followed by held-out attack rows, then shuffled
# so the two classes are interleaved.
X_test_combined = pd.concat(
    [X_test_regular, X_remaining_attack],
    ignore_index=True,
)
X_test_combined_sh = shuffle(X_test_combined, random_state=42)

#### Imputation and Normalization

In [19]:
# Separate the 'Label' target column from the feature columns for the
# training split and for the shuffled combined test split.
X_tr, y_tr = X_train_regular.drop(columns='Label'), X_train_regular["Label"]
X_te, y_te = X_test_combined_sh.drop(columns='Label'), X_test_combined_sh["Label"]

In [20]:
# Preprocessing chain, applied in this order:
#   1. 'scaler'  - standardise each feature
#   2. 'imputer' - replace missing values with the per-column median
#   3. 'pca'     - PCA with n_components=0.95 (a float in (0, 1) is a
#                  target fraction of explained variance in scikit-learn)
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('imputer', SimpleImputer(strategy='median')),
        ('pca', PCA(n_components=0.95)),
    ]
)

In [21]:
# Fit the preprocessing pipeline on the training features only, then apply the
# already-fitted pipeline to the test features (the test set is never used for fitting).
# NOTE(review): the model used below is loaded from disk rather than trained here;
# this presumably assumes it was trained on features from an identically fitted
# pipeline - confirm against the training notebook.
X_train_pca = pipeline.fit_transform(X_tr)
X_test_pca = pipeline.transform(X_te)

#### Examine the Results

##### Loading Pre-trained Model

In [22]:
# Load the previously trained model from a file path relative to the working directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code -
# only load model files from a trusted source.
best_model_filename = "best_isolation_forest_50_0.75_0.01_0.5.pkl"
best_model = joblib.load(best_model_filename)

#### Performance Evaluation Metrics

In [None]:
def add_noise(X, noise_level, random_state=None):
    """Return ``X`` plus element-wise zero-mean Gaussian noise.

    ``X`` itself is not modified; the sum is a new object.

    Parameters
    ----------
    X : array-like
        Data to perturb. Must expose ``.shape`` and support ``+`` with a
        NumPy array of that shape.
    noise_level : float
        Standard deviation of the Gaussian noise added to every element.
    random_state : int or None, optional
        Seed for a private ``numpy.random.Generator`` so the result is
        reproducible regardless of global state. When ``None`` (default) the
        global NumPy RNG is used, exactly as before, so ``np.random.seed``
        still controls the output.

    Returns
    -------
    The result of ``X + noise``, with the same shape as ``X``.
    """
    # Fall back to the module-level RNG to stay backward-compatible with
    # callers that seed via np.random.seed().
    rng = np.random if random_state is None else np.random.default_rng(random_state)
    noise = rng.normal(0, noise_level, X.shape)
    return X + noise
def add_drift(X, shift_factor, sdv=0.1, random_state=None):
    """Return ``X`` plus element-wise Gaussian drift centred on ``shift_factor``.

    Every element is offset by a value drawn from a normal distribution with
    mean ``shift_factor`` and standard deviation ``sdv``. ``X`` itself is not
    modified; the sum is a new object.

    Parameters
    ----------
    X : array-like
        Data to shift. Must expose ``.shape`` and support ``+`` with a NumPy
        array of that shape.
    shift_factor : float
        Mean of the drift added to every element.
    sdv : float, optional
        Standard deviation of the drift. Defaults to 0.1, the value that was
        previously hard-coded.
    random_state : int or None, optional
        Seed for a private ``numpy.random.Generator`` so the result is
        reproducible regardless of global state. When ``None`` (default) the
        global NumPy RNG is used, exactly as before, so ``np.random.seed``
        still controls the output.

    Returns
    -------
    The result of ``X + drift``, with the same shape as ``X``.
    """
    # Fall back to the module-level RNG to stay backward-compatible with
    # callers that seed via np.random.seed().
    rng = np.random if random_state is None else np.random.default_rng(random_state)
    drift = rng.normal(shift_factor, sdv, X.shape)
    return X + drift
def performance_evalidation(y, pred, scores=None):
    """Print classification metrics and the confusion matrix for ``pred`` against ``y``.

    Accuracy, precision, recall, F1 and ROC AUC are printed first, then the
    confusion matrix with rows/columns ordered as ``[1, -1]``. Precision,
    recall and F1 use scikit-learn's default positive label, ``1``.

    Parameters
    ----------
    y : array-like
        True labels, encoded as ``1`` / ``-1``.
    pred : array-like
        Predicted hard labels in the same encoding as ``y``.
    scores : array-like or None, optional
        Continuous scores where larger values mean "more likely class 1".
        When given, ROC AUC is computed from these scores. When ``None``
        (default) ROC AUC is computed from ``pred``, as before.

    Returns
    -------
    None
        The metrics are printed only.
    """
    accuracy = accuracy_score(y, pred)
    precision = precision_score(y, pred)
    recall = recall_score(y, pred)
    f1 = f1_score(y, pred)
    # With hard labels the ROC curve has a single operating point, so the
    # "AUC" is a threshold-dependent summary. Pass `scores` to get a
    # threshold-independent value.
    roc_auc = roc_auc_score(y, pred if scores is None else scores)

    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')
    print(f'ROC AUC: {roc_auc}')

    # labels=[1, -1] puts class 1 in the first row/column.
    conf_matrix = confusion_matrix(y, pred, labels=[1, -1])
    print('Confusion Matrix:')
    print(conf_matrix)

##### Performance Metrics

In [None]:
# Negate the model's predictions so they use this notebook's label encoding
# (1 = attack, -1 = Regular) before scoring them against the test labels.
# NOTE(review): presumably best_model is an IsolationForest, whose predict()
# yields -1 for anomalies and +1 for inliers - confirm the model type.
raw_test_pred = best_model.predict(X_test_pca)
y_te_pred = -raw_test_pred
performance_evalidation(y_te, y_te_pred)

Accuracy: 0.9984492863758192
Precision: 0.9981545046047471
Recall: 1.0
F1 Score: 0.9990764000526485
ROC AUC: 0.9951925141413865
Confusion Matrix:
[[1332138       0]
 [   2463  253700]]


##### Robustness to Noise

In [None]:
noise_level_isof = 0.1
# add_noise draws from NumPy's global RNG; seed it here so the perturbed test
# set, and therefore the metrics printed by this cell, are the same on every run.
np.random.seed(42)
X_te_noisy_isof = add_noise(X_test_pca, noise_level_isof)
# Negate so predictions use the notebook's label encoding (1 = attack, -1 = Regular).
y_isof_pred_noisy = -best_model.predict(X_te_noisy_isof)
performance_evalidation(y_te, y_isof_pred_noisy)

Accuracy: 0.9982471836257737
Precision: 0.9979144848912521
Recall: 1.0
F1 Score: 0.9989561539672899
ROC AUC: 0.9945659599551847
Confusion Matrix:
[[1332138       0]
 [   2784  253379]]


##### Robustness to Drift

In [None]:
shift_factor_isof = 0.1
# add_drift draws from NumPy's global RNG; seed it here so the drifted test
# set, and therefore the metrics printed by this cell, are the same on every run.
np.random.seed(42)
X_te_drifted_isof = add_drift(X_test_pca, shift_factor_isof)
# Negate so predictions use the notebook's label encoding (1 = attack, -1 = Regular).
y_isof_pred_drifted = -best_model.predict(X_te_drifted_isof)
performance_evalidation(y_te, y_isof_pred_drifted)

Accuracy: 0.9980759314512804
Precision: 0.9977111940287329
Recall: 1.0
F1 Score: 0.998854285855679
ROC AUC: 0.9940350479967832
Confusion Matrix:
[[1332138       0]
 [   3056  253107]]
