In [83]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import os
from sklearn.utils import shuffle
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from sklearn.pipeline import Pipeline

#### Loading the dataset

In [84]:
directory_path = 'Dataset/new_feature_csv'
regular_data = []
attack_data = []
# Read every CSV found one level below directory_path. Files in the 'Regular'
# sub-folder are labelled -1; files in any other sub-folder are labelled +1.
# os.listdir returns entries in arbitrary (filesystem-dependent) order, so the
# listings are sorted: the concatenated row order, and therefore the seeded
# shuffles/splits performed later, is then the same on every machine.
for subfolder in sorted(os.listdir(directory_path)):
    subfolder_path = os.path.join(directory_path, subfolder)
    if not os.path.isdir(subfolder_path):
        continue
    for filename in sorted(os.listdir(subfolder_path)):
        if not filename.endswith(".csv"):
            continue
        file_path = os.path.join(subfolder_path, filename)
        df = pd.read_csv(file_path)
        if subfolder == 'Regular':
            df['Label'] = -1
            regular_data.append(df)
        else:
            df['Label'] = 1
            attack_data.append(df)
# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when one of the two classes is missing.
if not regular_data:
    raise ValueError(f"No 'Regular' CSV files found under {directory_path!r}")
if not attack_data:
    raise ValueError(f"No attack CSV files found under {directory_path!r}")
all_regular_raw_data = pd.concat(regular_data, ignore_index=True)
all_attack_raw_data = pd.concat(attack_data, ignore_index=True)

#### Data Pre-processing

In [None]:
# Permute the rows of each class with a fixed seed so the ordering is repeatable.
all_regular_data_sh, all_attack_data_sh = (
    shuffle(frame, random_state=42)
    for frame in (all_regular_raw_data, all_attack_raw_data)
)

In [None]:
# Attack traffic is kept out of training: 20% goes to validation, 80% to test.
X_validation_attack, X_remaining_attack = train_test_split(
    all_attack_data_sh, test_size=0.80, random_state=42
)

# Regular traffic: 60% train, then the remaining 40% is halved into
# validation and test.
X_train_regular, X_remaining_regular = train_test_split(
    all_regular_data_sh, test_size=0.40, random_state=42
)
X_validation_regular, X_test_regular = train_test_split(
    X_remaining_regular, test_size=0.50, random_state=42
)

# Build the mixed (regular + attack) evaluation sets.
X_validation_combined = pd.concat(
    [X_validation_regular, X_validation_attack], ignore_index=True
)
X_test_combined = pd.concat(
    [X_test_regular, X_remaining_attack], ignore_index=True
)

# Interleave the two classes with a seeded shuffle.
X_validation_combined_sh = shuffle(X_validation_combined, random_state=42)
X_test_combined_sh = shuffle(X_test_combined, random_state=42)

#### Imputation and Normalization

In [None]:
# Separate the feature columns from the 'Label' target for each split.
X_tr = X_train_regular.drop(columns='Label')
y_tr = X_train_regular["Label"]
X_va = X_validation_combined_sh.drop(columns='Label')
y_va = X_validation_combined_sh["Label"]
X_te = X_test_combined_sh.drop(columns='Label')
y_te = X_test_combined_sh["Label"]

# Standardize features: statistics are learned on the training split only and
# then re-used unchanged for the validation and test splits.
scaler = StandardScaler().fit(X_tr)
X_train_scaled, X_validation_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_tr, X_va, X_te)
)

In [89]:
# Fill missing values with per-feature medians learned on the training split.
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train_scaled)
X_validation_imputed, X_test_imputed = (
    imputer.transform(part) for part in (X_validation_scaled, X_test_scaled)
)

#### PCA

In [90]:
# Retain as many principal components as needed to explain 95% of the variance.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_imputed)
# Project the evaluation splits with the components learned on the training split.
X_validation_pca, X_test_pca = (
    pca.transform(part) for part in (X_validation_imputed, X_test_imputed)
)

In [91]:
# Show (n_samples, n_components) of the PCA-projected training set.
print(X_train_pca.shape)

(768488, 15)


#### Support Functions

In [None]:
def add_noise(X, noise_level, random_state=None):
    """Return a copy of ``X`` perturbed by zero-mean Gaussian noise.

    Parameters
    ----------
    X : array-like
        Data to perturb; the noise has the same shape as ``X``.
    noise_level : float
        Standard deviation of the Gaussian noise.
    random_state : int, numpy.random.Generator or None, default=None
        Seed (or generator) for reproducible noise. When ``None`` the
        global NumPy random state is used, so ``np.random.seed`` is honoured.

    Returns
    -------
    ``X + noise``; ``X`` itself is not modified.
    """
    # Keep the legacy global RNG as the default so existing callers behave
    # exactly as before; an explicit seed gets its own independent generator.
    rng = np.random if random_state is None else np.random.default_rng(random_state)
    noise = rng.normal(0, noise_level, np.shape(X))
    return X + noise
def add_drift(X, shift_factor, sdv=0.1, random_state=None):
    """Return a copy of ``X`` shifted by Gaussian drift centred on ``shift_factor``.

    Parameters
    ----------
    X : array-like
        Data to shift; the drift has the same shape as ``X``.
    shift_factor : float
        Mean of the drift added to every entry.
    sdv : float, default=0.1
        Standard deviation of the drift around ``shift_factor``.
    random_state : int, numpy.random.Generator or None, default=None
        Seed (or generator) for reproducible drift. When ``None`` the
        global NumPy random state is used, so ``np.random.seed`` is honoured.

    Returns
    -------
    ``X + drift``; ``X`` itself is not modified.
    """
    # Keep the legacy global RNG as the default so existing callers behave
    # exactly as before; an explicit seed gets its own independent generator.
    rng = np.random if random_state is None else np.random.default_rng(random_state)
    drift = rng.normal(shift_factor, sdv, np.shape(X))
    return X + drift

## IsolationForest

In [93]:
# Fit the Isolation Forest on the PCA-projected training split, which holds
# regular traffic only; the seed makes the fitted forest repeatable.
isof = IsolationForest(n_estimators=100, random_state=42)
isof.fit(X_train_pca)

In [94]:
# predict() yields +1 for inliers and -1 for outliers; flipping the sign maps
# it onto the dataset's encoding (-1 = Regular, +1 = Attack).
y_isof_pred = np.negative(isof.predict(X_validation_pca))
conf_matrix_isof = confusion_matrix(y_va, y_isof_pred)
print("Confusion Matrix:", conf_matrix_isof, sep="\n")
print(
    classification_report(
        y_va, y_isof_pred, target_names=['Regular','Attack']
    )
)

Confusion Matrix:
[[244203  11960]
 [ 10381 322653]]
              precision    recall  f1-score   support

     Regular       0.96      0.95      0.96    256163
      Attack       0.96      0.97      0.97    333034

    accuracy                           0.96    589197
   macro avg       0.96      0.96      0.96    589197
weighted avg       0.96      0.96      0.96    589197



### Test

#### Cross Validation

In [95]:
# PCA is placed inside the pipeline so it is re-fitted on each training fold.
pipeline_isof = Pipeline(steps=[
    ('pca', PCA(n_components=0.95)),
    ('iso', isof),
])
# y_tr contains only -1 (regular traffic); negated it equals +1, the value
# predict() returns for inliers, so 'accuracy' here is the share of held-out
# regular samples that the model accepts as inliers.
cv_scores_isof = cross_val_score(
    pipeline_isof,
    X_train_imputed,
    -y_tr,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
print(f"Cross-validation scores: {cv_scores_isof}")
print(f"Mean cross-validation score: {cv_scores_isof.mean()}")

Cross-validation scores: [0.95604361 0.95189001 0.95135891]
Mean cross-validation score: 0.9530975086942113


#### Robustness to Noise

In [None]:
noise_level_isof=0.1
# add_noise draws from NumPy's global random state; seed it so the perturbed
# validation set (and the metrics below) are identical on every run.
np.random.seed(42)
X_val_noisy_isof = add_noise(X_validation_pca, noise_level_isof)
# Negate predict() (+1 inlier / -1 outlier) to match the label encoding
# (-1 = Regular, +1 = Attack).
y_isof_pred_noisy = -isof.predict(X_val_noisy_isof)
conf_matrix_noisy_isof = confusion_matrix(y_va, y_isof_pred_noisy)
report_noisy_isof = classification_report(y_va, y_isof_pred_noisy, target_names=['Regular','Attack'])
print("Confusion Matrix:")
print(conf_matrix_noisy_isof)
print(report_noisy_isof)

Confusion Matrix:
[[241923  14240]
 [  8788 324246]]
              precision    recall  f1-score   support

     Regular       0.96      0.94      0.95    256163
      Attack       0.96      0.97      0.97    333034

    accuracy                           0.96    589197
   macro avg       0.96      0.96      0.96    589197
weighted avg       0.96      0.96      0.96    589197



#### Drifting Test

In [None]:
shift_factor_isof=0.1
# add_drift draws from NumPy's global random state; seed it so the drifted
# validation set (and the metrics below) are identical on every run.
np.random.seed(42)
X_val_drifted_isof = add_drift(X_validation_pca, shift_factor_isof)
# Negate predict() (+1 inlier / -1 outlier) to match the label encoding
# (-1 = Regular, +1 = Attack).
y_isof_pred_drifted = -isof.predict(X_val_drifted_isof)
conf_matrix_drifted_isof = confusion_matrix(y_va, y_isof_pred_drifted)
report_drifted_isof = classification_report(y_va, y_isof_pred_drifted, target_names=['Regular','Attack'])
print("Confusion Matrix:")
print(conf_matrix_drifted_isof)
print(report_drifted_isof)

Confusion Matrix:
[[239916  16247]
 [  7916 325118]]
              precision    recall  f1-score   support

     Regular       0.97      0.94      0.95    256163
      Attack       0.95      0.98      0.96    333034

    accuracy                           0.96    589197
   macro avg       0.96      0.96      0.96    589197
weighted avg       0.96      0.96      0.96    589197



## EllipticEnvelope

In [98]:
# Fit the Elliptic Envelope on the PCA-projected training split, which holds
# regular traffic only; the seed makes the fit repeatable.
ell = EllipticEnvelope(random_state=42)
ell.fit(X_train_pca)

In [99]:
# predict() yields +1 for inliers and -1 for outliers; flipping the sign maps
# it onto the dataset's encoding (-1 = Regular, +1 = Attack).
y_ell_pred = np.negative(ell.predict(X_validation_pca))
conf_matrix_ell = confusion_matrix(y_va, y_ell_pred)
print("Confusion Matrix:", conf_matrix_ell, sep="\n")
print(
    classification_report(
        y_va, y_ell_pred, target_names=['Regular','Attack']
    )
)

Confusion Matrix:
[[230483  25680]
 [ 26990 306044]]
              precision    recall  f1-score   support

     Regular       0.90      0.90      0.90    256163
      Attack       0.92      0.92      0.92    333034

    accuracy                           0.91    589197
   macro avg       0.91      0.91      0.91    589197
weighted avg       0.91      0.91      0.91    589197



### Test

#### Cross Validation

In [100]:
# PCA is placed inside the pipeline so it is re-fitted on each training fold.
# NOTE(review): the final step is still named 'iso' although it wraps the
# Elliptic Envelope; the key is left as-is so existing step lookups keep working.
pipeline_ell = Pipeline(steps=[
    ('pca', PCA(n_components=0.95)),
    ('iso', ell),
])
# y_tr contains only -1 (regular traffic); negated it equals +1, the value
# predict() returns for inliers, so 'accuracy' here is the share of held-out
# regular samples that the model accepts as inliers.
cv_scores_ell = cross_val_score(
    pipeline_ell,
    X_train_imputed,
    -y_tr,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
print(f"Cross-validation scores: {cv_scores_ell}")
print(f"Mean cross-validation score: {cv_scores_ell.mean()}")

Cross-validation scores: [0.90070385 0.89995042 0.90001249]
Mean cross-validation score: 0.9002222543360959


#### Robustness to Noise

In [None]:
noise_level_ell=0.1
# add_noise draws from NumPy's global random state; seed it so the perturbed
# validation set (and the metrics below) are identical on every run.
np.random.seed(42)
X_val_noisy_ell = add_noise(X_validation_pca, noise_level_ell)
# Negate predict() (+1 inlier / -1 outlier) to match the label encoding
# (-1 = Regular, +1 = Attack).
y_ell_pred_noisy = -ell.predict(X_val_noisy_ell)
conf_matrix_noisy_ell = confusion_matrix(y_va, y_ell_pred_noisy)
report_noisy_ell = classification_report(y_va, y_ell_pred_noisy, target_names=['Regular','Attack'])
print("Confusion Matrix:")
print(conf_matrix_noisy_ell)
print(report_noisy_ell)

Confusion Matrix:
[[   941 255222]
 [   103 332931]]
              precision    recall  f1-score   support

     Regular       0.90      0.00      0.01    256163
      Attack       0.57      1.00      0.72    333034

    accuracy                           0.57    589197
   macro avg       0.73      0.50      0.37    589197
weighted avg       0.71      0.57      0.41    589197



#### Drifting Test

In [None]:
shift_factor_ell=0.1
# add_drift draws from NumPy's global random state; seed it so the drifted
# validation set (and the metrics below) are identical on every run.
np.random.seed(42)
X_val_drifted_ell = add_drift(X_validation_pca, shift_factor_ell)
# Negate predict() (+1 inlier / -1 outlier) to match the label encoding
# (-1 = Regular, +1 = Attack).
y_ell_pred_drifted = -ell.predict(X_val_drifted_ell)
conf_matrix_drifted_ell = confusion_matrix(y_va, y_ell_pred_drifted)
report_drifted_ell = classification_report(y_va, y_ell_pred_drifted, target_names=['Regular','Attack'])
print("Confusion Matrix:")
print(conf_matrix_drifted_ell)
print(report_drifted_ell)

Confusion Matrix:
[[    81 256082]
 [     4 333030]]
              precision    recall  f1-score   support

     Regular       0.95      0.00      0.00    256163
      Attack       0.57      1.00      0.72    333034

    accuracy                           0.57    589197
   macro avg       0.76      0.50      0.36    589197
weighted avg       0.73      0.57      0.41    589197



## LocalOutlierFactor

In [103]:
# Fit the Local Outlier Factor on the PCA-projected training split, which holds
# regular traffic only. novelty=True is what allows predict() to be called on
# data other than the training set (as the evaluation cells do).
lof = LocalOutlierFactor(n_neighbors=20, novelty=True)
lof.fit(X_train_pca)

In [104]:
# predict() yields +1 for inliers and -1 for outliers; flipping the sign maps
# it onto the dataset's encoding (-1 = Regular, +1 = Attack).
y_lof_pred = np.negative(lof.predict(X_validation_pca))
conf_matrix_lof = confusion_matrix(y_va, y_lof_pred)
print("Confusion Matrix:", conf_matrix_lof, sep="\n")
print(
    classification_report(
        y_va, y_lof_pred, target_names=['Regular','Attack']
    )
)

Confusion Matrix:
[[251054   5109]
 [  1543 331491]]
              precision    recall  f1-score   support

     Regular       0.99      0.98      0.99    256163
      Attack       0.98      1.00      0.99    333034

    accuracy                           0.99    589197
   macro avg       0.99      0.99      0.99    589197
weighted avg       0.99      0.99      0.99    589197



### Test

#### Cross Validation

In [105]:
# PCA is placed inside the pipeline so it is re-fitted on each training fold.
# NOTE(review): the final step is still named 'iso' although it wraps the
# Local Outlier Factor; the key is left as-is so existing step lookups keep working.
pipeline_lof = Pipeline(steps=[
    ('pca', PCA(n_components=0.95)),
    ('iso', lof),
])
# y_tr contains only -1 (regular traffic); negated it equals +1, the value
# predict() returns for inliers, so 'accuracy' here is the share of held-out
# regular samples that the model accepts as inliers.
cv_scores_lof = cross_val_score(
    pipeline_lof,
    X_train_imputed,
    -y_tr,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
print(f"Cross-validation scores: {cv_scores_lof}")
print(f"Mean cross-validation score: {cv_scores_lof.mean()}")

Cross-validation scores: [0.97961064 0.9792632  0.98007511]
Mean cross-validation score: 0.9796496502553825


#### Robustness to Noise

In [None]:
noise_level_lof=0.1
# add_noise draws from NumPy's global random state; seed it so the perturbed
# validation set (and the metrics below) are identical on every run.
np.random.seed(42)
X_val_noisy_lof = add_noise(X_validation_pca, noise_level_lof)
# Negate predict() (+1 inlier / -1 outlier) to match the label encoding
# (-1 = Regular, +1 = Attack).
y_lof_pred_noisy = -lof.predict(X_val_noisy_lof)
conf_matrix_noisy_lof = confusion_matrix(y_va, y_lof_pred_noisy)
report_noisy_lof = classification_report(y_va, y_lof_pred_noisy, target_names=['Regular','Attack'])
print("Confusion Matrix:")
print(conf_matrix_noisy_lof)
print(report_noisy_lof)

Confusion Matrix:
[[203455  52708]
 [  1551 331483]]
              precision    recall  f1-score   support

     Regular       0.99      0.79      0.88    256163
      Attack       0.86      1.00      0.92    333034

    accuracy                           0.91    589197
   macro avg       0.93      0.89      0.90    589197
weighted avg       0.92      0.91      0.91    589197



#### Drifting Test

In [None]:
shift_factor_lof=0.1
# add_drift draws from NumPy's global random state; seed it so the drifted
# validation set (and the metrics below) are identical on every run.
np.random.seed(42)
X_val_drifted_lof = add_drift(X_validation_pca, shift_factor_lof)
# Negate predict() (+1 inlier / -1 outlier) to match the label encoding
# (-1 = Regular, +1 = Attack).
y_lof_pred_drifted = -lof.predict(X_val_drifted_lof)
conf_matrix_drifted_lof = confusion_matrix(y_va, y_lof_pred_drifted)
report_drifted_lof = classification_report(y_va, y_lof_pred_drifted, target_names=['Regular','Attack'])
print("Confusion Matrix:")
print(conf_matrix_drifted_lof)
print(report_drifted_lof)

Confusion Matrix:
[[128018 128145]
 [  1513 331521]]
              precision    recall  f1-score   support

     Regular       0.99      0.50      0.66    256163
      Attack       0.72      1.00      0.84    333034

    accuracy                           0.78    589197
   macro avg       0.85      0.75      0.75    589197
weighted avg       0.84      0.78      0.76    589197

