In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

In [None]:

train_data = pd.read_csv('/Users/marlenawasiak/Desktop/Data_Collection/NSL_KDD_Train.csv')
test_data = pd.read_csv('/Users/marlenawasiak/Desktop/Data_Collection/NSL_KDD_Test.csv')

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import pandas as pd
from collections import Counter

X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]
X_test = test_data.iloc[:, :-1]
y_test = test_data.iloc[:, -1]

common_columns = X_train.columns.intersection(X_test.columns)
X_train = X_train[common_columns]
X_test = X_test[common_columns]

categorical_columns = X_train.select_dtypes(include=['object']).columns

for col in categorical_columns:
    combined_categories = pd.concat([X_train[col], X_test[col]], axis=0).astype("category").cat.categories
    X_train[col] = pd.Categorical(X_train[col], categories=combined_categories)
    X_test[col] = pd.Categorical(X_test[col], categories=combined_categories)
    
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

all_labels = pd.concat([y_train, y_test], axis=0)
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

class_counts = Counter(y_train_encoded)
min_class_size = min(class_counts.values())
smote = SMOTE(random_state=42, k_neighbors=min(min_class_size - 1, 5))

X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train_encoded)

print(f"Original training data shape: {X_train_scaled.shape}, {y_train_encoded.shape}")
print(f"Resampled training data shape: {X_train_resampled.shape}, {y_train_resampled.shape}")


Label classes: ['apache2' 'back' 'buffer_overflow' 'ftp_write' 'guess_passwd'
 'httptunnel' 'imap' 'ipsweep' 'land' 'loadmodule' 'mailbomb' 'mscan'
 'multihop' 'named' 'neptune' 'nmap' 'normal' 'perl' 'phf' 'pod'
 'portsweep' 'processtable' 'ps' 'rootkit' 'saint' 'satan' 'sendmail'
 'smurf' 'snmpgetattack' 'snmpguess' 'spy' 'sqlattack' 'teardrop'
 'udpstorm' 'warezclient' 'warezmaster' 'worm' 'xlock' 'xsnoop' 'xterm']
Original training data shape: (125972, 28), (125972,)
Resampled training data shape: (1548866, 28), (1548866,)
Model training complete.

Classification Report:
                 precision    recall  f1-score   support

        apache2       1.00      0.00      0.00       737
           back       1.00      0.00      0.00       359
buffer_overflow       1.00      0.00      0.00        20
      ftp_write       1.00      0.00      0.00         3
   guess_passwd       1.00      0.00      0.00      1231
     httptunnel       1.00      0.00      0.00       133
           imap   

In [None]:

rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=50,
    min_samples_split=5,
    min_samples_leaf=1,
    random_state=42
)
rf_model.fit(X_train_resampled, y_train_resampled)
print("Model training complete.")
y_pred = rf_model.predict(X_test_scaled)

y_pred_labels = label_encoder.inverse_transform(y_pred)
y_test_labels = label_encoder.inverse_transform(y_test_encoded)

print("\nClassification Report:")
print(classification_report(y_test_labels, y_pred_labels, zero_division=1))


Model training complete.

Classification Report:
                 precision    recall  f1-score   support

        apache2       1.00      0.00      0.00       737
           back       1.00      0.00      0.00       359
buffer_overflow       1.00      0.00      0.00        20
      ftp_write       1.00      0.00      0.00         3
   guess_passwd       1.00      0.00      0.00      1231
     httptunnel       1.00      0.00      0.00       133
           imap       1.00      0.00      0.00         1
        ipsweep       1.00      0.00      0.00       141
           land       0.00      0.00      0.00         7
     loadmodule       1.00      0.00      0.00         2
       mailbomb       1.00      0.00      0.00       293
          mscan       1.00      0.00      0.00       996
       multihop       1.00      0.00      0.00        18
          named       1.00      0.00      0.00        17
        neptune       0.73      0.32      0.45      4656
           nmap       0.00      0.00  

In [None]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

critical_features = [2, 1, 8, 23, 20, 26] 

def targeted_attack(X, critical_features, epsilon=3.0):
    """
    Apply a targeted perturbation attack on the dataset.
    
    Parameters:
    X (array): The input dataset.
    critical_features (list): Indices of critical features to perturb.
    epsilon (float): The perturbation amount.

    Returns:
    X_perturbed (array): The perturbed dataset.
    """
    X_perturbed = X.copy()
    for feature_idx in critical_features:
        perturbation = np.random.uniform(-epsilon, epsilon, size=X_perturbed.shape[0])
        X_perturbed[:, feature_idx] += perturbation
    return X_perturbed

X_test_perturbed = targeted_attack(X_test_scaled, critical_features, epsilon=3.0)

y_pred_perturbed = rf_model.predict(X_test_perturbed)

y_pred_perturbed_labels = label_encoder.inverse_transform(y_pred_perturbed)

print("\nClassification Report After Targeted Attack:")
print(classification_report(y_test_labels, y_pred_perturbed_labels, zero_division=1))



Classification Report After Targeted Attack:
                 precision    recall  f1-score   support

        apache2       1.00      0.00      0.00       737
           back       1.00      0.00      0.00       359
buffer_overflow       1.00      0.00      0.00        20
      ftp_write       1.00      0.00      0.00         3
   guess_passwd       1.00      0.00      0.00      1231
     httptunnel       1.00      0.00      0.00       133
           imap       1.00      0.00      0.00         1
        ipsweep       0.00      0.00      0.00       141
           land       0.00      0.00      0.00         7
     loadmodule       1.00      0.00      0.00         2
       mailbomb       1.00      0.00      0.00       293
          mscan       1.00      0.00      0.00       996
       multihop       0.00      0.00      0.00        18
          named       1.00      0.00      0.00        17
        neptune       0.60      0.06      0.11      4656
           nmap       0.00      0.00     

In [6]:
import numpy as np
from art.estimators.classification import SklearnClassifier
from art.attacks.evasion import BoundaryAttack
from art.attacks.evasion import HopSkipJump



In [None]:
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

from collections import Counter
class_counts = Counter(y_train_encoded)
min_class_size = min(class_counts.values())
smote = SMOTE(random_state=42, k_neighbors=min(min_class_size - 1, 5))

X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train_encoded)

print(f"Original training data shape: {X_train_scaled.shape}, {y_train_encoded.shape}")
print(f"Resampled training data shape: {X_train_resampled.shape}, {y_train_resampled.shape}")

dtrain = xgb.DMatrix(X_train_resampled, label=y_train_resampled)
dtest = xgb.DMatrix(X_test_scaled, label=y_test_encoded)

params = {
    'objective': 'multi:softmax',  
    'num_class': len(label_encoder.classes_), 
    'eval_metric': 'mlogloss',  
    'max_depth': 20,  
    'eta': 0.1,  
    'subsample': 0.8,  
    'colsample_bytree': 0.8,  
    'seed': 42  
}

num_round = 100  
bst = xgb.train(params, dtrain, num_round)
y_pred = bst.predict(dtest)
y_pred = y_pred.astype(int)  
unique_labels = np.unique(np.concatenate([y_test_encoded, y_pred]))
unique_labels = unique_labels[unique_labels < len(label_encoder.classes_)]

filtered_target_names = [label_encoder.classes_[i] for i in unique_labels]

print("Accuracy:", accuracy_score(y_test_encoded, y_pred))
print("Classification Report:\n", classification_report(
    y_test_encoded, y_pred, labels=unique_labels, target_names=filtered_target_names, zero_division=1
))


Original training data shape: (125972, 28), (125972,)
Resampled training data shape: (1548866, 28), (1548866,)
Accuracy: 0.11453666326575877
Classification Report:
                  precision    recall  f1-score   support

        apache2       1.00      0.00      0.00       737
           back       1.00      0.00      0.00       359
buffer_overflow       1.00      0.00      0.00        20
      ftp_write       0.00      0.00      0.00         3
   guess_passwd       1.00      0.00      0.00      1231
     httptunnel       1.00      0.00      0.00       133
           imap       0.00      0.00      0.00         1
        ipsweep       1.00      0.00      0.00       141
           land       0.00      0.00      0.00         7
     loadmodule       1.00      0.00      0.00         2
       mailbomb       1.00      0.00      0.00       293
          mscan       1.00      0.00      0.00       996
       multihop       0.00      0.00      0.00        18
          named       1.00      0.00

In [None]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
critical_feature_indices = [2, 1, 8, 3, 24, 26]
def stronger_perturb_features(X, feature_indices, noise_factor=0.5, target_fraction=0.5):
    """
    Apply amplified random noise to a subset of critical features for a stronger attack.
    Args:
        X: The feature matrix (numpy array).
        feature_indices: List of indices for critical features to perturb.
        noise_factor: The scale of the noise to apply.
        target_fraction: Fraction of samples to perturb.
    Returns:
        X_perturbed: The perturbed feature matrix.
    """
    X_perturbed = X.copy()
    num_samples = X.shape[0]
    attack_samples = np.random.choice(
        num_samples, size=int(num_samples * target_fraction), replace=False
    )
    
    for idx in feature_indices:
        noise = np.random.normal(loc=0, scale=noise_factor, size=len(attack_samples))
        X_perturbed[attack_samples, idx] += noise
    return X_perturbed

X_test_stronger_perturbed = stronger_perturb_features(
    X_test_scaled, critical_feature_indices, noise_factor=5.5, target_fraction=1.0
)

dtest_stronger_perturbed = xgb.DMatrix(X_test_stronger_perturbed)
y_pred_stronger_perturbed = bst.predict(dtest_stronger_perturbed)
y_pred_stronger_perturbed = y_pred_stronger_perturbed.astype(int)

unique_labels = np.unique(np.concatenate([y_test_encoded, y_pred_stronger_perturbed]))
unique_labels = unique_labels[unique_labels < len(label_encoder.classes_)]

filtered_target_names = [label_encoder.classes_[i] for i in unique_labels]

print("Accuracy After Stronger Attack:", accuracy_score(y_test_encoded, y_pred_stronger_perturbed))
print("Classification Report After Stronger Attack:\n", classification_report(
    y_test_encoded, y_pred_stronger_perturbed, labels=unique_labels, target_names=filtered_target_names, zero_division=1))
print("Confusion Matrix After Stronger Targeted Attack:\n", confusion_matrix(y_test_encoded, y_pred_stronger_perturbed))


Accuracy After Stronger Attack: 0.17371246063079449
Classification Report After Stronger Attack:
                  precision    recall  f1-score   support

        apache2       1.00      0.00      0.00       737
           back       1.00      0.00      0.00       359
buffer_overflow       0.00      0.00      0.00        20
      ftp_write       0.00      0.00      0.00         3
   guess_passwd       0.00      0.00      0.00      1231
     httptunnel       1.00      0.00      0.00       133
           imap       0.00      0.00      0.00         1
        ipsweep       0.00      0.00      0.00       141
           land       0.00      0.00      0.00         7
     loadmodule       0.33      0.50      0.40         2
       mailbomb       1.00      0.00      0.00       293
          mscan       1.00      0.00      0.00       996
       multihop       0.01      0.06      0.01        18
          named       1.00      0.00      0.00        17
        neptune       0.75      0.18      0.29