In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

In [3]:
# Step 1: Load the Data
# Replace 'path_to_train_file.csv' and 'path_to_test_file.csv' with the actual file paths
train_data = pd.read_csv('/Users/marlenawasiak/Desktop/Data_Collection/NSL_KDD_Train.csv')
test_data = pd.read_csv('/Users/marlenawasiak/Desktop/Data_Collection/NSL_KDD_Test.csv')

In [4]:

# Step 2: Separate Features and Labels
# Assuming the last column is the label and the rest are features
X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]
X_test = test_data.iloc[:, :-1]
y_test = test_data.iloc[:, -1]

# Step 3: Align Columns to Keep Only the Common Columns
common_columns = X_train.columns.intersection(X_test.columns)
X_train = X_train[common_columns]
X_test = X_test[common_columns]

# Step 4: Encode Categorical Features with Consistent Categories
categorical_columns = X_train.select_dtypes(include=['object']).columns

for col in categorical_columns:
    combined_categories = pd.concat([X_train[col], X_test[col]], axis=0).astype("category").cat.categories
    X_train[col] = pd.Categorical(X_train[col], categories=combined_categories)
    X_test[col] = pd.Categorical(X_test[col], categories=combined_categories)
    
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

# Step 5: Scale Numerical Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 6: Encode Labels with Combined Categories
# Combine `y_train` and `y_test` to fit LabelEncoder on all possible labels
all_labels = pd.concat([y_train, y_test], axis=0)
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Print the unique classes in the label
print("Label classes:", label_encoder.classes_)


Label classes: ['apache2' 'back' 'buffer_overflow' 'ftp_write' 'guess_passwd'
 'httptunnel' 'imap' 'ipsweep' 'land' 'loadmodule' 'mailbomb' 'mscan'
 'multihop' 'named' 'neptune' 'nmap' 'normal' 'perl' 'phf' 'pod'
 'portsweep' 'processtable' 'ps' 'rootkit' 'saint' 'satan' 'sendmail'
 'smurf' 'snmpgetattack' 'snmpguess' 'spy' 'sqlattack' 'teardrop'
 'udpstorm' 'warezclient' 'warezmaster' 'worm' 'xlock' 'xsnoop' 'xterm']


In [8]:

# Step 7: Train the Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300, 500],      # Number of trees in the forest
    'max_depth': [10, 20, 30, None],           # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],           # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],             # Minimum samples required to be at a leaf node
    'class_weight': ['balanced', 'balanced_subsample']  # Handling imbalance
}
from sklearn.model_selection import RandomizedSearchCV

# Initialize RandomizedSearchCV
rf_random = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_grid,
    n_iter=20,                 # Number of parameter combinations to try
    cv=3,                      # Number of cross-validation folds
    verbose=2,                 # Level of logging
    random_state=42,
    n_jobs=-1                  # Use all available cores
)

# Run the hyperparameter search on the training data
rf_random.fit(X_train_scaled, y_train_encoded)

# Retrieve the best model after search
best_rf_model = rf_random.best_estimator_
print("Best Parameters:", rf_random.best_params_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits




Best Parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 30, 'class_weight': 'balanced'}


In [11]:

# Step 7: Train the Random Forest Model
rf_model = RandomForestClassifier(n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=1,
    class_weight='balanced',
   random_state=42)
rf_model.fit(X_train_scaled, y_train_encoded)
print("Model training complete.")

# Step 8: Evaluate the Model
# Predict on the test set
y_pred = rf_model.predict(X_test_scaled)

# Decode predictions back to original labels for readability
y_pred_labels = label_encoder.inverse_transform(y_pred)
y_test_labels = label_encoder.inverse_transform(y_test_encoded)

# Confusion Matrix and Classification Report
print("Confusion Matrix:")
print(confusion_matrix(y_test_labels, y_pred_labels))
print("\nClassification Report:")
print(classification_report(y_test_labels, y_pred_labels,zero_division=1))

Model training complete.
Confusion Matrix:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Classification Report:
                 precision    recall  f1-score   support

        apache2       1.00      0.00      0.00       737
           back       1.00      0.00      0.00       359
buffer_overflow       1.00      0.00      0.00        20
      ftp_write       1.00      0.00      0.00         3
   guess_passwd       1.00      0.00      0.00      1231
     httptunnel       1.00      0.00      0.00       133
           imap       1.00      0.00      0.00         1
        ipsweep       1.00      0.00      0.00       141
           land       0.00      0.00      0.00         7
     loadmodule       1.00      0.00      0.00         2
       mailbomb       1.00      0.00      0.00       293
          mscan       1.00      0.00      0.00       996
       multihop       1.00      0.00      0.00        18
          name

In [67]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Step 1: Extract critical feature indices manually from the screenshots
critical_features = [2, 1, 8, 23, 20, 26]  # Replace these indices with those from your SHAP analysis.

# Step 2: Implement a targeted attack
def targeted_attack(X, critical_features, epsilon=3.0):
    """
    Apply a targeted perturbation attack on the dataset.
    
    Parameters:
    X (array): The input dataset.
    critical_features (list): Indices of critical features to perturb.
    epsilon (float): The perturbation amount.

    Returns:
    X_perturbed (array): The perturbed dataset.
    """
    X_perturbed = X.copy()
    for feature_idx in critical_features:
        perturbation = np.random.uniform(-epsilon, epsilon, size=X_perturbed.shape[0])
        X_perturbed[:, feature_idx] += perturbation
    return X_perturbed

# Step 3: Apply the attack
X_test_perturbed = targeted_attack(X_test_scaled, critical_features, epsilon=3.0)

# Step 4: Evaluate the model on the perturbed dataset
y_pred_perturbed = rf_model.predict(X_test_perturbed)

# Decode predictions back to original labels for readability
y_pred_perturbed_labels = label_encoder.inverse_transform(y_pred_perturbed)

# Step 5: Generate evaluation metrics
print("Confusion Matrix After Targeted Attack:")
print(confusion_matrix(y_test_labels, y_pred_perturbed_labels))

print("\nClassification Report After Targeted Attack:")
print(classification_report(y_test_labels, y_pred_perturbed_labels, zero_division=1))


Confusion Matrix After Targeted Attack:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Classification Report After Targeted Attack:
                 precision    recall  f1-score   support

        apache2       1.00      0.00      0.00       737
           back       1.00      0.00      0.00       359
buffer_overflow       1.00      0.00      0.00        20
      ftp_write       1.00      0.00      0.00         3
   guess_passwd       1.00      0.00      0.00      1231
     httptunnel       1.00      0.00      0.00       133
           imap       1.00      0.00      0.00         1
        ipsweep       0.00      0.00      0.00       141
           land       0.00      0.00      0.00         7
     loadmodule       1.00      0.00      0.00         2
       mailbomb       1.00      0.00      0.00       293
          mscan       1.00      0.00      0.00       996
       multihop       0.00      0.00      0.00      

In [6]:
import numpy as np
from art.estimators.classification import SklearnClassifier
from art.attacks.evasion import BoundaryAttack
from art.attacks.evasion import HopSkipJump



In [45]:
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Step 1: Create XGBoost DMatrix for training and testing
# XGBoost requires data to be converted into DMatrix format for efficient training
dtrain = xgb.DMatrix(X_train_scaled, label=y_train_encoded)
dtest = xgb.DMatrix(X_test_scaled, label=y_test_encoded)

# Step 2: Set up parameters for the XGBoost model
params = {
    'objective': 'multi:softmax',  # for multi-class classification
    'num_class': len(label_encoder.classes_),  # number of classes
    'eval_metric': 'mlogloss',  # evaluation metric
    'max_depth': 6,  # maximum depth of the tree
    'eta': 0.1,  # learning rate
    'subsample': 0.8,  # fraction of samples to use for each tree
    'colsample_bytree': 0.8,  # fraction of features to use for each tree
    'seed': 42  # for reproducibility
}

# Step 3: Train the model
num_round = 100  # number of training rounds
bst = xgb.train(params, dtrain, num_round)

# Step 4: Make predictions
y_pred = bst.predict(dtest)

import numpy as np
# Ensure predictions are integers within the valid range
y_pred = y_pred.astype(int)  # Ensure predictions are integer values

# Filter `unique_labels` to contain only valid indices within the range of `label_encoder.classes_`
unique_labels = np.unique(np.concatenate([y_test_encoded, y_pred]))
unique_labels = unique_labels[unique_labels < len(label_encoder.classes_)]

# Get filtered target names based on `unique_labels`
filtered_target_names = [label_encoder.classes_[i] for i in unique_labels]

# Print the classification report with the filtered target names
from sklearn.metrics import classification_report

# Step 5: Evaluate the Model
print("Accuracy:", accuracy_score(y_test_encoded, y_pred))
print("Classification Report:\n", classification_report(y_test_encoded, y_pred, labels=unique_labels, target_names=filtered_target_names,zero_division=1))


Accuracy: 0.34649336822960564
Classification Report:
                  precision    recall  f1-score   support

        apache2       1.00      0.00      0.00       737
           back       1.00      0.00      0.00       359
buffer_overflow       1.00      0.00      0.00        20
      ftp_write       1.00      0.00      0.00         3
   guess_passwd       0.00      0.00      0.00      1231
     httptunnel       1.00      0.00      0.00       133
           imap       1.00      0.00      0.00         1
        ipsweep       1.00      0.00      0.00       141
           land       0.00      0.00      0.00         7
     loadmodule       1.00      0.00      0.00         2
       mailbomb       1.00      0.00      0.00       293
          mscan       1.00      0.00      0.00       996
       multihop       1.00      0.00      0.00        18
          named       1.00      0.00      0.00        17
        neptune       0.69      0.32      0.44      4656
           nmap       1.00      0

In [65]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
critical_features = [2, 1, 8, 3, 24, 26]
# Amplified perturbation with a selective feature targeting mechanism
def stronger_perturb_features(X, feature_indices, noise_factor=0.5, target_fraction=0.5):
    """
    Apply amplified random noise to a subset of critical features for a stronger attack.
    Args:
        X: The feature matrix (numpy array).
        feature_indices: List of indices for critical features to perturb.
        noise_factor: The scale of the noise to apply.
        target_fraction: Fraction of samples to perturb.
    Returns:
        X_perturbed: The perturbed feature matrix.
    """
    X_perturbed = X.copy()
    num_samples = X.shape[0]
    # Determine the samples to attack (randomly select a fraction)
    attack_samples = np.random.choice(
        num_samples, size=int(num_samples * target_fraction), replace=False
    )
    
    for idx in feature_indices:
        # Apply stronger noise only to the selected samples
        noise = np.random.normal(loc=0, scale=noise_factor, size=len(attack_samples))
        X_perturbed[attack_samples, idx] += noise
    return X_perturbed

# Apply a stronger perturbation to the test set
X_test_stronger_perturbed = stronger_perturb_features(
    X_test_scaled, critical_feature_indices, noise_factor=5.5, target_fraction=1.0
)

# Predict on the stronger perturbed test set using the XGBoost model
dtest_stronger_perturbed = xgb.DMatrix(X_test_stronger_perturbed)
y_pred_stronger_perturbed = bst.predict(dtest_stronger_perturbed)

# Ensure predictions are integers within the valid range
y_pred_stronger_perturbed = y_pred_stronger_perturbed.astype(int)

# Filter `unique_labels` to contain only valid indices within the range of `label_encoder.classes_`
unique_labels = np.unique(np.concatenate([y_test_encoded, y_pred_stronger_perturbed]))
unique_labels = unique_labels[unique_labels < len(label_encoder.classes_)]

# Get filtered target names based on `unique_labels`
filtered_target_names = [label_encoder.classes_[i] for i in unique_labels]

# Evaluate the model's performance after the stronger attack
print("Accuracy After Stronger Attack:", accuracy_score(y_test_encoded, y_pred_stronger_perturbed))
print("Classification Report After Stronger Attack:\n", classification_report(
    y_test_encoded, y_pred_stronger_perturbed, labels=unique_labels, target_names=filtered_target_names, zero_division=1))
print("Confusion Matrix After Stronger Targeted Attack:\n", confusion_matrix(y_test_encoded, y_pred_stronger_perturbed))


Accuracy After Stronger Attack: 0.16253382424699464
Classification Report After Stronger Attack:
                  precision    recall  f1-score   support

        apache2       1.00      0.00      0.00       737
           back       1.00      0.00      0.00       359
buffer_overflow       1.00      0.00      0.00        20
      ftp_write       1.00      0.00      0.00         3
   guess_passwd       0.00      0.00      0.00      1231
     httptunnel       1.00      0.00      0.00       133
           imap       0.00      0.00      0.00         1
        ipsweep       0.00      0.00      0.00       141
           land       0.00      0.00      0.00         7
     loadmodule       0.00      0.00      0.00         2
       mailbomb       1.00      0.00      0.00       293
          mscan       1.00      0.00      0.00       996
       multihop       1.00      0.00      0.00        18
          named       1.00      0.00      0.00        17
        neptune       0.73      0.09      0.17