In [16]:
import os

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [18]:
# Load training and testing sets from separate files.
# The directory holding the CSVs can be overridden with the UNSW_NB15_DIR
# environment variable so the notebook is not tied to a single machine; when
# the variable is unset the original location is used.
data_dir = os.environ.get("UNSW_NB15_DIR", "/Users/marlenawasiak/Desktop/Data_Collection")
train_data = pd.read_csv(os.path.join(data_dir, "UNSW_NB15_training-set.csv"))
test_data = pd.read_csv(os.path.join(data_dir, "UNSW_NB15_testing-set.csv"))



In [19]:
# Separate the target column ('attack_cat') from the features in each split
y_train = train_data['attack_cat']
X_train = train_data.drop(columns=['attack_cat'])

y_test = test_data['attack_cat']
X_test = test_data.drop(columns=['attack_cat'])

# NOTE(review): every other column of the CSVs is kept as a feature. If the
# files also carry a row identifier or a binary attack flag (presumably `id`
# and `label` in the published UNSW-NB15 splits -- confirm), those columns
# leak the target and should be reviewed before trusting the scores.

# One-hot encode the categorical columns. The test frame is then reindexed
# onto the training columns so both matrices share one layout: dummy columns
# missing from the test split are filled with 0.
categorical_features = ['proto', 'service', 'state']
X_train = pd.get_dummies(X_train, columns=categorical_features, drop_first=True)
X_test = (
    pd.get_dummies(X_test, columns=categorical_features, drop_first=True)
    .reindex(columns=X_train.columns, fill_value=0)
)

# Learn the category -> integer mapping from the training targets only, then
# apply that same mapping to both splits
label_encoder = LabelEncoder().fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)


In [22]:
# Rebalance the training set only; the test set keeps its original class mix
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Standardise features: the scaling statistics are learned from the resampled
# training data and then reused unchanged for the test data
scaler = StandardScaler().fit(X_train_resampled)
X_train_resampled = scaler.transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

In [23]:
# Build the Random Forest and fit it in one step on the resampled, scaled
# training data (`fit` returns the estimator itself)
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    min_samples_split=10,
    class_weight='balanced',  # weights classes inversely to their frequency
    random_state=42,
).fit(X_train_resampled, y_train_resampled)

# Predict on the scaled test set
y_pred = rf_model.predict(X_test_scaled)

# Map the integer codes back to the attack-category names so the report
# rows are readable
y_test_decoded = label_encoder.inverse_transform(y_test)
y_pred_decoded = label_encoder.inverse_transform(y_pred)

# Report the evaluation metrics
print(
    "Confusion Matrix:",
    confusion_matrix(y_test_decoded, y_pred_decoded),
    sep="\n",
)
print(
    "\nClassification Report:",
    classification_report(y_test_decoded, y_pred_decoded),
    sep="\n",
)

Confusion Matrix:
[[    0   584    20    40    18    14     1     0     0     0]
 [    0   490    20     6     9    14    22     2    17     3]
 [   19  2740    38   454    87    14   360    55   205   117]
 [  116  2930    62  4808   506    28   990    70   565  1057]
 [    0  1180    42    17  3365    28   416   138   761   115]
 [    5    36     0   290    91 18142   104     5   154    44]
 [ 1809     0     0    43  6639     0 27816   200   395    98]
 [    1   308     2     2    43     0   124  2468   158   390]
 [    0     0     0     0     5     0    25    16   332     0]
 [    0     0     0     0     2     0     0     0     8    34]]

Classification Report:
                precision    recall  f1-score   support

      Analysis       0.00      0.00      0.00       677
      Backdoor       0.06      0.84      0.11       583
           DoS       0.21      0.01      0.02      4089
      Exploits       0.85      0.43      0.57     11132
       Fuzzers       0.31      0.56      0.40 

In [63]:
def extreme_targeted_attack_v3(X, critical_feature_indices, epsilon_critical=2.5, epsilon_noise=0.3, iterations=10, random_state=None):
    """
    Apply repeated random perturbations to a feature matrix.

    On every iteration, each column listed in ``critical_feature_indices``
    receives an independent per-row uniform draw from
    ``[-epsilon_critical, epsilon_critical)``; afterwards every column
    (critical ones included) receives per-row Gaussian noise with standard
    deviation ``epsilon_noise``. The perturbations accumulate over the
    iterations and are never clipped, so the total displacement grows with
    ``iterations``.

    Args:
    - X: 2-D array-like of numeric features (rows are samples). It is not
      modified.
    - critical_feature_indices: Column indices that get the extra uniform
      perturbation. An index listed twice is perturbed twice per iteration.
    - epsilon_critical: Half-width of the uniform perturbation.
    - epsilon_noise: Standard deviation of the Gaussian noise.
    - iterations: Number of perturbation rounds; 0 returns an unperturbed copy.
    - random_state: Optional seed (or ``numpy.random.Generator``) that makes
      the perturbation reproducible. ``None`` (default) draws from NumPy's
      global random state, exactly as before.
    Returns:
    - X_perturbed: Perturbed copy of ``X`` as a floating-point ndarray of the
      same shape.
    """
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    # np.array always copies, so the caller's data is left untouched.
    # Integer/boolean inputs are promoted to float because the in-place
    # additions below would otherwise raise a casting error.
    X_perturbed = np.array(X)
    if not np.issubdtype(X_perturbed.dtype, np.floating):
        X_perturbed = X_perturbed.astype(float)

    n_samples, n_features = X_perturbed.shape

    for _ in range(iterations):
        # Stronger, bounded perturbation on the critical features only
        for feature_idx in critical_feature_indices:
            perturbation = rng.uniform(-epsilon_critical, epsilon_critical, size=n_samples)
            X_perturbed[:, feature_idx] += perturbation

        # Gaussian noise on every feature, drawn one column at a time to keep
        # the temporary allocation at one column's worth of values
        for feature_idx in range(n_features):
            perturbation = rng.normal(0, epsilon_noise, size=n_samples)
            X_perturbed[:, feature_idx] += perturbation

    return X_perturbed

# Step 1: perturb the scaled test set.
# NOTE(review): `critical_feature_indices` is not assigned in any cell shown
# above this one -- confirm where it is defined before a fresh Run All.
X_test_perturbed_extreme = extreme_targeted_attack_v3(
    X_test_scaled,
    critical_feature_indices,
    epsilon_critical=2.7,
    epsilon_noise=0.5,
    iterations=10,
)

# Step 2: let the trained Random Forest classify the perturbed samples
y_pred_perturbed_extreme = rf_model.predict(X_test_perturbed_extreme)

# Translate integer codes back into attack-category names
y_pred_perturbed_extreme_decoded = label_encoder.inverse_transform(y_pred_perturbed_extreme)
y_test_decoded = label_encoder.inverse_transform(y_test)

# Step 3: report how the predictions hold up under the perturbation
print(
    "Confusion Matrix After Extreme Targeted Attack (Random Forest):",
    confusion_matrix(y_test_decoded, y_pred_perturbed_extreme_decoded),
    sep="\n",
)
print(
    "\nClassification Report After Extreme Targeted Attack (Random Forest):",
    classification_report(y_test_decoded, y_pred_perturbed_extreme_decoded),
    sep="\n",
)



Confusion Matrix After Extreme Targeted Attack (Random Forest):
[[    1     3     5    69   109     2   482     0     0     6]
 [    2     1     2    51   115     5   400     0     4     3]
 [   10    13    27   321   368    15  3293     5     7    30]
 [   40    16    41  1166   919    15  8729     8    14   184]
 [   17    10    30   598   973    10  4301     8    19    96]
 [    9     6    87   653  3418  2216 12433     7    12    30]
 [   19     9    34   674   748    10 35356    11    12   127]
 [    8    11    28   299   408     3  2650     2    23    64]
 [    2     0     2    31    47     0   290     1     3     2]
 [    0     0     0     8     4     0    29     0     1     2]]

Classification Report After Extreme Targeted Attack (Random Forest):
                precision    recall  f1-score   support

      Analysis       0.01      0.00      0.00       677
      Backdoor       0.01      0.00      0.00       583
           DoS       0.11      0.01      0.01      4089
      Expl

In [6]:

from art.estimators.classification import SklearnClassifier
from art.attacks.evasion import BoundaryAttack



In [66]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
import xgboost as xgb

# Load your dataset
# Assume 'train_data' and 'test_data' are already loaded DataFrames with the UNSW dataset

# Prepare training and test sets: 'attack_cat' is the target, everything else is a feature
X_train = train_data.drop(columns=['attack_cat'])
y_train = train_data['attack_cat']
X_test = test_data.drop(columns=['attack_cat'])
y_test = test_data['attack_cat']

# One-hot encode the categorical features
categorical_features = ['proto', 'service', 'state']
X_train = pd.get_dummies(X_train, columns=categorical_features, drop_first=True)
X_test = pd.get_dummies(X_test, columns=categorical_features, drop_first=True)

# Align test set columns with training set columns (missing dummy columns become 0)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Encode the target label; the mapping is learned from the training labels only
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Handle class imbalance with RandomOverSampler (training data only)
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Scale the features
scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)  # Fit and transform on training set
X_test_scaled = scaler.transform(X_test)                     # Only transform the test set

# Initialize and train the XGBoost model.
# `use_label_encoder` is no longer passed: the installed XGBoost reports it as
# an unused parameter, and the labels are already integer-encoded above.
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='mlogloss')
xgb_model.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred = xgb_model.predict(X_test_scaled)

# Evaluate the model's performance; target_names maps the integer codes back
# to category names in the report
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
conf_matrix = confusion_matrix(y_test, y_pred)

# Display evaluation metrics
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_rep)
print("Confusion Matrix:")
print(conf_matrix)


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.7403439731817519
Classification Report:
                precision    recall  f1-score   support

      Analysis       0.07      0.19      0.10       677
      Backdoor       0.02      0.24      0.03       583
           DoS       0.11      0.09      0.10      4089
      Exploits       0.77      0.41      0.54     11132
       Fuzzers       0.41      0.70      0.52      6062
       Generic       1.00      0.60      0.75     18871
        Normal       1.00      1.00      1.00     37000
Reconnaissance       0.91      0.78      0.84      3496
     Shellcode       0.27      0.97      0.43       378
         Worms       0.44      0.80      0.56        44

      accuracy                           0.74     82332
     macro avg       0.50      0.58      0.49     82332
  weighted avg       0.86      0.74      0.78     82332

Confusion Matrix:
[[  126   196   206   148     1     0     0     0     0     0]
 [  126   139   205   103     4     0     0     0     6     0]
 [  475  2465   3

In [103]:
def dynamic_targeted_attack(X, critical_feature_indices, initial_epsilon=0.1, max_iterations=20, clip=True, random_state=None):
    """
    Perform a progressively stronger random perturbation of selected features.

    On iteration ``k`` (1-based) the base step is
    ``initial_epsilon * k / max_iterations``, so it ramps up linearly and only
    reaches ``initial_epsilon`` on the final iteration. For every critical
    column each row is shifted by that step with a random sign, scaled by an
    independent random factor drawn from ``[1, 2)``. After each iteration the
    critical columns are clipped back into the per-column range observed in
    the original ``X`` (unless ``clip`` is False).

    Args:
    - X: 2-D array-like of numeric features (rows are samples). It is not
      modified.
    - critical_feature_indices: Indices of the features to be perturbed. An
      index listed twice is perturbed twice per iteration.
    - initial_epsilon: Base perturbation magnitude reached on the last
      iteration (earlier iterations use a proportional fraction of it).
    - max_iterations: Number of perturbation rounds; 0 returns an unperturbed
      copy.
    - clip: When True (default, the previous behaviour) perturbed values are
      kept within each column's original [min, max] range.
    - random_state: Optional seed (or ``numpy.random.Generator``) that makes
      the perturbation reproducible. ``None`` (default) draws from NumPy's
      global random state, exactly as before.

    Returns:
    - X_perturbed: Perturbed copy of ``X`` as a floating-point ndarray of the
      same shape.
    """
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    # np.array always copies, so the caller's data is left untouched.
    # Integer/boolean inputs are promoted to float because the in-place
    # additions below would otherwise raise a casting error.
    X_perturbed = np.array(X)
    if not np.issubdtype(X_perturbed.dtype, np.floating):
        X_perturbed = X_perturbed.astype(float)

    n_samples = X_perturbed.shape[0]
    if n_samples == 0:
        # Nothing to perturb, and min/max are undefined for zero rows
        return X_perturbed

    # Only the critical columns are ever modified, so they are the only ones
    # that need clipping. Their bounds come from the unperturbed data and are
    # loop-invariant, so compute them once.
    clip_columns = list(dict.fromkeys(critical_feature_indices))
    if clip and clip_columns:
        lower_bounds = X_perturbed[:, clip_columns].min(axis=0)
        upper_bounds = X_perturbed[:, clip_columns].max(axis=0)

    for iteration in range(1, max_iterations + 1):
        # Linearly increasing step size
        epsilon_step = initial_epsilon * (iteration / max_iterations)

        for feature_idx in critical_feature_indices:
            # Random sign per row, then a random magnification in [1, 2)
            perturbation = rng.choice([-epsilon_step, epsilon_step], size=n_samples)
            X_perturbed[:, feature_idx] += perturbation * rng.uniform(1, 2, size=n_samples)

        if clip and clip_columns:
            X_perturbed[:, clip_columns] = np.clip(
                X_perturbed[:, clip_columns], lower_bounds, upper_bounds
            )

    return X_perturbed


# Attack configuration
critical_features = [6, 9, 7, 8, 5, 4]  # Example column indices of critical features (adjust based on SHAP)
initial_epsilon = 4.0  # Base perturbation magnitude handed to the attack
max_iterations = 20  # Number of perturbation rounds

# Perturb the scaled test set with the dynamic targeted attack
X_test_perturbed = dynamic_targeted_attack(
    X_test_scaled,
    critical_features,
    initial_epsilon=initial_epsilon,
    max_iterations=max_iterations,
)

# Classify the perturbed samples with the trained XGBoost model
y_pred_perturbed = xgb_model.predict(X_test_perturbed)

# Metrics on the perturbed data; target_names turns the integer codes into
# category names in the report
conf_matrix_perturbed = confusion_matrix(y_test, y_pred_perturbed)
classification_report_perturbed = classification_report(
    y_test,
    y_pred_perturbed,
    target_names=label_encoder.classes_,
)

# Display the results
print(
    "Confusion Matrix After Stronger Attack (XGBoost):",
    conf_matrix_perturbed,
    sep="\n",
)
print(
    "\nClassification Report After Stronger Targeted Attack (XGBoost):",
    classification_report_perturbed,
    sep="\n",
)



Confusion Matrix After Stronger Attack (XGBoost):
[[    0   207   323   117    30     0     0     0     0     0]
 [    4   125   299   117    34     0     0     2     1     1]
 [   24  2799   559   385   226     0     0     7     6    83]
 [   92  5086  1569  2838   982     0     0    22    14   529]
 [    8  1022  1274  1095  2588     0     0    19    42    14]
 [    2  2535  3490   898  5374  6536     0     9     3    24]
 [    0     0     0     0     0     0 37000     0     0     0]
 [    1   875   623   596   527     0     0   654   128    92]
 [    0    83    96    79    81     0     0     1    38     0]
 [    0    11     6     9     5     0     0     0     1    12]]

Classification Report After Stronger Targeted Attack (XGBoost):
                precision    recall  f1-score   support

      Analysis       0.00      0.00      0.00       677
      Backdoor       0.01      0.21      0.02       583
           DoS       0.07      0.14      0.09      4089
      Exploits       0.46    