In [1]:
# RANDOM FOREST

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  # Changed to Random Forest
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
file_path = r"C:\Users\haris\OneDrive\Desktop\Augmented_DNP3_Parser_Training.csv"  # Change this path as needed
print("Loading dataset...")
data = pd.read_csv(file_path)
print("Dataset loaded successfully!")

# Standardize column names by stripping whitespace
data.columns = data.columns.str.strip()

# Debug: Check class distribution in the dataset
print("Class distribution in dataset:")
print(data['Label'].value_counts())

# Split data into features and labels
features = data.select_dtypes(include=['float64', 'int64']).drop(columns=['Unnamed: 0.1', 'Unnamed: 0', 'Label'], errors='ignore')
labels = data['Label'].apply(lambda x: 0 if x == 'NORMAL' else 1)  # 0: Normal, 1: Anomalous

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Standardize features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply PCA for dimensionality reduction (Fit PCA only on training data)
pca = PCA(n_components=20, random_state=42)  # Increase PCA components to retain more variance
X_train_reduced = pca.fit_transform(X_train_scaled)
X_test_reduced = pca.transform(X_test_scaled)

# Train Random Forest Classifier with optimized parameters
print("Training Random Forest Classifier...")
rf_model = RandomForestClassifier(
    n_estimators=100,         # Reduced the number of trees to avoid overfitting
    max_depth=5,              # Limited tree depth to prevent overfitting
    random_state=42,
    class_weight='balanced',  # Adjusted to handle class imbalance
)
rf_model.fit(X_train_reduced, y_train)
print("Random Forest Classifier trained!")

# Evaluate on the testing dataset
print("\nClassification Report for Testing Dataset:")
y_pred = rf_model.predict(X_test_reduced)
y_proba_pred = rf_model.predict_proba(X_test_reduced)[:, 1]

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba_pred)

# Print metrics with 4 decimal places
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Function to filter dataset based on intrusion rate
def filter_intrusion_rate(data, rate):
    """
    Filters the dataset to have a specified percentage of intrusion data.
    Args:
        data (pd.DataFrame): Dataset containing both normal and anomalous data.
        rate (float): Desired percentage of anomalous data (0-100).
    Returns:
        pd.DataFrame: Filtered dataset with the specified intrusion rate.
    """
    normal_data = data[data['Label'] == 'NORMAL']
    anomalous_data = data[data['Label'] != 'NORMAL']
    
    # Sample the number of anomalous instances based on the desired rate
    num_anomalous = int(len(normal_data) * rate / 100)
    num_anomalous = min(num_anomalous, len(anomalous_data))  # Avoid sampling more than available
    
    sampled_anomalous_data = anomalous_data.sample(n=num_anomalous, random_state=42)
    
    # Return the filtered dataset
    return pd.concat([normal_data, sampled_anomalous_data]).sample(frac=1, random_state=42)

# Intrusion rates to evaluate
intrusion_rates = [1, 3, 5, 7]

# Store results for each intrusion rate
results = {}

for rate in intrusion_rates:
    print(f"\nEvaluating for {rate}% intrusion rate:")
    
    # Filter dataset for both training and testing data
    filtered_data = filter_intrusion_rate(data, rate)
    
    # Debug: Check class distribution after filtering
    print("Class distribution after filtering:")
    print(filtered_data['Label'].value_counts())
    
    # Split filtered data into features and labels (Training data)
    X_train_filtered = filtered_data.select_dtypes(include=['float64', 'int64']).drop(columns=['Unnamed: 0.1', 'Unnamed: 0', 'Label'], errors='ignore')
    y_train_filtered = filtered_data['Label'].apply(lambda x: 0 if x == 'NORMAL' else 1).values
    
    # Split filtered data into features and labels (Testing data)
    X_train_filtered, X_test_filtered, y_train_filtered, y_test_filtered = train_test_split(
        X_train_filtered, y_train_filtered, test_size=0.2, random_state=42
    )
    
    # Standardize features using the same scaler
    X_train_scaled_filtered = scaler.transform(X_train_filtered)
    X_test_scaled_filtered = scaler.transform(X_test_filtered)
    
    # Apply PCA for dimensionality reduction on filtered data
    X_train_reduced_filtered = pca.transform(X_train_scaled_filtered)
    X_test_reduced_filtered = pca.transform(X_test_scaled_filtered)
    
    # Predict using trained model on filtered test data
    y_pred_filtered = rf_model.predict(X_test_reduced_filtered)
    y_proba_pred_filtered = rf_model.predict_proba(X_test_reduced_filtered)[:, 1]
    
    # Calculate performance metrics for filtered data
    accuracy_filtered = accuracy_score(y_test_filtered, y_pred_filtered)
    precision_filtered = precision_score(y_test_filtered, y_pred_filtered)
    recall_filtered = recall_score(y_test_filtered, y_pred_filtered)
    f1_filtered = f1_score(y_test_filtered, y_pred_filtered)
    roc_auc_filtered = roc_auc_score(y_test_filtered, y_proba_pred_filtered)
    
    results[rate] = {
        'Accuracy': accuracy_filtered,
        'Precision': precision_filtered,
        'Recall': recall_filtered,
        'F1 Score': f1_filtered,
        'ROC-AUC': roc_auc_filtered
    }
    
    # Print metrics for filtered data with 4 decimal places
    print(f"Accuracy: {accuracy_filtered:.4f}")
    print(f"Precision: {precision_filtered:.4f}")
    print(f"Recall: {recall_filtered:.4f}")
    print(f"F1 Score: {f1_filtered:.4f}")
    print(f"ROC-AUC: {roc_auc_filtered:.4f}")

# Display results
print("\nSummary of Results:")
for rate, metrics in results.items():
    print(f"\nIntrusion Rate: {rate}%")
    print(f"Accuracy: {metrics['Accuracy']:.4f}")
    print(f"Precision: {metrics['Precision']:.4f}")
    print(f"Recall: {metrics['Recall']:.4f}")
    print(f"F1 Score: {metrics['F1 Score']:.4f}")
    print(f"ROC-AUC: {metrics['ROC-AUC']:.4f}")

# Feature importances
importances = rf_model.feature_importances_
print("\nFeature Importances:")
print(importances)



Loading dataset...
Dataset loaded successfully!
Class distribution in dataset:
Label
NORMAL                 1398
REPLAY                  466
COLD_RESTART            466
DNP3_INFO               466
DNP3_ENUMERATE          466
WARM_RESTART            466
DISABLE_UNSOLICITED     466
INIT_DATA               466
STOP_APP                466
Name: count, dtype: int64
Training Random Forest Classifier...
Random Forest Classifier trained!

Classification Report for Testing Dataset:
Accuracy: 0.9932
Precision: 0.9922
Recall: 0.9987
F1 Score: 0.9954
ROC-AUC: 0.9997

Confusion Matrix:
[[256   6]
 [  1 763]]

Evaluating for 1% intrusion rate:
Class distribution after filtering:
Label
NORMAL            1398
COLD_RESTART         4
STOP_APP             2
DNP3_INFO            2
WARM_RESTART         2
INIT_DATA            1
DNP3_ENUMERATE       1
REPLAY               1
Name: count, dtype: int64
Accuracy: 0.9859
Precision: 0.2000
Recall: 1.0000
F1 Score: 0.3333
ROC-AUC: 1.0000

Evaluating for 3% intrusio