# In [None]:
# Random Forest

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# --- Data loading and preparation ---
# NOTE(review): the file has an .xls extension but is parsed as CSV text;
# confirm the export format if loading fails.
file_path = r"C:\Users\haris\Downloads\Augmented_DNP3_Parser_Training(3).xls"
data = pd.read_csv(file_path)

# Column headers may carry stray leading/trailing whitespace; normalise them
# so the lookups by name below are reliable.
data.columns = [column.strip() for column in data.columns]

# Feature matrix: 64-bit numeric columns only, minus index artefacts and the
# target column (names that are absent are silently skipped).
excluded_columns = ['Unnamed: 0.1', 'Unnamed: 0', 'Label']
numeric_data = data.select_dtypes(include=['float64', 'int64'])
features = numeric_data.drop(columns=excluded_columns, errors='ignore')

# Binary target: 0 for rows labelled 'NORMAL', 1 for every other label.
labels = data['Label'].ne('NORMAL').astype('int64')

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Standardise using statistics learned from the training portion only, then
# apply the same transformation to both portions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply Principal Component Analysis (PCA) for dimensionality reduction.
# Aim for 20 principal components, but never request more than the data can
# support: PCA rejects a component count larger than the number of samples
# or the number of features.
n_pca_components = min(20, *X_train_scaled.shape)
pca = PCA(n_components=n_pca_components, random_state=42)
X_train_reduced = pca.fit_transform(X_train_scaled)
X_test_reduced = pca.transform(X_test_scaled)

# Plot explained variance ratio.
# The x-axis is derived from the fitted model instead of a hard-coded 1..20
# so it always has the same length as `explained_variance_ratio_`.
component_numbers = range(1, pca.n_components_ + 1)
plt.figure(figsize=(8, 5))
plt.plot(component_numbers, pca.explained_variance_ratio_, marker='o', linestyle='dashed')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('Explained Variance by Principal Components')
plt.show()

# Build the Random Forest (100 trees, depth capped at 5, class-balanced
# weights, fixed seed) and fit it on the PCA-reduced training data.
rf_model = RandomForestClassifier(
    class_weight='balanced',
    max_depth=5,
    n_estimators=100,
    random_state=42,
).fit(X_train_reduced, y_train)

# Function to filter dataset based on intrusion rate
def filter_intrusion_rate(data, rate, *, label_col='Label', normal_label='NORMAL', random_state=42):
    """Return a shuffled subset of *data* with a controlled number of anomalies.

    Every row whose label equals ``normal_label`` is kept. Rows with any other
    label are randomly down-sampled so that their count equals ``rate``
    percent of the number of normal rows, capped at the number of anomalous
    rows actually available.

    Args:
        data: DataFrame containing the ``label_col`` column.
        rate: Number of anomalies to keep, expressed as a percentage of the
            normal-row count (not of the size of the returned frame).
        label_col: Name of the column holding the class label.
        normal_label: Label value that marks a normal (non-anomalous) row.
        random_state: Seed used for both the anomaly sampling and the final
            shuffle, making the result reproducible.

    Returns:
        A new DataFrame with all normal rows plus the sampled anomalous rows,
        in shuffled order and with the original index labels preserved.

    Raises:
        ValueError: If ``rate`` is negative.
        KeyError: If ``label_col`` is not a column of ``data``.
    """
    if rate < 0:
        raise ValueError(f"rate must be non-negative, got {rate!r}")

    is_normal = data[label_col] == normal_label
    normal_data = data[is_normal]
    anomalous_data = data[~is_normal]

    # Truncate towards zero, then make sure we never ask for more anomalies
    # than exist (sampling without replacement would otherwise fail).
    requested = int(len(normal_data) * rate / 100)
    num_anomalous = min(requested, len(anomalous_data))

    sampled_anomalous_data = anomalous_data.sample(n=num_anomalous, random_state=random_state)
    combined = pd.concat([normal_data, sampled_anomalous_data])

    # frac=1 returns every row in random order, i.e. a shuffle.
    return combined.sample(frac=1, random_state=random_state)

# Define intrusion rates for analysis
intrusion_rates = [1, 3, 5, 7]
results = {}

# Evaluate only on rows the model never saw while training. Re-splitting the
# full dataset here would place training rows into the "test" portion and
# inflate every metric, so the evaluation pool is restricted to the rows of
# the original held-out split.
held_out_data = data.loc[X_test.index]

# Evaluate model performance at different intrusion rates
for rate in intrusion_rates:
    # Keep all held-out normal rows plus anomalies equal to `rate` percent of
    # the normal-row count.
    filtered_data = filter_intrusion_rate(held_out_data, rate)

    # Use exactly the feature columns the scaler and PCA were fitted on.
    X_test_filtered = filtered_data[features.columns]
    y_test_filtered = filtered_data['Label'].apply(lambda x: 0 if x == 'NORMAL' else 1).values

    # Apply the already-fitted standard scaling and PCA transformation
    X_test_scaled_filtered = scaler.transform(X_test_filtered)
    X_test_reduced_filtered = pca.transform(X_test_scaled_filtered)

    # Make predictions (hard labels and probability of the anomaly class)
    y_pred_filtered = rf_model.predict(X_test_reduced_filtered)
    y_proba_pred_filtered = rf_model.predict_proba(X_test_reduced_filtered)[:, 1]

    # Compute evaluation metrics
    accuracy_filtered = accuracy_score(y_test_filtered, y_pred_filtered)
    precision_filtered = precision_score(y_test_filtered, y_pred_filtered)
    recall_filtered = recall_score(y_test_filtered, y_pred_filtered)
    f1_filtered = f1_score(y_test_filtered, y_pred_filtered)

    # ROC-AUC is undefined when the ground truth holds a single class (for
    # example when no anomaly was sampled); report NaN instead of crashing.
    if len(set(y_test_filtered)) > 1:
        roc_auc_filtered = roc_auc_score(y_test_filtered, y_proba_pred_filtered)
    else:
        roc_auc_filtered = float('nan')

    # Pin the label order so the matrix is always 2x2 and lines up with the
    # 'Normal' / 'Attack' tick labels used in the heatmap below.
    conf_matrix = confusion_matrix(y_test_filtered, y_pred_filtered, labels=[0, 1])

    # Store results
    results[rate] = {
        'Accuracy': accuracy_filtered,
        'Precision': precision_filtered,
        'Recall': recall_filtered,
        'F1 Score': f1_filtered,
        'ROC-AUC': roc_auc_filtered,
        'Confusion Matrix': conf_matrix
    }

    # Plot confusion matrix
    plt.figure(figsize=(6, 4))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Normal', 'Attack'], yticklabels=['Normal', 'Attack'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix for {rate}% Intrusion Rate')
    plt.show()

# Report the stored metrics for each intrusion rate: one blank line, the rate
# header, the five scalar scores to four decimals, then the confusion matrix.
for rate, metrics in results.items():
    print(
        f"\nIntrusion Rate: {rate}%",
        f"Accuracy: {metrics['Accuracy']:.4f}",
        f"Precision: {metrics['Precision']:.4f}",
        f"Recall: {metrics['Recall']:.4f}",
        f"F1 Score: {metrics['F1 Score']:.4f}",
        f"ROC-AUC: {metrics['ROC-AUC']:.4f}",
        "Confusion Matrix:",
        metrics['Confusion Matrix'],
        sep="\n",
    )
