# Network Security Capstone - Results Comparison

**Purpose:** Integrate and compare findings from both unsupervised (BETH) and supervised (UNSW-NB15) analyses.

---

## Objectives:
1. Load results from previous notebooks
2. Compare unsupervised vs supervised model performance
3. Analyze how data quality and feature engineering affected outcomes
4. Identify key insights and best-performing models
5. Create comprehensive visualizations for comparison

---

**Author:** Joshua Laubach  
**Date:** October 27, 2025

## 1. Import Libraries and Load Results

In [None]:
# Standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path

# Notebook-wide configuration: pandas display limits, warning filter,
# seaborn theme, and the default matplotlib figure size.
for option_name, option_value in (('display.max_columns', None),
                                  ('display.max_rows', 100)):
    pd.set_option(option_name, option_value)
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

print("All libraries imported successfully!")

In [None]:
# Directory (relative to this notebook) that the result CSVs are read from
RESULTS_DIR = Path('../results')

# Report whether the directory exists; when it does, list its CSV files
if RESULTS_DIR.exists():
    print(f"Results directory found: {RESULTS_DIR}")
    print("Available result files:")
    for csv_path in sorted(RESULTS_DIR.glob('*.csv')):
        print(f"  - {csv_path.name}")
else:
    print(f"Results directory not found: {RESULTS_DIR}")
    print("Please run notebooks 02 and 03 first to generate results.")

## 2. Load Unsupervised Results (BETH Dataset)

In [None]:
# Read the BETH baseline-vs-enhanced comparison CSV into `beth_results`
BETH_RESULTS_FILE = RESULTS_DIR / 'beth_baseline_vs_enhanced_comparison.csv'
try:
    beth_results = pd.read_csv(BETH_RESULTS_FILE)
except FileNotFoundError:
    # None is the sentinel later cells test before using the table
    beth_results = None
    print(f"[X] File not found: {BETH_RESULTS_FILE.name}")
    print("   Please run notebook '02_beth_unsupervised.ipynb' to generate this file.")
else:
    print(f"[OK] Successfully loaded: {BETH_RESULTS_FILE.name}")
    print(f"\nShape: {beth_results.shape}")
    print("\n[BETH Unsupervised Models - Baseline vs Enhanced Performance]")
    print(beth_results.to_string(index=False))


In [None]:
# Read the BETH TF-IDF feature importance CSV and preview its first 10 rows
BETH_TFIDF_FILE = RESULTS_DIR / 'beth_tfidf_feature_importance.csv'
try:
    beth_tfidf_importance = pd.read_csv(BETH_TFIDF_FILE)
except FileNotFoundError:
    # None marks the table as unavailable
    beth_tfidf_importance = None
    print(f"[X] File not found: {BETH_TFIDF_FILE.name}")
else:
    print(f"[OK] Successfully loaded: {BETH_TFIDF_FILE.name}")
    print("\nTop 10 most important TF-IDF features:")
    print(beth_tfidf_importance.head(10).to_string(index=False))


In [None]:
# Read the BETH selected TF-IDF features CSV, report its row count and
# preview the first 15 rows
BETH_SELECTED_FEATURES_FILE = RESULTS_DIR / 'beth_selected_tfidf_features.csv'
try:
    beth_selected_features = pd.read_csv(BETH_SELECTED_FEATURES_FILE)
except FileNotFoundError:
    # None marks the table as unavailable
    beth_selected_features = None
    print(f"[X] File not found: {BETH_SELECTED_FEATURES_FILE.name}")
else:
    print(f"[OK] Successfully loaded: {BETH_SELECTED_FEATURES_FILE.name}")
    print(f"\nNumber of selected features: {len(beth_selected_features)}")
    print("\nTop 15 selected TF-IDF features:")
    print(beth_selected_features.head(15).to_string(index=False))


In [None]:
# Read the BETH sus-vs-evil CSVs for the baseline and enhanced models
BETH_SUS_EVIL_BASELINE_FILE = RESULTS_DIR / 'beth_sus_vs_evil_baseline.csv'
BETH_SUS_EVIL_ENHANCED_FILE = RESULTS_DIR / 'beth_sus_vs_evil_enhanced.csv'
try:
    beth_sus_evil_baseline = pd.read_csv(BETH_SUS_EVIL_BASELINE_FILE)
    beth_sus_evil_enhanced = pd.read_csv(BETH_SUS_EVIL_ENHANCED_FILE)
except FileNotFoundError as e:
    # If either file is missing, both tables are reset to None
    beth_sus_evil_baseline = None
    beth_sus_evil_enhanced = None
    print(f"[X] File not found: {e.filename}")
    print("   Please run notebook '02_beth_unsupervised.ipynb' to generate these files.")
else:
    for loaded_path in (BETH_SUS_EVIL_BASELINE_FILE, BETH_SUS_EVIL_ENHANCED_FILE):
        print(f"[OK] Successfully loaded: {loaded_path.name}")
    print("\n[Baseline Model - Sus vs Evil Performance]")
    print(beth_sus_evil_baseline.to_string(index=False))
    print("\n[Enhanced Model - Sus vs Evil Performance]")
    print(beth_sus_evil_enhanced.to_string(index=False))


In [None]:
# Read the BETH anomalous arguments CSV and preview its first 10 rows
BETH_ANOMALOUS_ARGS_FILE = RESULTS_DIR / 'beth_anomalous_arguments_analysis.csv'
try:
    beth_anomalous_args = pd.read_csv(BETH_ANOMALOUS_ARGS_FILE)
except FileNotFoundError:
    # None marks the table as unavailable
    beth_anomalous_args = None
    print(f"[X] File not found: {BETH_ANOMALOUS_ARGS_FILE.name}")
    print("   Please run notebook '02_beth_unsupervised.ipynb' to generate this file.")
else:
    print(f"[OK] Successfully loaded: {BETH_ANOMALOUS_ARGS_FILE.name}")
    print("\nTop 10 anomalous argument patterns:")
    print(beth_anomalous_args.head(10).to_string(index=False))


## 3. Load Supervised Results (UNSW-NB15 Dataset)

In [None]:
# Read the UNSW-NB15 supervised comparison CSV and, when a 'roc_auc'
# column is present, report the row with the highest value
UNSW_RESULTS_FILE = RESULTS_DIR / 'unsw_supervised_comparison.csv'
try:
    unsw_results = pd.read_csv(UNSW_RESULTS_FILE)
except FileNotFoundError:
    # None marks the table as unavailable
    unsw_results = None
    print(f"[X] File not found: {UNSW_RESULTS_FILE.name}")
    print("   Please run notebook '03_unsw_supervised.ipynb' to generate this file.")
else:
    print(f"[OK] Successfully loaded: {UNSW_RESULTS_FILE.name}")
    print(f"\nShape: {unsw_results.shape}")
    print("\n[UNSW-NB15 Supervised Models - Performance Summary]")
    print(unsw_results.to_string(index=False))

    # Identify best model
    if 'roc_auc' in unsw_results.columns:
        best_idx = unsw_results['roc_auc'].idxmax()
        best_row = unsw_results.loc[best_idx]
        best_model = best_row['model']
        best_auc = best_row['roc_auc']
        print(f"\nBest model by ROC-AUC: {best_model} (AUC={best_auc:.4f})")


In [None]:
# Read the Random Forest feature importances CSV and preview 10 rows
RF_IMPORTANCES_FILE = RESULTS_DIR / 'unsw_rf_feature_importances.csv'
try:
    rf_importances = pd.read_csv(RF_IMPORTANCES_FILE)
except FileNotFoundError:
    # None marks the table as unavailable
    rf_importances = None
    print(f"[X] File not found: {RF_IMPORTANCES_FILE.name}")
else:
    print(f"[OK] Successfully loaded: {RF_IMPORTANCES_FILE.name}")
    print("\n[Top 10 Most Important Features - Random Forest]")
    print(rf_importances.head(10).to_string(index=False))


In [None]:
# Read the XGBoost feature importances CSV and preview 10 rows
XGB_IMPORTANCES_FILE = RESULTS_DIR / 'unsw_xgb_feature_importances.csv'
try:
    xgb_importances = pd.read_csv(XGB_IMPORTANCES_FILE)
except FileNotFoundError:
    # None marks the table as unavailable
    xgb_importances = None
    print(f"[X] File not found: {XGB_IMPORTANCES_FILE.name}")
else:
    print(f"[OK] Successfully loaded: {XGB_IMPORTANCES_FILE.name}")
    print("\n[Top 10 Most Important Features - XGBoost]")
    print(xgb_importances.head(10).to_string(index=False))


In [None]:
# Load UNSW feature selection results.
# All three CSVs are optional; if any one is missing, all three tables
# are set to None.
UNSW_SELECTED_FEATURES_FILE = RESULTS_DIR / 'unsw_selected_features.csv'
UNSW_FEAT_SEL_COMP_FILE = RESULTS_DIR / 'unsw_feature_selection_comparison.csv'
UNSW_FEAT_SEL_PERF_FILE = RESULTS_DIR / 'unsw_feature_selection_performance.csv'

try:
    unsw_selected_features = pd.read_csv(UNSW_SELECTED_FEATURES_FILE)
    unsw_feature_selection_comparison = pd.read_csv(UNSW_FEAT_SEL_COMP_FILE)
    unsw_feature_selection_performance = pd.read_csv(UNSW_FEAT_SEL_PERF_FILE)
except FileNotFoundError as e:
    unsw_selected_features = None
    unsw_feature_selection_comparison = None
    unsw_feature_selection_performance = None
    print(f"[i]  Feature selection file not found: {e.filename}")
    print("   This is optional. To generate, re-run the feature selection section in notebook 03.")
else:
    print("[OK] Successfully loaded all feature selection files.")

    # Each section is a heading followed by the table printed without index
    report_sections = (
        ("\n[Feature Selection Methods Comparison]", unsw_feature_selection_comparison),
        ("\n[All Features vs Selected Features Performance]", unsw_feature_selection_performance),
        ("\n[Top 10 Selected Features]", unsw_selected_features.head(10)),
    )
    for heading, table in report_sections:
        print(heading)
        print(table.to_string(index=False))


In [None]:
# Load XGBoost hyperparameter tuning results.
# All three CSVs are optional; if any one is missing, all three tables
# are set to None.
XGB_LR_TUNING_FILE = RESULTS_DIR / 'unsw_xgb_learning_rate_tuning.csv'
XGB_DEPTH_TUNING_FILE = RESULTS_DIR / 'unsw_xgb_depth_tuning.csv'
XGB_REG_TUNING_FILE = RESULTS_DIR / 'unsw_xgb_regularization_tuning.csv'


def _row_with_max_auc(frame):
    """Return the row of `frame` whose 'roc_auc' value is largest."""
    return frame.loc[frame['roc_auc'].idxmax()]


try:
    xgb_lr_tuning = pd.read_csv(XGB_LR_TUNING_FILE)
    xgb_depth_tuning = pd.read_csv(XGB_DEPTH_TUNING_FILE)
    xgb_reg_tuning = pd.read_csv(XGB_REG_TUNING_FILE)
except FileNotFoundError as e:
    xgb_lr_tuning = None
    xgb_depth_tuning = None
    xgb_reg_tuning = None
    print(f"[i]  XGBoost tuning file not found: {e.filename}")
    print("   This is optional. To generate, re-run the hyperparameter tuning section in notebook 03.")
else:
    print("[OK] Successfully loaded all XGBoost tuning files.")

    print("\n[Learning Rate Tuning - Best Configuration]")
    best_lr = _row_with_max_auc(xgb_lr_tuning)
    print(f"  Learning Rate: {best_lr['learning_rate']}, ROC-AUC: {best_lr['roc_auc']:.4f}")

    print("\n[Max Depth Tuning - Best Configuration]")
    best_depth = _row_with_max_auc(xgb_depth_tuning)
    print(f"  Max Depth: {best_depth['max_depth']:.0f}, ROC-AUC: {best_depth['roc_auc']:.4f}")

    print("\n[Regularization Tuning - Best Configuration]")
    best_reg = _row_with_max_auc(xgb_reg_tuning)
    print(f"  Config: {best_reg['configuration']}, ROC-AUC: {best_reg['roc_auc']:.4f}")


In [None]:
# Read the two-stage pipeline predictions and the stage 2 per-attack-type
# performance table; both become None if either file is missing
UNSW_TWO_STAGE_PREDS_FILE = RESULTS_DIR / 'unsw_two_stage_predictions.csv'
UNSW_STAGE2_PERF_FILE = RESULTS_DIR / 'unsw_stage2_attack_type_performance.csv'

try:
    unsw_two_stage = pd.read_csv(UNSW_TWO_STAGE_PREDS_FILE)
    unsw_stage2_performance = pd.read_csv(UNSW_STAGE2_PERF_FILE)
except FileNotFoundError as e:
    unsw_two_stage = None
    unsw_stage2_performance = None
    print(f"[X] File not found: {e.filename}")
    print("   Please run the two-stage pipeline section in notebook '03_unsw_supervised.ipynb'.")
else:
    for loaded_path in (UNSW_TWO_STAGE_PREDS_FILE, UNSW_STAGE2_PERF_FILE):
        print(f"[OK] Successfully loaded: {loaded_path.name}")

    # Count the rows stage 1 labelled 'Attack'
    stage1_attack_count = unsw_two_stage['stage1_prediction'].eq('Attack').sum()
    print("\n[Two-Stage Pipeline Predictions]")
    print(f"  Total predictions: {len(unsw_two_stage)}")
    print(f"  Stage 1 detected attacks: {stage1_attack_count}")

    print("\n[Stage 2 Attack Type Classification Performance]")
    print(unsw_stage2_performance.to_string(index=False))


## 4. Cross-Dataset Comparison

Compare the challenges and outcomes of unsupervised (BETH) vs supervised (UNSW-NB15) approaches.

In [None]:
# Build the cross-dataset comparison table.
# Each tuple is one table row: (aspect, BETH entry, UNSW-NB15 entry).
comparison_rows = [
    ('Domain',
     'System Call Logs',
     'Network Traffic'),
    ('Learning Paradigm',
     'Unsupervised Clustering',
     'Supervised Classification'),
    ('Primary Metric',
     'Detection Rate / FPR',
     'ROC-AUC / F1-Score'),
    ('Best Model Type',
     'K-Means / GMM (varies by metric)',
     'XGBoost / Random Forest'),
    ('Key Challenge',
     'Distinguishing subtle anomalies without labels',
     'High dimensionality and class imbalance'),
    ('Feature Engineering Impact',
     'Critical - enables distance-based separation',
     'Enhances performance - pair features crucial'),
    ('Interpretability',
     'Moderate - cluster centers interpretable',
     'High - feature importances clearly defined'),
]

# Transpose the rows into the column-oriented dict the DataFrame is built from
aspect_col, beth_col, unsw_col = (list(column) for column in zip(*comparison_rows))
comparison_data = {
    'Aspect': aspect_col,
    'BETH (Unsupervised)': beth_col,
    'UNSW-NB15 (Supervised)': unsw_col,
}

comparison_df = pd.DataFrame(comparison_data)

rule = "=" * 100
print("\n" + rule)
print("CROSS-DATASET COMPARISON")
print(rule)
print(comparison_df.to_string(index=False))
print(rule)

## 5. Visualization: Model Performance Comparison

In [None]:
# Compare BETH baseline vs enhanced models (if results available).
# Bar charts are drawn only for the long-format table, i.e. when the
# 'model_type' and 'metric' columns exist (scores are read from 'value').
# Any other layout is echoed as a plain table.
if beth_results is None:
    print("BETH results not available for visualization.")
elif not {'model_type', 'metric'}.issubset(beth_results.columns):
    # Simple format: columns are metrics. No figure is opened in this
    # branch, so the cell does not finish by displaying two blank axes.
    print("Displaying BETH results in table format (non-standard structure)")
    print(beth_results.to_string(index=False))
else:
    # One subplot per group of metrics
    metric_groups = [
        ['Detection Rate', 'False Positive Rate'],
        ['True Negative Rate', 'F1-Score'],
    ]
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    for ax, metric_group in zip(axes.flatten(), metric_groups):
        y_vals = []
        labels = []
        colors = []

        for metric in metric_group:
            metric_rows = beth_results[beth_results['metric'] == metric]
            baseline_val = metric_rows.loc[
                metric_rows['model_type'] == 'baseline', 'value'].values
            enhanced_val = metric_rows.loc[
                metric_rows['model_type'] == 'enhanced', 'value'].values

            # A metric is plotted only when both variants report it;
            # the first matching row of each variant is used.
            if len(baseline_val) > 0 and len(enhanced_val) > 0:
                labels.extend([f'{metric}\n(Baseline)', f'{metric}\n(Enhanced)'])
                y_vals.extend([baseline_val[0], enhanced_val[0]])
                colors.extend(['steelblue', 'darkorange'])

        if not y_vals:
            # Nothing to draw for this group: hide the axes instead of
            # leaving an empty, untitled frame in the figure.
            ax.set_visible(False)
            continue

        x_pos = list(range(len(y_vals)))
        bars = ax.bar(x_pos, y_vals, color=colors, alpha=0.7, edgecolor='black')
        ax.set_xticks(x_pos)
        ax.set_xticklabels(labels, fontsize=9)
        ax.set_ylabel('Score', fontsize=11, fontweight='bold')
        ax.set_title(f'BETH - {" & ".join(metric_group)}', fontsize=12, fontweight='bold')
        ax.set_ylim([0, 1.0])
        ax.grid(axis='y', alpha=0.3)

        # Add value labels just above each bar
        for bar, val in zip(bars, y_vals):
            ax.text(bar.get_x() + bar.get_width()/2, val + 0.02,
                    f'{val:.3f}', ha='center', fontsize=9, fontweight='bold')

    plt.tight_layout()
    plt.suptitle('BETH Dataset - Baseline vs Enhanced Model Performance',
                 fontsize=14, fontweight='bold', y=1.02)
    plt.show()

In [None]:
# Compare UNSW-NB15 supervised models: one bar chart per metric column
# that exists in the results table (requires the 'model' column)
if unsw_results is None or 'model' not in unsw_results.columns:
    print("UNSW results not available for visualization.")
else:
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    axes = axes.flatten()

    metrics = ['accuracy', 'precision', 'recall', 'f1_score']
    titles = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
    colors = ['steelblue', 'darkorange', 'forestgreen', 'purple']

    models = unsw_results['model'].values
    positions = range(len(models))

    for ax, metric, title, color in zip(axes, metrics, titles, colors):
        # Metrics missing from the table leave their subplot empty
        if metric not in unsw_results.columns:
            continue

        values = unsw_results[metric].values
        bars = ax.bar(positions, values, color=color, alpha=0.7, edgecolor='black')
        ax.set_xticks(positions)
        ax.set_xticklabels(models, rotation=15, ha='right')
        ax.set_ylabel(title, fontsize=11, fontweight='bold')
        ax.set_title(f'UNSW-NB15 - {title}', fontsize=12, fontweight='bold')
        ax.set_ylim([0, 1.0])
        ax.grid(axis='y', alpha=0.3)

        # Add value labels above each bar
        for position, val in enumerate(values):
            ax.text(position, val + 0.02, f'{val:.3f}', ha='center', fontsize=9, fontweight='bold')

    plt.tight_layout()
    plt.suptitle('UNSW-NB15 Dataset - Supervised Model Performance',
                 fontsize=14, fontweight='bold', y=1.0)
    plt.show()

In [None]:
# ROC-AUC comparison for UNSW models (needs both 'model' and 'roc_auc')
if unsw_results is not None and {'model', 'roc_auc'}.issubset(unsw_results.columns):
    fig, ax = plt.subplots(figsize=(10, 6))

    models = unsw_results['model'].values
    roc_aucs = unsw_results['roc_auc'].values

    colors = ['steelblue', 'darkorange', 'forestgreen']
    bars = ax.barh(models, roc_aucs, color=colors[:len(models)], alpha=0.7, edgecolor='black', height=0.6)

    ax.set_xlabel('ROC-AUC Score', fontsize=12, fontweight='bold')
    ax.set_title('UNSW-NB15 - ROC-AUC Comparison', fontsize=14, fontweight='bold', pad=15)

    # Zoom in to see differences. The window starts at 0.9 by default but
    # is widened when a model scores lower, so its bar stays visible.
    lowest_auc = unsw_results['roc_auc'].min()
    if pd.isna(lowest_auc):
        x_min = 0.9
    else:
        x_min = max(0.0, min(0.9, lowest_auc - 0.01))
    ax.set_xlim([x_min, 1.0])
    ax.grid(axis='x', alpha=0.3)

    # Add value labels to the right of each bar
    for row, val in enumerate(roc_aucs):
        ax.text(val + 0.002, row, f'{val:.4f}', va='center', fontsize=10, fontweight='bold')

    # Add reference line at 0.95
    ax.axvline(x=0.95, color='red', linestyle='--', linewidth=1, alpha=0.5, label='0.95 threshold')
    ax.legend()

    plt.tight_layout()
    plt.show()
else:
    print("ROC-AUC data not available for visualization.")

## 6. Feature Engineering Impact Analysis

In [None]:
# Analyze consensus features: names appearing in the first 20 rows of
# both the Random Forest and the XGBoost importance tables
if rf_importances is None or xgb_importances is None:
    print("Feature importance data not available for analysis.")
else:
    rule = "=" * 80

    # Get top 20 features from each
    rf_top_features = set(rf_importances['feature'].head(20))
    xgb_top_features = set(xgb_importances['feature'].head(20))

    # Find overlap
    consensus_features = rf_top_features.intersection(xgb_top_features)

    print("\n" + rule)
    print("FEATURE IMPORTANCE CONSENSUS (Random Forest & XGBoost)")
    print(rule)
    print(f"\nTotal consensus features (in top 20 of both): {len(consensus_features)}")
    print("\nConsensus Features:")
    for rank, feat in enumerate(sorted(consensus_features), start=1):
        print(f"  {rank:2d}. {feat}")

    # A feature counts as engineered when its name contains any of these
    # substrings
    engineered_patterns = ['_sum', '_diff', '_ratio', 'both_zero', 'one_zero']
    engineered_consensus = []
    for feat in consensus_features:
        if any(pattern in feat for pattern in engineered_patterns):
            engineered_consensus.append(feat)

    print(f"\nEngineered features in consensus: {len(engineered_consensus)}")
    if engineered_consensus:
        print("\nCritical Engineered Features:")
        for rank, feat in enumerate(sorted(engineered_consensus), start=1):
            print(f"  {rank:2d}. {feat}")

    print("\n" + rule)
    print("\n[KEY INSIGHT] Feature engineering impact:")
    # Share of consensus features that are engineered; 0 when there is no overlap
    if consensus_features:
        impact_pct = len(engineered_consensus) / len(consensus_features) * 100
    else:
        impact_pct = 0
    print(f"  {impact_pct:.1f}% of consensus important features are engineered features")
    print("  This demonstrates the critical role of domain-informed feature creation.")
    print(rule)

## 6.1 Feature Selection Impact

In [None]:
# Analyze feature selection impact (using already loaded data)

# An ROC-AUC drop of at most this size is reported as negligible. This is
# a reporting tolerance chosen for this summary; adjust it as needed.
NEGLIGIBLE_AUC_DROP = 0.005


def _feature_set_value(feature_set, column):
    """Return `column` from the first performance row matching `feature_set`.

    Returns None when no row has that 'feature_set' value.
    """
    matches = unsw_feature_selection_performance.loc[
        unsw_feature_selection_performance['feature_set'] == feature_set, column]
    return matches.iloc[0] if len(matches) > 0 else None


if all(v is not None for v in [unsw_selected_features, unsw_feature_selection_comparison, unsw_feature_selection_performance]):
    print("\n[Feature Selection Methods Applied]")
    print(unsw_feature_selection_comparison.to_string(index=False))

    print("\n[Performance Impact]")
    print(unsw_feature_selection_performance.to_string(index=False))

    # Look up the two rows being compared
    original_features = _feature_set_value('all_features', 'n_features')
    selected_features_count = _feature_set_value('selected_features', 'n_features')
    auc_all = _feature_set_value('all_features', 'roc_auc')
    auc_selected = _feature_set_value('selected_features', 'roc_auc')

    print("\n[KEY INSIGHT] Feature Selection Impact:")
    if any(v is None for v in (original_features, selected_features_count, auc_all, auc_selected)):
        # Without both rows there is nothing to compare
        print("  Rows for 'all_features' and 'selected_features' were not both found; "
              "skipping the impact summary.")
    else:
        # Calculate dimensionality reduction (0% if the original count is 0)
        if original_features:
            reduction_pct = (1 - selected_features_count / original_features) * 100
        else:
            reduction_pct = 0.0

        print(f"  Original features: {original_features}")
        print(f"  Selected features: {selected_features_count}")
        print(f"  Dimensionality reduction: {reduction_pct:.1f}%")

        # Performance delta: selected minus all features
        auc_delta = auc_selected - auc_all
        negligible_drop = auc_delta >= -NEGLIGIBLE_AUC_DROP

        if auc_delta >= 0:
            print(f"  Performance change: +{auc_delta:.4f} ROC-AUC (improved/maintained)")
        elif negligible_drop:
            print(f"  Performance change: {auc_delta:.4f} ROC-AUC (minimal impact)")
        else:
            print(f"  Performance change: {auc_delta:.4f} ROC-AUC (performance decreased)")

        # Only claim a negligible impact when the measured drop supports it
        if negligible_drop:
            print(f"  Result: Achieved {reduction_pct:.1f}% dimensionality reduction with negligible performance impact")
        else:
            print(f"  Result: Achieved {reduction_pct:.1f}% dimensionality reduction "
                  f"at a cost of {abs(auc_delta):.4f} ROC-AUC")

else:
    print("Feature selection results not available. Run notebook 03 with feature selection first.")

In [None]:
# Visualize feature selection impact
if unsw_feature_selection_comparison is not None and unsw_feature_selection_performance is not None:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    axes = axes.flatten()

    # Left plot: Features selected by each method
    ax = axes[0]
    methods = unsw_feature_selection_comparison['method'].values
    n_features = unsw_feature_selection_comparison['n_features_selected'].values
    colors = ['steelblue', 'darkorange', 'forestgreen']

    bars = ax.barh(methods, n_features, color=colors[:len(methods)],
                   alpha=0.8, edgecolor='black', linewidth=1.5)
    ax.set_xlabel('Number of Features Selected', fontsize=11, fontweight='bold')
    ax.set_title('Feature Selection by Method', fontsize=12, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)

    for i, val in enumerate(n_features):
        ax.text(val + 1, i, str(val), va='center', fontsize=10, fontweight='bold')

    # Right plot: Performance comparison
    ax = axes[1]
    roc_aucs = unsw_feature_selection_performance['roc_auc'].values

    # Label every bar from its own 'feature_set' value, so a label cannot
    # end up on the wrong bar if the CSV rows are ordered differently.
    display_names = {'all_features': 'All Features',
                     'selected_features': 'Selected Features'}
    if 'feature_set' in unsw_feature_selection_performance.columns:
        feature_sets = [display_names.get(key, str(key))
                        for key in unsw_feature_selection_performance['feature_set']]
    else:
        # No key column to read: fall back to fixed positional labels
        feature_sets = ['All Features', 'Selected Features']
    colors = ['steelblue', 'darkorange']

    x_pos = list(range(len(roc_aucs)))
    bars = ax.bar(x_pos, roc_aucs, color=colors,
                  alpha=0.8, edgecolor='black', linewidth=1.5)
    ax.set_xticks(x_pos)
    ax.set_xticklabels(feature_sets)
    ax.set_ylabel('ROC-AUC', fontsize=11, fontweight='bold')
    ax.set_title('Performance: All vs Selected Features', fontsize=12, fontweight='bold')
    # Start the y-axis just below the smallest score to show the gap
    ax.set_ylim([min(roc_aucs) - 0.005, 1.0])
    ax.grid(axis='y', alpha=0.3)

    for bar, val in zip(bars, roc_aucs):
        ax.text(bar.get_x() + bar.get_width()/2, val + 0.002,
                f'{val:.4f}', ha='center', fontsize=10, fontweight='bold')

    plt.suptitle('Feature Selection Impact on UNSW-NB15',
                 fontsize=14, fontweight='bold', y=1.02)
    plt.tight_layout()
    plt.show()
else:
    print("Feature selection visualizations not available")

In [None]:
# BETH Anomalous Arguments Analysis: print the first 10 rows and, when
# the needed columns exist, chart the first 15 as horizontal bars
if beth_anomalous_args is not None:
    print("\n" + "="*80)
    print("BETH ANOMALOUS ARGUMENTS ANALYSIS")
    print("="*80)

    print("\nTop 10 Most Anomalous Argument Patterns:")
    print(beth_anomalous_args.head(10).to_string(index=False))

    print("\n[KEY INSIGHT] Anomalous Argument Detection:")
    print("  - TF-IDF feature engineering successfully identified suspicious arguments")
    print("  - Rare/unique arguments are strong indicators of malicious behavior")
    print("  - These patterns can inform rule-based detection systems")

    # Pick the score column, preferring 'anomaly_score' over 'tfidf_score'
    score_col = next(
        (candidate for candidate in ('anomaly_score', 'tfidf_score')
         if candidate in beth_anomalous_args.columns),
        None,
    )

    # Open a figure only when both a score column and the 'argument'
    # label column exist, so no blank figure is left behind otherwise.
    if score_col is not None and 'argument' in beth_anomalous_args.columns:
        top_args = beth_anomalous_args.head(15)
        args = top_args['argument'].values
        scores = top_args[score_col].values

        fig, ax = plt.subplots(figsize=(12, 6))
        bars = ax.barh(range(len(args)), scores, color='crimson', alpha=0.7, edgecolor='black')
        ax.set_yticks(range(len(args)))
        ax.set_yticklabels(args, fontsize=9)
        ax.set_xlabel(score_col.replace('_', ' ').title(), fontsize=11, fontweight='bold')
        ax.set_title('Top 15 Most Anomalous Arguments (BETH)', fontsize=13, fontweight='bold')
        # Put the first row at the top of the chart
        ax.invert_yaxis()
        ax.grid(axis='x', alpha=0.3)

        plt.tight_layout()
        plt.show()

    print("="*80)
else:
    print("BETH anomalous arguments analysis not available.")

## 7. Hyperparameter Tuning Impact (XGBoost)

In [None]:
# Visualize hyperparameter tuning impact: two line plots (learning rate,
# max depth) and one horizontal bar chart (regularization configurations)
if any(table is None for table in (xgb_lr_tuning, xgb_depth_tuning, xgb_reg_tuning)):
    print("Hyperparameter tuning results not complete.")
else:
    fig, axes = plt.subplots(1, 3, figsize=(16, 5))
    axes = axes.flatten()

    # (table, x column, marker, color, x label, title) for the line plots
    line_specs = [
        (xgb_lr_tuning, 'learning_rate', 'o', 'steelblue',
         'Learning Rate', 'Learning Rate Impact'),
        (xgb_depth_tuning, 'max_depth', 's', 'darkorange',
         'Max Depth', 'Tree Depth Impact'),
    ]
    for line_ax, (table, x_col, marker, color, x_label, title) in zip(axes, line_specs):
        line_ax.plot(table[x_col], table['roc_auc'],
                     marker=marker, linewidth=2, markersize=8, color=color)
        line_ax.set_xlabel(x_label, fontsize=11, fontweight='bold')
        line_ax.set_ylabel('ROC-AUC', fontsize=11, fontweight='bold')
        line_ax.set_title(title, fontsize=12, fontweight='bold')
        line_ax.grid(True, alpha=0.3)

    # Regularization impact
    reg_ax = axes[2]
    configs = xgb_reg_tuning['configuration'].values
    roc_values = xgb_reg_tuning['roc_auc'].values
    bar_rows = range(len(configs))
    reg_ax.barh(bar_rows, roc_values, color='forestgreen', alpha=0.7, edgecolor='black')
    reg_ax.set_yticks(bar_rows)
    reg_ax.set_yticklabels(configs, fontsize=9)
    reg_ax.set_xlabel('ROC-AUC', fontsize=11, fontweight='bold')
    reg_ax.set_title('Regularization Impact', fontsize=12, fontweight='bold')
    reg_ax.grid(axis='x', alpha=0.3)

    # Add value labels
    for row, val in enumerate(roc_values):
        reg_ax.text(val + 0.001, row, f'{val:.4f}', va='center', fontsize=8)

    plt.tight_layout()
    plt.suptitle('XGBoost Hyperparameter Tuning Impact on UNSW-NB15',
                 fontsize=14, fontweight='bold', y=1.02)
    plt.show()

    # Print summary: spread (max - min) of ROC-AUC within each sweep
    lr_spread = xgb_lr_tuning['roc_auc'].max() - xgb_lr_tuning['roc_auc'].min()
    depth_spread = xgb_depth_tuning['roc_auc'].max() - xgb_depth_tuning['roc_auc'].min()
    reg_spread = xgb_reg_tuning['roc_auc'].max() - xgb_reg_tuning['roc_auc'].min()

    print("\n[HYPERPARAMETER TUNING INSIGHTS]")
    print(f"\nLearning Rate Range: {lr_spread:.4f}")
    print(f"Max Depth Range: {depth_spread:.4f}")
    print(f"Regularization Range: {reg_spread:.4f}")

## 7.1 Two-Stage Pipeline Analysis (Detection -> Classification)

Evaluate the realistic security operations workflow: first detect attacks, then classify attack types.


In [None]:
# Analyze two-stage pipeline results: stage 1 detection counts and
# accuracy, stage 2 per-attack-type metrics, and end-to-end accuracy
if unsw_two_stage is not None and unsw_stage2_performance is not None:
    print("\n" + "="*80)
    print("TWO-STAGE PIPELINE ANALYSIS")
    print("="*80)

    total_samples = len(unsw_two_stage)
    # Ratios below divide by total_samples, so they are skipped when the
    # predictions table has no rows.
    has_predictions = total_samples > 0

    # Stage 1 statistics
    print("\n[Stage 1: Attack Detection]")
    print(f"  Total test samples: {total_samples}")
    if has_predictions:
        detected_attacks = (unsw_two_stage['stage1_prediction'] == 'Attack').sum()
        detected_normal = total_samples - detected_attacks
        print(f"  Detected as Normal: {detected_normal} ({100*detected_normal/total_samples:.2f}%)")
        print(f"  Detected as Attack: {detected_attacks} ({100*detected_attacks/total_samples:.2f}%)")

        # Stage 1 is correct when it says 'Normal' for a 'Normal' label, or
        # 'Attack' for any label other than 'Normal'
        is_normal = unsw_two_stage['true_label'] == 'Normal'
        correct_normal = ((unsw_two_stage['stage1_prediction'] == 'Normal') & is_normal).sum()
        correct_attack = ((unsw_two_stage['stage1_prediction'] == 'Attack') & ~is_normal).sum()
        correct_stage1 = correct_normal + correct_attack
        stage1_accuracy = correct_stage1 / total_samples
        print(f"  Stage 1 Accuracy: {stage1_accuracy:.4f}")
    else:
        print("  No predictions to evaluate; stage 1 metrics skipped.")

    # Stage 2 statistics
    print("\n[Stage 2: Attack Type Classification]")
    print(f"  Attack types detected: {len(unsw_stage2_performance)}")
    print("\n  Performance by Attack Type:")
    print(unsw_stage2_performance.to_string(index=False))

    # Overall metrics: unweighted means over the rows of the stage 2 table
    avg_precision = unsw_stage2_performance['precision'].mean()
    avg_recall = unsw_stage2_performance['recall'].mean()
    avg_f1 = unsw_stage2_performance['f1_score'].mean()

    print(f"\n  Average Precision: {avg_precision:.4f}")
    print(f"  Average Recall: {avg_recall:.4f}")
    print(f"  Average F1-Score: {avg_f1:.4f}")

    # End-to-end accuracy: final prediction equals the true label
    print("\n[End-to-End Pipeline Performance]")
    if has_predictions:
        correct_e2e = (unsw_two_stage['true_label'] == unsw_two_stage['final_prediction']).sum()
        e2e_accuracy = correct_e2e / total_samples
        print(f"  Overall Accuracy: {e2e_accuracy:.4f}")
        print(f"  Stage 1 -> Stage 2 Success Rate: {e2e_accuracy:.2%}")
    else:
        print("  No predictions to evaluate; end-to-end metrics skipped.")

    print("\n[KEY INSIGHT] Two-Stage Pipeline Benefits:")
    print("  - Realistic security operations workflow")
    print("  - Stage 1 filters normal traffic (efficiency)")
    print("  - Stage 2 provides actionable attack type intelligence")
    print("  - Modular design allows independent optimization of each stage")

    print("="*80)
else:
    print("Two-stage pipeline results not available. Run notebook 03 Section 10 first.")

In [None]:
# Visualize two-stage pipeline performance: F1 per attack type on the
# left, grouped precision/recall bars on the right
if unsw_stage2_performance is None:
    print("Two-stage pipeline visualizations not available")
else:
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    axes = axes.flatten()

    attack_types = unsw_stage2_performance['attack_type'].values
    f1_scores = unsw_stage2_performance['f1_score'].values
    precision = unsw_stage2_performance['precision'].values
    recall = unsw_stage2_performance['recall'].values
    x = range(len(attack_types))

    # Plot 1: Attack type performance
    ax = axes[0]
    bars = ax.barh(x, f1_scores, color='steelblue', alpha=0.7, edgecolor='black')
    ax.set_yticks(x)
    ax.set_yticklabels(attack_types, fontsize=10)
    ax.set_xlabel('F1-Score', fontsize=11, fontweight='bold')
    ax.set_title('Stage 2: Attack Type Classification Performance', fontsize=12, fontweight='bold')
    ax.set_xlim([0, 1.0])
    ax.grid(axis='x', alpha=0.3)

    # Add value labels
    for row, val in enumerate(f1_scores):
        ax.text(val + 0.02, row, f'{val:.3f}', va='center', fontsize=9)

    # Plot 2: Precision/Recall comparison
    ax = axes[1]
    width = 0.35
    # Shift the two series half a bar width either side of each row
    precision_rows = [row - width/2 for row in x]
    recall_rows = [row + width/2 for row in x]

    bars1 = ax.barh(precision_rows, precision, width,
                    label='Precision', color='darkorange', alpha=0.7, edgecolor='black')
    bars2 = ax.barh(recall_rows, recall, width,
                    label='Recall', color='forestgreen', alpha=0.7, edgecolor='black')

    ax.set_yticks(x)
    ax.set_yticklabels(attack_types, fontsize=10)
    ax.set_xlabel('Score', fontsize=11, fontweight='bold')
    ax.set_title('Stage 2: Precision vs Recall by Attack Type', fontsize=12, fontweight='bold')
    ax.set_xlim([0, 1.0])
    ax.legend()
    ax.grid(axis='x', alpha=0.3)

    plt.tight_layout()
    plt.suptitle('Two-Stage Pipeline: Attack Type Classification Performance',
                 fontsize=14, fontweight='bold', y=1.02)
    plt.show()

## 8. Key Findings Summary

In [None]:
# Print a plain-text summary of the key findings from both analyses.
# Every section degrades gracefully: when a result table is None (or lacks the
# rows/columns a statistic needs) a placeholder line is printed instead of
# raising, so the rest of the report is still produced.
print("\n" + "="*100)
print("KEY FINDINGS SUMMARY")
print("="*100)

print("\n[1. UNSUPERVISED ANOMALY DETECTION (BETH)]")
if beth_results is not None:
    print("\n    Baseline vs Enhanced Comparison:")

    # Long ("pivot") format: one row per (model_type, metric) with the number
    # in 'value'. All three columns are read below, so all three are checked.
    if {'model_type', 'metric', 'value'}.issubset(beth_results.columns):
        for heading, model_type in (
            ("      Baseline Model Performance:", 'baseline'),
            ("\n      Enhanced Model Performance:", 'enhanced'),
        ):
            print(heading)
            model_rows = beth_results[beth_results['model_type'] == model_type]
            for metric_name, metric_value in zip(model_rows['metric'], model_rows['value']):
                print(f"        - {metric_name}: {metric_value:.4f}")
    else:
        print("      Results:")
        print(beth_results.to_string(index=False))

    print("\n    Key Insights:")
    print("      - TF-IDF feature engineering enabled effective anomaly detection")
    print("      - Baseline vs Enhanced comparison shows impact of feature selection")
    print("      - Suspicious vs Evil detection demonstrates model versatility")
    print("      - Anomalous argument patterns identified security-relevant features")
else:
    print("    [Results not available - run notebook 02 first]")

print("\n" + "-"*100)

print("\n[2. SUPERVISED ATTACK CLASSIFICATION (UNSW-NB15)]")
if unsw_results is not None:
    # Valid (non-NaN) ROC-AUC scores; empty when the column is absent.
    if 'roc_auc' in unsw_results.columns:
        roc_auc_scores = unsw_results['roc_auc'].dropna()
    else:
        roc_auc_scores = None
    has_roc_auc_scores = roc_auc_scores is not None and not roc_auc_scores.empty

    print("\n    Best Models:")
    # idxmax() raises on an empty or all-NaN column, so only rank the models
    # when at least one valid score exists.
    if has_roc_auc_scores:
        best_model = unsw_results.loc[roc_auc_scores.idxmax()]
        print(f"      - Overall Best: {best_model['model']}")
        print(f"        ROC-AUC: {best_model['roc_auc']:.4f}")
        if 'f1_score' in best_model:
            print(f"        F1-Score: {best_model['f1_score']:.4f}")

    print("\n    Key Insights:")
    print("      - Ensemble methods (RF, XGBoost) outperformed linear models")
    print("      - Feature selection reduced dimensionality while maintaining performance")
    print("      - Hyperparameter tuning provided measurable improvements")
    # Only state the ">95%" claim when the loaded scores actually support it;
    # otherwise report the observed range.
    # NOTE(review): assumes roc_auc is stored as a 0-1 fraction - confirm.
    if has_roc_auc_scores:
        lowest_auc = roc_auc_scores.min()
        if lowest_auc > 0.95:
            print("      - All models achieved >95% ROC-AUC on attack detection")
        else:
            print(f"      - ROC-AUC ranged from {lowest_auc:.4f} to "
                  f"{roc_auc_scores.max():.4f} across models")

    if unsw_two_stage is not None:
        print("\n    Two-Stage Pipeline:")
        print("      - Stage 1 (Detection): High accuracy binary classification")
        print("      - Stage 2 (Classification): Attack type identification for incidents")
        print("      - End-to-end workflow mirrors real security operations")
else:
    print("    [Results not available - run notebook 03 first]")

print("\n" + "-"*100)

print("\n[3. FEATURE ENGINEERING IMPACT]")
print("\n    BETH Dataset:")
print("      - TF-IDF encoding captured argument importance")
print("      - Feature selection identified top 50 most discriminative features")
print("      - Anomalous argument analysis revealed attack patterns")

print("\n    UNSW-NB15 Dataset:")
if rf_importances is not None and xgb_importances is not None:
    # Consensus = features ranked in the first 20 rows of both importance tables.
    rf_top = set(rf_importances['feature'].head(20))
    xgb_top = set(xgb_importances['feature'].head(20))
    consensus = rf_top & xgb_top
    # A feature counts as "engineered" when its name contains one of these substrings.
    engineered_markers = ('_sum', '_diff', '_ratio', 'zero')
    engineered = [name for name in consensus
                  if any(marker in name for marker in engineered_markers)]
    print(f"      - {len(consensus)} consensus features across RF and XGBoost")
    print(f"      - {len(engineered)} engineered features in top consensus")
    if consensus:
        print(f"      - {len(engineered)/len(consensus)*100:.1f}% of important features are engineered")
else:
    print("      - Engineered features showed high importance in models")

if unsw_feature_selection_performance is not None:
    fs_perf = unsw_feature_selection_performance
    selected_counts = fs_perf.loc[fs_perf['feature_set'] == 'selected_features', 'n_features']
    all_counts = fs_perf.loc[fs_perf['feature_set'] == 'all_features', 'n_features']
    # Both rows must exist and the baseline count must be positive; otherwise
    # the ratio is undefined (IndexError / division by zero in a naive lookup).
    if (not selected_counts.empty and not all_counts.empty
            and pd.notna(selected_counts.iloc[0]) and all_counts.iloc[0] > 0):
        reduction = (1 - selected_counts.iloc[0] / all_counts.iloc[0]) * 100
        print(f"      - Feature selection achieved {reduction:.1f}% dimensionality reduction")
    else:
        print("      - Feature selection reduction not available (feature-set rows missing)")

print("\n" + "-"*100)

print("\n[4. MODEL SELECTION RECOMMENDATIONS]")
print("\n    For Anomaly Detection (Unlabeled Data):")
print("      - Use K-Means with enhanced features for baseline detection")
print("      - TF-IDF feature engineering critical for text-based features")
print("      - Feature selection improves interpretability and performance")

print("\n    For Attack Classification (Labeled Data):")
print("      - XGBoost recommended for best overall performance")
print("      - Random Forest for interpretability and feature importance")
print("      - Two-stage pipeline for realistic security operations workflow")
print("      - Feature selection maintains performance with fewer features")

print("\n" + "="*100)
print("\n[CONCLUSION]")
print("\nBoth unsupervised and supervised approaches successfully detected network")
print("security threats. The BETH analysis demonstrated TF-IDF feature engineering's")
print("effectiveness for anomaly detection, while UNSW-NB15 showed that supervised")
print("ensemble methods with feature selection achieve excellent attack classification.")
print("The two-stage pipeline provides a realistic security operations workflow,")
print("first detecting attacks, then classifying types for incident response.")
print("\n" + "="*100)

## 9. Export Comparison Results

In [None]:
# Export the cross-dataset comparison table and a one-row CSV of headline
# statistics. Each statistic is only computed when its source table is loaded;
# statistics whose inputs are missing or degenerate are stored as None rather
# than aborting the export.

# Make sure the output directory exists so the CSV writes below cannot fail
# just because the directory has not been created yet.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Save comparison dataframe
comparison_df.to_csv(RESULTS_DIR / 'cross_dataset_comparison.csv', index=False)
print("[SAVED] Cross-dataset comparison to results/cross_dataset_comparison.csv")

# Create summary statistics file
summary_stats = {}

if beth_results is not None:
    # Handle pivot format (model_type, metric, value) vs simple format
    if {'model_type', 'metric', 'value'}.issubset(beth_results.columns):
        enhanced_rows = beth_results[beth_results['model_type'] == 'enhanced']
        for stat_key, metric_name in (
            ('beth_enhanced_detection_rate', 'Detection Rate'),
            ('beth_enhanced_fpr', 'False Positive Rate'),
        ):
            metric_values = enhanced_rows.loc[enhanced_rows['metric'] == metric_name, 'value']
            # First matching row wins; None when the metric was not recorded.
            summary_stats[stat_key] = metric_values.iloc[0] if not metric_values.empty else None
    elif 'detection_rate' in beth_results.columns:
        summary_stats['beth_best_detection_rate'] = beth_results['detection_rate'].max()
        summary_stats['beth_best_fpr'] = beth_results['fpr'].min() if 'fpr' in beth_results.columns else None

if unsw_results is not None:
    has_roc_auc = 'roc_auc' in unsw_results.columns
    summary_stats['unsw_best_roc_auc'] = unsw_results['roc_auc'].max() if has_roc_auc else None
    summary_stats['unsw_best_f1'] = unsw_results['f1_score'].max() if 'f1_score' in unsw_results.columns else None
    # idxmax() raises on an empty or all-NaN column, so only look up the best
    # model when at least one valid ROC-AUC score exists.
    if has_roc_auc and unsw_results['roc_auc'].notna().any():
        summary_stats['unsw_best_model'] = unsw_results.loc[unsw_results['roc_auc'].idxmax(), 'model']
    else:
        summary_stats['unsw_best_model'] = None

if unsw_feature_selection_performance is not None:
    fs_perf = unsw_feature_selection_performance
    selected_counts = fs_perf.loc[fs_perf['feature_set'] == 'selected_features', 'n_features']
    all_counts = fs_perf.loc[fs_perf['feature_set'] == 'all_features', 'n_features']
    # Both rows must exist and the baseline count must be positive; otherwise
    # the reduction is undefined and is recorded as None.
    if (not selected_counts.empty and not all_counts.empty
            and pd.notna(selected_counts.iloc[0]) and all_counts.iloc[0] > 0):
        reduction_pct = (1 - selected_counts.iloc[0] / all_counts.iloc[0]) * 100
    else:
        reduction_pct = None
    summary_stats['feature_selection_dimensionality_reduction_pct'] = reduction_pct

if unsw_two_stage is not None:
    total = len(unsw_two_stage)
    correct = (unsw_two_stage['true_label'] == unsw_two_stage['final_prediction']).sum()
    # Accuracy is undefined for an empty prediction table.
    summary_stats['two_stage_pipeline_accuracy'] = correct / total if total else None

# Save summary stats
summary_df = pd.DataFrame([summary_stats])
summary_df.to_csv(RESULTS_DIR / 'comparison_summary_statistics.csv', index=False)
print("[SAVED] Comparison summary statistics to results/comparison_summary_statistics.csv")

print("\nAll comparison results saved!")