# AI Firewall for Smart City Attacks


## Cell 1: Environment Setup and Data Loading

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import glob
import os
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

def load_ns3_data(dataset_path="/home/saim/ns3-mmwave-fresh/ML/Dataset/"):
    """Load NS-3 CSV files from your dataset folder"""
    
    # Get all CSV files
    csv_files = glob.glob(dataset_path + "*.csv")
    
    if not csv_files:
        print(f"No CSV files found in {dataset_path}")
        return None
    
    print(f"Found {len(csv_files)} CSV files:")
    for file in csv_files:
        print(f"  - {os.path.basename(file)}")
    
    # Load and combine all data
    all_dataframes = []
    
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            filename = os.path.basename(file)
            df['SourceFile'] = filename
            
            # Determine if it's an attack file based on filename
            attack_indicators = ['attack', 'malicious', 'ddos', 'apt', 'ransomware', 'botnet', 
                               'medical-enhanced', 'finance-enhanced', 'grid-enhanced', 
                               'portscan-enhanced', 'recon-enhanced', 'supply-enhanced',
                               'quantum-enhanced', 'blockchain-enhanced', 'edge-enhanced',
                               'gpsspoof-enhanced', 'mitm6g-enhanced', 'sidechannel-enhanced',
                               'slicing-enhanced', 'mlpoison-enhanced']
            
            is_attack_file = any(indicator in filename.lower() for indicator in attack_indicators)
            
            if 'Label' not in df.columns:
                # If no Label column, infer from filename and traffic characteristics
                if is_attack_file:
                    df['Label'] = 1  # Attack
                else:
                    df['Label'] = 0  # Normal
            
            all_dataframes.append(df)
            print(f"Loaded {len(df)} flows from {filename} (Attack file: {is_attack_file})")
            
        except Exception as e:
            print(f"Error loading {file}: {e}")
    
    if all_dataframes:
        combined_data = pd.concat(all_dataframes, ignore_index=True)
        return combined_data
    else:
        return None

def analyze_dataset_structure(df):
    """Analyze the structure and quality of the dataset"""
    
    print(" DATASET STRUCTURE ANALYSIS")
    print(f"Total flows: {len(df)}")
    print(f"Features: {len(df.columns)}")
    print(f"Columns: {list(df.columns)}")
    
    # Label distribution
    if 'Label' in df.columns:
        label_counts = df['Label'].value_counts()
        print(f"\nLabel Distribution:")
        print(f"  Normal (0): {label_counts.get(0, 0)} ({label_counts.get(0, 0)/len(df)*100:.1f}%)")
        print(f"  Attack (1): {label_counts.get(1, 0)} ({label_counts.get(1, 0)/len(df)*100:.1f}%)")
    
    # District distribution
    if 'District' in df.columns:
        print(f"\nDistrict Distribution:")
        district_counts = df['District'].value_counts()
        for district, count in district_counts.items():
            print(f"  {district}: {count} ({count/len(df)*100:.1f}%)")
    
    # Traffic type distribution  
    if 'TrafficType' in df.columns:
        print(f"\nTraffic Type Distribution:")
        traffic_counts = df['TrafficType'].value_counts()
        for traffic_type, count in traffic_counts.head(10).items():
            print(f"  {traffic_type}: {count} ({count/len(df)*100:.1f}%)")
    
    # Missing values
    missing_data = df.isnull().sum()
    if missing_data.any():
        print(f"\nMissing Values:")
        for col, missing_count in missing_data[missing_data > 0].items():
            print(f"  {col}: {missing_count} ({missing_count/len(df)*100:.1f}%)")
    else:
        print("\nNo missing values found")
    
    # Basic statistics for key numerical features
    numerical_cols = ['TxPackets', 'RxPackets', 'TxBytes', 'RxBytes', 
                     'Duration', 'Throughput', 'PacketLoss', 'Delay', 'Jitter']
    available_numerical = [col for col in numerical_cols if col in df.columns]
    
    if available_numerical:
        print(f"\nNumerical Features Statistics:")
        print(df[available_numerical].describe())
    
    return df

# Load the actual data
print(" LOADING NS-3 SIMULATION DATA")
data = load_ns3_data()

if data is not None:
    # Analyze dataset structure
    data = analyze_dataset_structure(data)
else:
    print("Failed to load data. Please check the dataset path.")

 LOADING NS-3 SIMULATION DATA
Found 19 CSV files:
  - mixed-enhanced-flows.csv
  - medical-enhanced-flows.csv
  - mitm6g-enhanced-flows.csv
  - recon-enhanced-flows.csv
  - portscan-enhanced-flows.csv
  - ddos-enhanced-flows.csv
  - grid-enhanced-flows.csv
  - finance-enhanced-flows.csv
  - sidechannel-enhanced-flows.csv
  - botnet-enhanced-flows.csv
  - supply-enhanced-flows.csv
  - quantum-enhanced-flows.csv
  - slicing-enhanced-flows.csv
  - mlpoison-enhanced-flows.csv
  - ransomware-enhanced-flows.csv
  - blockchain-enhanced-flows.csv
  - normal-enhanced-flows.csv
  - gpsspoof-enhanced-flows.csv
  - edge-enhanced-flows.csv
Loaded 129 flows from mixed-enhanced-flows.csv (Attack file: False)
Loaded 21 flows from medical-enhanced-flows.csv (Attack file: True)
Loaded 21 flows from mitm6g-enhanced-flows.csv (Attack file: True)
Loaded 30 flows from recon-enhanced-flows.csv (Attack file: True)
Loaded 27 flows from portscan-enhanced-flows.csv (Attack file: True)
Loaded 24 flows from ddos-e

## Cell 2: Data Preprocessing and Feature Engineering

In [18]:
def preprocess_for_service_compatibility(df):
    """Preprocess data to match EXACTLY what the ML service expects (16 features)"""
    
    print("PREPROCESSING DATA FOR SERVICE COMPATIBILITY")
    print(f"Input data shape: {df.shape}")
    
    # Handle missing values and infinite values
    df_clean = df.copy()
    df_clean = df_clean.replace([np.inf, -np.inf], np.nan)
    df_clean = df_clean.fillna(0)
    
    # Create feature matrix exactly as ML service does
    feature_matrix = []
    skipped_rows = 0
    
    for idx, row in df_clean.iterrows():
        try:
            # Core 9 features
            features = [
                float(row['TxPackets']), float(row['RxPackets']), float(row['TxBytes']),
                float(row['RxBytes']), float(row['Duration']), float(row['Throughput']),
                float(row['PacketLoss']), float(row['Delay']), float(row['Jitter'])
            ]
            
            # Derived features 
            features.extend([
                float(row['RxPackets']) / (float(row['TxPackets']) + 1e-6),  # PacketDeliveryRatio
                float(row['RxBytes']) / (float(row['TxBytes']) + 1e-6),      # ByteDeliveryRatio
                float(row['TxBytes']) / (float(row['TxPackets']) + 1e-6),    # AvgPacketSize
                float(row['Throughput']) / (float(row['Duration']) + 1e-6),  # ThroughputEfficiency
                1 if float(row['DstPort']) <= 1023 else 0,                   # IsWellKnownPort
                1 if float(row['DstPort']) in [80, 443, 8080] else 0,        # IsWebPort
                1 if float(row['DstPort']) in [31337, 12345] else 0          # IsSuspiciousPort
            ])
            
            # Verify we have exactly 16 features
            if len(features) != 16:
                print(f"Warning: Row {idx} has {len(features)} features instead of 16")
                continue
                
            feature_matrix.append(features)
            
        except (ValueError, KeyError) as e:
            skipped_rows += 1
            if skipped_rows <= 5:  # Only show first 5 errors
                print(f"Skipped row {idx}: {e}")
            continue
    
    if skipped_rows > 0:
        print(f"Skipped {skipped_rows} rows due to processing errors")
    
    # Convert to numpy array
    X = np.array(feature_matrix)
    
    # Get labels if available
    y = None
    if 'Label' in df_clean.columns:
        # Only include labels for successfully processed rows
        valid_indices = [i for i in range(len(df_clean)) if i < len(feature_matrix)]
        y = df_clean.iloc[valid_indices]['Label'].values
    
    print(f"Final feature matrix shape: {X.shape}")
    print(f"Features per sample: {X.shape[1]} (should be 16)")
    
    if y is not None:
        print(f"Labels shape: {y.shape}")
        print(f"Attack samples: {sum(y)} ({sum(y)/len(y)*100:.1f}%)")
        print(f"Normal samples: {sum(y==0)} ({sum(y==0)/len(y)*100:.1f}%)")
    
    # Feature statistics
    print("\nFeature Statistics:")
    feature_names = [
        'TxPackets', 'RxPackets', 'TxBytes', 'RxBytes', 'Duration', 
        'Throughput', 'PacketLoss', 'Delay', 'Jitter',
        'PacketDeliveryRatio', 'ByteDeliveryRatio', 'AvgPacketSize', 
        'ThroughputEfficiency', 'IsWellKnownPort', 'IsWebPort', 'IsSuspiciousPort'
    ]
    
    for i, name in enumerate(feature_names):
        if X.shape[1] > i:
            print(f"  {name}: mean={X[:, i].mean():.4f}, std={X[:, i].std():.4f}, range=[{X[:, i].min():.4f}, {X[:, i].max():.4f}]")
    
    return X, y, feature_names

# Preprocess the data
if data is not None:
    print("\n" + "="*80)
    print("DATA PREPROCESSING - SERVICE COMPATIBLE")
    print("="*80)
    
    X_raw, y_true, feature_names = preprocess_for_service_compatibility(data)
    
    if X_raw.shape[1] != 16:
        print(f"ERROR: Expected 16 features, got {X_raw.shape[1]}")
        print("This will cause mismatch with ML service!")
    else:
        print("✓ Feature matrix has correct 16 features for service compatibility")
    
    # Feature scaling
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_raw)
    
    print(f"\nScaled feature matrix shape: {X_scaled.shape}")
    print(f"Scaling completed - features normalized for model training")
    
    # Data split for proper training/testing
    if y_true is not None:
        print(f"\nData Distribution:")
        print(f"Total flows: {len(y_true)}")
        print(f"Normal flows: {sum(y_true==0)} ({sum(y_true==0)/len(y_true)*100:.1f}%)")
        print(f"Attack flows: {sum(y_true==1)} ({sum(y_true==1)/len(y_true)*100:.1f}%)")
        
        # Calculate actual contamination rate
        contamination_rate = sum(y_true) / len(y_true)
        print(f"Actual contamination rate: {contamination_rate:.3f}")
    else:
        contamination_rate = 0.1
        print("No labels available - using default contamination rate: 0.1")


DATA PREPROCESSING - SERVICE COMPATIBLE
PREPROCESSING DATA FOR SERVICE COMPATIBILITY
Input data shape: (510, 19)
Final feature matrix shape: (510, 16)
Features per sample: 16 (should be 16)
Labels shape: (510,)
Attack samples: 225 (44.1%)
Normal samples: 285 (55.9%)

Feature Statistics:
  TxPackets: mean=563.7804, std=517.6342, range=[2.0000, 2000.0000]
  RxPackets: mean=356.5098, std=540.8774, range=[0.0000, 1800.0000]
  TxBytes: mean=361390.2431, std=547355.3819, range=[120.0000, 2570400.0000]
  RxBytes: mean=189875.6078, std=493858.3753, range=[0.0000, 2570400.0000]
  Duration: mean=4.8381, std=75.8481, range=[-110.0000, 135.0010]
  Throughput: mean=33469.4581, std=83546.7367, range=[0.0000, 843151.0000]
  PacketLoss: mean=0.5240, std=0.4958, range=[0.0000, 1.0000]
  Delay: mean=0.0310, std=0.1783, range=[0.0000, 1.3373]
  Jitter: mean=0.0011, std=0.0019, range=[0.0000, 0.0107]
  PacketDeliveryRatio: mean=0.4760, std=0.4958, range=[0.0000, 1.0000]
  ByteDeliveryRatio: mean=0.4760, 

## Cell 3: Isolation Forest Training and Detailed Analysis

In [None]:
# Required imports
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                           f1_score, confusion_matrix)

def train_isolation_forest(X_scaled, y_true, contamination_rate, test_size=0.3, random_state=42):
    """Train Isolation Forest model"""
    
    # Split data 
    if y_true is not None and len(np.unique(y_true)) > 1:
        stratify_param = y_true
    else:
        stratify_param = None
    
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y_true, test_size=test_size, random_state=random_state, 
        stratify=stratify_param
    )
    
    print("ISOLATION FOREST TRAINING")
    print(f"Training data shape: {X_train.shape}")
    print(f"Contamination rate: {contamination_rate}")
    print(f"Number of estimators: 200")
    
    # Train model
    iso_model = IsolationForest(
        contamination=contamination_rate,
        n_estimators=200,
        max_samples='auto',
        max_features=1.0,
        bootstrap=False,
        random_state=random_state,
        n_jobs=-1
    )
    
    iso_model.fit(X_train)
    print("Isolation Forest training completed")
    print(f"Model uses {iso_model.n_features_in_} features")
    
    return iso_model, X_train, X_test, y_train, y_test

def analyze_isolation_forest(model, X_test, y_test):
    """Analyze Isolation Forest predictions"""
    
    print("ISOLATION FOREST PREDICTION ANALYSIS")
    
    # Get predictions and scores
    iso_scores = model.decision_function(X_test)
    iso_pred_raw = model.predict(X_test)
    iso_pred = (iso_pred_raw == -1).astype(int)
    
    print(f"Predictions completed for {len(iso_pred)} flows")
    print(f"Detected anomalies: {sum(iso_pred)} ({sum(iso_pred)/len(iso_pred)*100:.1f}%)")
    
    # Score statistics
    print("ISO Score Statistics")
    print(f"Score range: [{iso_scores.min():.4f}, {iso_scores.max():.4f}]")
    print(f"Mean score: {iso_scores.mean():.4f}")
    print(f"Standard deviation: {iso_scores.std():.4f}")
    
    # Initialize variables to avoid UnboundLocalError
    best_threshold = 0.0
    iso_pred_optimal = iso_pred
    accuracy = precision = recall = f1 = None
    
    # Score distribution by label
    if y_test is not None:
        normal_scores = iso_scores[y_test == 0]
        attack_scores = iso_scores[y_test == 1]
        
        print("Score Distribution by True Label")
        print(f"Normal traffic scores:")
        print(f"  Mean: {normal_scores.mean():.4f}, Std: {normal_scores.std():.4f}")
        print(f"  Range: [{normal_scores.min():.4f}, {normal_scores.max():.4f}]")
        
        print(f"Attack traffic scores:")
        print(f"  Mean: {attack_scores.mean():.4f}, Std: {attack_scores.std():.4f}")
        print(f"  Range: [{attack_scores.min():.4f}, {attack_scores.max():.4f}]")
        
        # Find optimal threshold
        thresholds = np.percentile(iso_scores, range(5, 50, 5))
        best_f1 = 0
        best_threshold = thresholds[0]  # Initialize with first threshold
        
        for threshold in thresholds:
            pred_threshold = (iso_scores <= threshold).astype(int)
            f1_temp = f1_score(y_test, pred_threshold, zero_division=0)
            if f1_temp > best_f1:
                best_f1 = f1_temp
                best_threshold = threshold
        
        # Use optimal threshold
        iso_pred_optimal = (iso_scores <= best_threshold).astype(int)
        
        # Performance metrics
        accuracy = accuracy_score(y_test, iso_pred_optimal)
        precision = precision_score(y_test, iso_pred_optimal, zero_division=0)
        recall = recall_score(y_test, iso_pred_optimal, zero_division=0)
        f1 = f1_score(y_test, iso_pred_optimal, zero_division=0)
        
        print("Performance Metrics")
        print(f"Accuracy:  {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall:    {recall:.4f}")
        print(f"F1-Score:  {f1:.4f}")
        
        # Confusion matrix
        cm = confusion_matrix(y_test, iso_pred_optimal)
        print("Confusion Matrix")
        print("                 Predicted")
        print("Actual    Normal  Anomaly")
        print(f"Normal    {cm[0,0]:6d}  {cm[0,1]:7d}")
        print(f"Anomaly   {cm[1,0]:6d}  {cm[1,1]:7d}")
        
        # Error rates
        tn, fp, fn, tp = cm.ravel()
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
        
        print(f"False Positive Rate: {fpr:.4f} ({fpr*100:.2f}%)")
        print(f"False Negative Rate: {fnr:.4f} ({fnr*100:.2f}%)")
    
    print("ISOLATION FOREST ANALYSIS COMPLETED")
    
    return {
        'model': model,
        'threshold': best_threshold,
        'scores': iso_scores,
        'predictions': iso_pred_optimal,
        'metrics': {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1
        } if y_test is not None else None
    }

# Train and analyze Isolation Forest
if 'X_scaled' in locals() and 'y_true' in locals():
    iso_model, X_train, X_test, y_train, y_test = train_isolation_forest(X_scaled, y_true, contamination_rate)
    iso_results = analyze_isolation_forest(iso_model, X_test, y_test)
else:
    print("No data available for training. Please run Cell 2 first.")

ISOLATION FOREST TRAINING
Training data shape: (357, 16)
Contamination rate: 0.4411764705882353
Number of estimators: 200
Isolation Forest training completed
Model uses 16 features
ISOLATION FOREST PREDICTION ANALYSIS
Predictions completed for 153 flows
Detected anomalies: 65 (42.5%)
ISO Score Statistics
Score range: [-0.2446, 0.0488]
Mean score: -0.0114
Standard deviation: 0.0635
Score Distribution by True Label
Normal traffic scores:
  Mean: 0.0059, Std: 0.0445
  Range: [-0.1379, 0.0488]
Attack traffic scores:
  Mean: -0.0336, Std: 0.0761
  Range: [-0.2446, 0.0405]
Performance Metrics
Accuracy:  0.7386
Precision: 0.8462
Recall:    0.4925
F1-Score:  0.6226
Confusion Matrix
                 Predicted
Actual    Normal  Anomaly
Normal        80        6
Anomaly       34       33
False Positive Rate: 0.0698 (6.98%)
False Negative Rate: 0.5075 (50.75%)
ISOLATION FOREST ANALYSIS COMPLETED


## Cell 4: Local Outlier Factor (LOF) Training

In [10]:
# Required imports
import numpy as np
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                           f1_score, confusion_matrix)

def train_lof_model(X_train, contamination_rate, n_neighbors=20):
    """Train LOF model"""
    
    print("LOCAL OUTLIER FACTOR TRAINING")
    print(f"Training data shape: {X_train.shape}")
    print(f"Contamination rate: {contamination_rate}")
    print(f"Number of neighbors: {n_neighbors}")
    
    # Adjust neighbors if needed
    max_neighbors = min(n_neighbors, X_train.shape[0] - 1)
    if max_neighbors != n_neighbors:
        print(f"Adjusting n_neighbors from {n_neighbors} to {max_neighbors}")
        n_neighbors = max_neighbors
    
    # Train model
    lof_model = LocalOutlierFactor(
        n_neighbors=n_neighbors,
        contamination=contamination_rate,
        novelty=True,
        algorithm='auto',
        n_jobs=-1
    )
    
    lof_model.fit(X_train)
    print("LOF training completed")
    print(f"Model uses {lof_model.n_features_in_} features")
    
    return lof_model

def analyze_lof_model(model, X_test, y_test):
    """Analyze LOF predictions"""
    
    print("LOCAL OUTLIER FACTOR PREDICTION ANALYSIS")
    
    # Get predictions and scores
    lof_pred_raw = model.predict(X_test)
    lof_pred = (lof_pred_raw == -1).astype(int)
    lof_scores = -model.decision_function(X_test)
    
    print(f"Predictions completed for {len(lof_pred)} flows")
    print(f"Detected anomalies: {sum(lof_pred)} ({sum(lof_pred)/len(lof_pred)*100:.1f}%)")
    
    # Score statistics
    print("LOF Score Statistics")
    print(f"Score range: [{lof_scores.min():.4f}, {lof_scores.max():.4f}]")
    print(f"Mean score: {lof_scores.mean():.4f}")
    print(f"Standard deviation: {lof_scores.std():.4f}")
    
    # Initialize variables to avoid UnboundLocalError
    accuracy = precision = recall = f1 = None
    
    # Score distribution by label
    if y_test is not None:
        normal_scores = lof_scores[y_test == 0]
        attack_scores = lof_scores[y_test == 1]
        
        print("Score Distribution by True Label")
        print(f"Normal traffic scores:")
        print(f"  Mean: {normal_scores.mean():.4f}, Std: {normal_scores.std():.4f}")
        print(f"  Range: [{normal_scores.min():.4f}, {normal_scores.max():.4f}]")
        
        print(f"Attack traffic scores:")
        print(f"  Mean: {attack_scores.mean():.4f}, Std: {attack_scores.std():.4f}")
        print(f"  Range: [{attack_scores.min():.4f}, {attack_scores.max():.4f}]")
        
        # Performance metrics
        accuracy = accuracy_score(y_test, lof_pred)
        precision = precision_score(y_test, lof_pred, zero_division=0)
        recall = recall_score(y_test, lof_pred, zero_division=0)
        f1 = f1_score(y_test, lof_pred, zero_division=0)
        
        print("Performance Metrics")
        print(f"Accuracy:  {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall:    {recall:.4f}")
        print(f"F1-Score:  {f1:.4f}")
        
        # Confusion matrix
        cm = confusion_matrix(y_test, lof_pred)
        print("Confusion Matrix")
        print("                 Predicted")
        print("Actual    Normal  Anomaly")
        print(f"Normal    {cm[0,0]:6d}  {cm[0,1]:7d}")
        print(f"Anomaly   {cm[1,0]:6d}  {cm[1,1]:7d}")
        
        # Error rates
        tn, fp, fn, tp = cm.ravel()
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
        
        print(f"False Positive Rate: {fpr:.4f} ({fpr*100:.2f}%)")
        print(f"False Negative Rate: {fnr:.4f} ({fnr*100:.2f}%)")
    
    print("LOF ANALYSIS COMPLETED")
    
    return {
        'model': model,
        'scores': lof_scores,
        'predictions': lof_pred,
        'metrics': {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1
        } if y_test is not None else None
    }

# Train and analyze LOF
if 'X_train' in locals():
    lof_model = train_lof_model(X_train, contamination_rate)
    lof_results = analyze_lof_model(lof_model, X_test, y_test)
else:
    print("No training data available. Please run Cell 3 first.")

LOCAL OUTLIER FACTOR TRAINING
Training data shape: (357, 16)
Contamination rate: 0.4411764705882353
Number of neighbors: 20
LOF training completed
Model uses 16 features
LOCAL OUTLIER FACTOR PREDICTION ANALYSIS
Predictions completed for 153 flows
Detected anomalies: 63 (41.2%)
LOF Score Statistics
Score range: [-0.1357, 551.0337]
Mean score: 16.0014
Standard deviation: 76.7227
Score Distribution by True Label
Normal traffic scores:
  Mean: 13.2082, Std: 83.0003
  Range: [-0.0621, 551.0337]
Attack traffic scores:
  Mean: 19.5866, Std: 67.6497
  Range: [-0.1357, 305.7161]
Performance Metrics
Accuracy:  0.6863
Precision: 0.6508
Recall:    0.6119
F1-Score:  0.6308
Confusion Matrix
                 Predicted
Actual    Normal  Anomaly
Normal        64       22
Anomaly       26       41
False Positive Rate: 0.2558 (25.58%)
False Negative Rate: 0.3881 (38.81%)
LOF ANALYSIS COMPLETED


## Cell 5: Performance Comparison

In [11]:
def compare_model_performance(iso_results, lof_results):
    """Compare performance between Isolation Forest and LOF"""
    
    print("MODEL PERFORMANCE COMPARISON")
    print("-"*50)
    
    # Extract metrics
    iso_metrics = iso_results['metrics'] if 'metrics' in iso_results else performance_results['iso']
    lof_metrics = lof_results['metrics']
    
    # Performance table
    print(f"{'Metric':<15} {'Isolation Forest':<18} {'LOF':<15} {'Difference':<12}")
    print("-" * 60)
    
    metrics = ['accuracy', 'precision', 'recall', 'f1_score']
    for metric in metrics:
        iso_val = iso_metrics[metric]
        lof_val = lof_metrics[metric]
        diff = iso_val - lof_val
        
        print(f"{metric.title():<15} {iso_val:<18.4f} {lof_val:<15.4f} {diff:<12.4f}")
    
    # Best model
    print(f"\nBest F1-Score: ", end="")
    if iso_metrics['f1_score'] > lof_metrics['f1_score']:
        print(f"Isolation Forest ({iso_metrics['f1_score']:.4f})")
    else:
        print(f"LOF ({lof_metrics['f1_score']:.4f})")

def plot_model_comparison(iso_results, lof_results, models):
    """Create comparison plots"""
    
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    
    y_test = models['y_test']
    
    # 1. Score distributions
    ax1 = axes[0]
    iso_scores = iso_results['scores'] if 'scores' in iso_results else performance_results['iso']['scores']
    lof_scores = lof_results['scores']
    
    ax1.hist(iso_scores[y_test == 0], bins=30, alpha=0.5, label='ISO Normal', color='blue')
    ax1.hist(iso_scores[y_test == 1], bins=30, alpha=0.5, label='ISO Attack', color='red')
    ax1.set_xlabel('Isolation Forest Score')
    ax1.set_ylabel('Frequency')
    ax1.set_title('ISO Score Distribution')
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    
    # 2. LOF scores
    ax2 = axes[1]
    ax2.hist(lof_scores[y_test == 0], bins=30, alpha=0.5, label='LOF Normal', color='green')
    ax2.hist(lof_scores[y_test == 1], bins=30, alpha=0.5, label='LOF Attack', color='orange')
    ax2.set_xlabel('LOF Score')
    ax2.set_ylabel('Frequency')
    ax2.set_title('LOF Score Distribution')
    ax2.legend()
    ax2.grid(True, alpha=0.3)
    
    # 3. F1 Score comparison
    ax3 = axes[2]
    iso_f1 = iso_results['metrics']['f1_score'] if 'metrics' in iso_results else performance_results['iso']['f1_score']
    lof_f1 = lof_results['metrics']['f1_score']
    
    models_names = ['Isolation Forest', 'LOF']
    f1_scores = [iso_f1, lof_f1]
    colors = ['skyblue', 'lightgreen']
    
    bars = ax3.bar(models_names, f1_scores, color=colors)
    ax3.set_ylabel('F1-Score')
    ax3.set_title('F1-Score Comparison')
    ax3.set_ylim(0, 1)
    
    # Add value labels on bars
    for bar, score in zip(bars, f1_scores):
        ax3.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                f'{score:.3f}', ha='center', va='bottom', fontweight='bold')
    
    ax3.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()

# Compare models if both are available
if 'performance_results' in locals() and 'lof_results' in locals():
    compare_model_performance(performance_results, lof_results)
    plot_model_comparison(performance_results, lof_results, models)
    
    print("\nANALYSIS COMPLETED")
    print("Both models trained and compared")

elif 'lof_results' in locals():
    print("LOF model trained. Run Isolation Forest training to compare.")
else:
    print("No models available for comparison.")

LOF model trained. Run Isolation Forest training to compare.


## Cell 6: Generate Dynamic Attack Simulations with Loops

In [12]:
def generate_dynamic_attack_scenarios():
    """Generate dynamic attack scenarios using loops for variety"""
    
    print("GENERATING DYNAMIC ATTACK SCENARIOS")
    
    attack_scenarios = []
    scenario_id = 1
    
    # 1. Port Scanning Loop - Different ports and districts
    districts = ['IoT', 'Hospital', 'PowerGrid', 'Finance']
    common_ports = [22, 23, 80, 443, 3389, 1433, 5432]
    
    for district in districts:
        for port in common_ports[:4]:  # Test first 4 ports per district
            base_ip = {'IoT': '192.168.50', 'Hospital': '192.168.10', 
                      'PowerGrid': '192.168.20', 'Finance': '192.168.30'}
            
            attack_scenarios.append({
                'ScenarioID': scenario_id,
                'SrcIP': f"{base_ip[district]}.200",
                'DstIP': f"{base_ip[district]}.{10 + scenario_id}",
                'DstPort': port,
                'TxPackets': np.random.randint(1, 5),
                'RxPackets': 0,  # No response - typical for port scan
                'Duration': np.random.uniform(0.1, 1.0),
                'PacketLoss': 1.0,  # 100% loss
                'Throughput': np.random.randint(50, 200),
                'Delay': 0.001,
                'Jitter': 0.0001,
                'District': district,
                'AttackType': 'port_scan',
                'Description': f"Port scan on {district} port {port}"
            })
            scenario_id += 1
    
    # 2. DoS Attack Loop - Varying intensity
    dos_intensities = [
        {'packets': 1000, 'loss': 0.3, 'duration': 5},
        {'packets': 5000, 'loss': 0.7, 'duration': 10},
        {'packets': 10000, 'loss': 0.9, 'duration': 15}
    ]
    
    for i, intensity in enumerate(dos_intensities):
        for district in ['Finance', 'Hospital']:
            base_ip = {'Finance': '192.168.30', 'Hospital': '192.168.10'}
            
            attack_scenarios.append({
                'ScenarioID': scenario_id,
                'SrcIP': f"{base_ip[district]}.150",
                'DstIP': f"{base_ip[district]}.5",
                'DstPort': 80,
                'TxPackets': intensity['packets'],
                'RxPackets': int(intensity['packets'] * (1 - intensity['loss'])),
                'Duration': intensity['duration'],
                'PacketLoss': intensity['loss'],
                'Throughput': intensity['packets'] * 1024 / intensity['duration'],
                'Delay': 0.05 + (i * 0.05),
                'Jitter': 0.01 + (i * 0.01),
                'District': district,
                'AttackType': 'dos_attack',
                'Description': f"DoS level {i+1} on {district}"
            })
            scenario_id += 1
    
    # 3. Data Exfiltration Loop - Different volumes
    data_volumes = [100000, 500000, 2000000]  # Bytes
    
    for volume in data_volumes:
        attack_scenarios.append({
            'ScenarioID': scenario_id,
            'SrcIP': '192.168.10.25',
            'DstIP': '8.8.8.8',  # External server
            'DstPort': 443,
            'TxPackets': volume // 1024,
            'RxPackets': int((volume // 1024) * 0.9),
            'Duration': 60 + (volume // 50000),
            'PacketLoss': 0.1,
            'Throughput': volume * 8 / (60 + (volume // 50000)),
            'Delay': 0.08,
            'Jitter': 0.02,
            'District': 'Hospital',
            'AttackType': 'data_exfiltration',
            'Description': f"Data exfiltration {volume//1000}KB"
        })
        scenario_id += 1
    
    # 4. Lateral Movement Loop - Different targets
    grid_targets = [10, 15, 20, 25, 30]
    
    for target in grid_targets:
        attack_scenarios.append({
            'ScenarioID': scenario_id,
            'SrcIP': '192.168.20.10',  # Compromised device
            'DstIP': f'192.168.20.{target}',
            'DstPort': 22,
            'TxPackets': np.random.randint(30, 60),
            'RxPackets': np.random.randint(25, 55),
            'Duration': np.random.uniform(60, 180),
            'PacketLoss': np.random.uniform(0.05, 0.15),
            'Throughput': np.random.randint(200, 500),
            'Delay': 0.02,
            'Jitter': 0.003,
            'District': 'PowerGrid',
            'AttackType': 'lateral_movement',
            'Description': f"Lateral movement to grid node {target}"
        })
        scenario_id += 1
    
    return pd.DataFrame(attack_scenarios)

# Generate the dynamic scenarios
dynamic_attacks = generate_dynamic_attack_scenarios()
print(f"Generated {len(dynamic_attacks)} dynamic attack scenarios")
print("\nScenario Types:")
print(dynamic_attacks['AttackType'].value_counts())
print("\nDistrict Distribution:")
print(dynamic_attacks['District'].value_counts())

GENERATING DYNAMIC ATTACK SCENARIOS
Generated 30 dynamic attack scenarios

Scenario Types:
AttackType
port_scan            16
dos_attack            6
lateral_movement      5
data_exfiltration     3
Name: count, dtype: int64

District Distribution:
District
Hospital     10
PowerGrid     9
Finance       7
IoT           4
Name: count, dtype: int64


## Cell 7: Testing Isolation Forest on unseen data.

In [None]:
import numpy as np

def get_security_context(row):
    """Generate security context description for an attack"""
    contexts = [
        f"High traffic volume attack ({row['TxPackets']} packets)",
        f"Suspicious port activity (Port {row['DstPort']})",
        f"Network infiltration attempt",
        f"Anomalous data flow pattern",
        f"Potential DoS attack vector",
        f"Unauthorized access attempt",
        f"Data exfiltration pattern detected"
    ]
    
    # Choose context based on row characteristics
    if row.get('DstPort', 0) in [31337, 12345]:
        return "Backdoor/Trojan communication attempt"
    elif row.get('DstPort', 0) in [80, 443]:
        return "Web-based attack vector"
    elif row.get('TxPackets', 0) > 1000:
        return "High-volume traffic anomaly"
    else:
        return contexts[hash(str(row.get('SrcIP', ''))) % len(contexts)]

def test_isolation_forest_with_corrected_threshold(attack_df, iso_model, threshold, scaler_obj, feature_names):
    """Test Isolation Forest with corrected threshold logic"""
    
    print("ISOLATION FOREST SECURITY ANALYSIS")
    
    # Prepare test data
    test_data = attack_df.copy()
    defaults = {'TxBytes': 1024, 'RxBytes': 0, 'Protocol': 6, 'SrcPort': 12345}
    for col, val in defaults.items():
        if col not in test_data.columns:
            test_data[col] = val
    
    # Add derived features (matching your training)
    test_data['PacketDeliveryRatio'] = test_data['RxPackets'] / (test_data['TxPackets'] + 1e-6)
    test_data['ByteDeliveryRatio'] = test_data['RxPackets'] * 1024 / (test_data['TxPackets'] * 1024 + 1e-6)
    test_data['AvgPacketSize'] = (test_data['TxPackets'] * 1024) / (test_data['TxPackets'] + 1e-6)
    test_data['ThroughputEfficiency'] = test_data['Throughput'] / (test_data['Duration'] + 1e-6)
    test_data['IsWellKnownPort'] = (test_data['DstPort'] <= 1023).astype(int)
    test_data['IsWebPort'] = test_data['DstPort'].isin([80, 443, 8080]).astype(int)
    test_data['IsSuspiciousPort'] = test_data['DstPort'].isin([31337, 12345]).astype(int)
    
    # Create feature matrix
    feature_matrix = []
    for idx, row in test_data.iterrows():
        features = [
            row['TxPackets'], row['RxPackets'], row['TxBytes'], row['RxBytes'],
            row['Duration'], row['Throughput'], row['PacketLoss'], row['Delay'], row['Jitter'],
            row['PacketDeliveryRatio'], row['ByteDeliveryRatio'], row['AvgPacketSize'],
            row['ThroughputEfficiency'], row['IsWellKnownPort'], row['IsWebPort'], row['IsSuspiciousPort']
        ]
        feature_matrix.append(features)
    
    X_test = np.array(feature_matrix)
    X_test_scaled = scaler_obj.transform(X_test)
    
    # Get predictions with corrected logic
    iso_scores = iso_model.decision_function(X_test_scaled)
    
    # For synthetic attacks, use a more aggressive threshold
    corrected_threshold = 0.05  # Higher threshold to catch more attacks
    iso_preds = (iso_scores <= corrected_threshold)  # Lower scores = more anomalous
    
    print(f"Using corrected threshold: {corrected_threshold}")
    print(f"Score range: [{iso_scores.min():.4f}, {iso_scores.max():.4f}]")
    print(f"Mean score: {iso_scores.mean():.4f}")
    
    # Analyze each attack
    blocked_attacks = 0
    
    for i, (idx, row) in enumerate(test_data.iterrows()):
        security_context = get_security_context(row)
        
        print(f"\nSECURITY INCIDENT #{i+1}")
        print(f"THREAT ANALYSIS: {security_context}")
        print(f"Source: {row['SrcIP']} -> Target: {row['DstIP']}:{row['DstPort']}")
        print(f"Network: {row['District']} District")
        
        print(f"\nISOLATION FOREST DECISION:")
        print(f"Anomaly Score: {iso_scores[i]:.4f}")
        print(f"Threshold: {corrected_threshold}")
        
        if iso_preds[i]:
            print(f"DECISION: THREAT BLOCKED")
            print(f"REASON: Score {iso_scores[i]:.4f} <= {corrected_threshold} indicates anomaly")
            print(f"ACTION: Connection terminated, incident logged")
            blocked_attacks += 1
        else:
            print(f"DECISION: TRAFFIC ALLOWED") 
            print(f"REASON: Score {iso_scores[i]:.4f} > {corrected_threshold} appears normal")
            print(f"ACTION: Monitoring continues")
        
        print("-" * 70)
    
    print(f"\nCORRECTED ISOLATION FOREST SUMMARY:")
    print(f"Total Incidents: {len(test_data)}")
    print(f"Threats Blocked: {blocked_attacks}")
    print(f"Threats Missed: {len(test_data) - blocked_attacks}")
    print(f"Protection Rate: {(blocked_attacks/len(test_data))*100:.1f}%")
    
    # Try even more aggressive threshold if still low
    if blocked_attacks == 0:
        print(f"\nTRYING MORE AGGRESSIVE THRESHOLD...")
        aggressive_threshold = 0.1
        aggressive_preds = (iso_scores <= aggressive_threshold)
        aggressive_blocked = sum(aggressive_preds)
        print(f"With threshold {aggressive_threshold}: {aggressive_blocked}/{len(test_data)} blocked ({(aggressive_blocked/len(test_data))*100:.1f}%)")
    
    return {'scores': iso_scores, 'predictions': iso_preds, 'threshold_used': corrected_threshold}

# Test with corrected threshold
if 'dynamic_attacks' in locals() and 'iso_results' in locals():
    corrected_results = test_isolation_forest_with_corrected_threshold(
        dynamic_attacks, 
        iso_results['model'], 
        iso_results['threshold'],
        scaler,  # Your scaler from training
        feature_names
    )
else:
    print("Missing required data. Run previous cells first.")

ISOLATION FOREST SECURITY ANALYSIS
Using corrected threshold: 0.05
Score range: [-0.2518, -0.0801]
Mean score: -0.1546

SECURITY INCIDENT #1
THREAT ANALYSIS: Potential DoS attack vector
Source: 192.168.50.200 -> Target: 192.168.50.11:22
Network: IoT District

ISOLATION FOREST DECISION:
Anomaly Score: -0.1078
Threshold: 0.05
DECISION: THREAT BLOCKED
REASON: Score -0.1078 <= 0.05 indicates anomaly
ACTION: Connection terminated, incident logged
----------------------------------------------------------------------

SECURITY INCIDENT #2
THREAT ANALYSIS: Potential DoS attack vector
Source: 192.168.50.200 -> Target: 192.168.50.12:23
Network: IoT District

ISOLATION FOREST DECISION:
Anomaly Score: -0.1080
Threshold: 0.05
DECISION: THREAT BLOCKED
REASON: Score -0.1080 <= 0.05 indicates anomaly
ACTION: Connection terminated, incident logged
----------------------------------------------------------------------

SECURITY INCIDENT #3
THREAT ANALYSIS: Web-based attack vector
Source: 192.168.50.200

## cell 8: Tesrting IOF on unseen data

In [None]:

import numpy as np

def get_security_context(row):
    """Generate security context description for an attack"""
    contexts = [
        f"High traffic volume attack ({row['TxPackets']} packets)",
        f"Suspicious port activity (Port {row['DstPort']})",
        f"Network infiltration attempt",
        f"Anomalous data flow pattern",
        f"Potential DoS attack vector",
        f"Unauthorized access attempt",
        f"Data exfiltration pattern detected"
    ]
    
    # Choose context based on row characteristics
    if row.get('DstPort', 0) in [31337, 12345]:
        return "Backdoor/Trojan communication attempt"
    elif row.get('DstPort', 0) in [80, 443]:
        return "Web-based attack vector"
    elif row.get('TxPackets', 0) > 1000:
        return "High-volume traffic anomaly"
    else:
        return contexts[hash(str(row.get('SrcIP', ''))) % len(contexts)]

def test_lof_with_context(attack_df, lof_model, scaler, feature_names=None):
    """Test LOF with detailed security context"""
    
    print("LOF SECURITY ANALYSIS")
    
    # Same preprocessing
    test_data = attack_df.copy()
    defaults = {'TxBytes': 1024, 'RxBytes': 0, 'Protocol': 6, 'SrcPort': 12345}
    for col, val in defaults.items():
        if col not in test_data.columns:
            test_data[col] = val
    
    # Add derived features (matching your training)
    test_data['PacketDeliveryRatio'] = test_data['RxPackets'] / (test_data['TxPackets'] + 1e-6)
    test_data['ByteDeliveryRatio'] = test_data['RxPackets'] * 1024 / (test_data['TxPackets'] * 1024 + 1e-6)
    test_data['AvgPacketSize'] = (test_data['TxPackets'] * 1024) / (test_data['TxPackets'] + 1e-6)
    test_data['ThroughputEfficiency'] = test_data['Throughput'] / (test_data['Duration'] + 1e-6)
    test_data['IsWellKnownPort'] = (test_data['DstPort'] <= 1023).astype(int)
    test_data['IsWebPort'] = test_data['DstPort'].isin([80, 443, 8080]).astype(int)
    test_data['IsSuspiciousPort'] = test_data['DstPort'].isin([31337, 12345]).astype(int)
    
    # Create feature matrix (same as in training)
    feature_matrix = []
    for idx, row in test_data.iterrows():
        features = [
            row['TxPackets'], row['RxPackets'], row['TxBytes'], row['RxBytes'],
            row['Duration'], row['Throughput'], row['PacketLoss'], row['Delay'], row['Jitter'],
            row['PacketDeliveryRatio'], row['ByteDeliveryRatio'], row['AvgPacketSize'],
            row['ThroughputEfficiency'], row['IsWellKnownPort'], row['IsWebPort'], row['IsSuspiciousPort']
        ]
        feature_matrix.append(features)
    
    X_test = np.array(feature_matrix)
    X_test_scaled = scaler.transform(X_test)
    
    # Get LOF predictions
    lof_scores = -lof_model.score_samples(X_test_scaled)
    lof_preds = (lof_scores >= 1.5)  # Threshold for anomaly detection
    
    print(f"LOF Score range: [{lof_scores.min():.4f}, {lof_scores.max():.4f}]")
    print(f"Using threshold: 1.5")
    
    # Analyze with context
    blocked_attacks = 0
    
    for i, (idx, row) in enumerate(test_data.iterrows()):
        security_context = get_security_context(row)
        
        print(f"\nSECURITY INCIDENT #{i+1}")
        print(f"THREAT ANALYSIS: {security_context}")
        print(f"Source: {row['SrcIP']} -> Target: {row['DstIP']}:{row['DstPort']}")
        print(f"Network: {row['District']} District")
        
        print(f"\nLOF DECISION:")
        print(f"Outlier Factor: {lof_scores[i]:.4f}")
        
        if lof_preds[i]:
            print(f"DECISION: THREAT BLOCKED")
            print(f"REASON: Traffic behavior significantly different from local network neighborhood") 
            print(f"ACTION: Connection blocked, anomaly flagged for investigation")
            blocked_attacks += 1
        else:
            print(f"DECISION: TRAFFIC ALLOWED")
            print(f"REASON: Traffic behavior consistent with local network patterns")
            print(f"ACTION: Normal processing continues")
        
        print("-" * 70)
    
    print(f"\nLOF SUMMARY:")
    print(f"Total Incidents Analyzed: {len(test_data)}")
    print(f"Threats Blocked: {blocked_attacks}")
    print(f"Threats Missed: {len(test_data) - blocked_attacks}")
    print(f"Protection Rate: {(blocked_attacks/len(test_data))*100:.1f}%")
    
    # Try different threshold if detection rate is low
    if blocked_attacks == 0:
        print(f"\nTRYING LOWER THRESHOLD...")
        lower_threshold = 1.2
        lower_preds = (lof_scores >= lower_threshold)
        lower_blocked = sum(lower_preds)
        print(f"With threshold {lower_threshold}: {lower_blocked}/{len(test_data)} blocked ({(lower_blocked/len(test_data))*100:.1f}%)")
    
    return {'scores': lof_scores, 'predictions': lof_preds}

# Run LOF with context
if 'dynamic_attacks' in locals() and 'lof_results' in locals() and 'scaler' in locals():
    lof_context_results = test_lof_with_context(
        dynamic_attacks, 
        lof_results['model'], 
        scaler
    )
else:
    print("Missing required data. Please run previous cells first.")

LOF SECURITY ANALYSIS
LOF Score range: [2.1223, 811.6683]
Using threshold: 1.5

SECURITY INCIDENT #1
THREAT ANALYSIS: Potential DoS attack vector
Source: 192.168.50.200 -> Target: 192.168.50.11:22
Network: IoT District

LOF DECISION:
Outlier Factor: 3.8756
DECISION: THREAT BLOCKED
REASON: Traffic behavior significantly different from local network neighborhood
ACTION: Connection blocked, anomaly flagged for investigation
----------------------------------------------------------------------

SECURITY INCIDENT #2
THREAT ANALYSIS: Potential DoS attack vector
Source: 192.168.50.200 -> Target: 192.168.50.12:23
Network: IoT District

LOF DECISION:
Outlier Factor: 3.8752
DECISION: THREAT BLOCKED
REASON: Traffic behavior significantly different from local network neighborhood
ACTION: Connection blocked, anomaly flagged for investigation
----------------------------------------------------------------------

SECURITY INCIDENT #3
THREAT ANALYSIS: Web-based attack vector
Source: 192.168.50.200 -

## cell 9: Comparing both models

In [17]:
def comprehensive_model_analysis_corrected(iso_results, lof_results, attack_data):
    """
    Corrected comparative analysis accounting for threshold issues
    """
    
    print("COMPREHENSIVE MODEL COMPARISON ANALYSIS")
    print("-" * 60)
    
    # Get raw scores for proper analysis
    iso_scores = iso_results['scores']
    lof_scores = lof_results['scores']
    
    # Applying proper thresholds for fair comparison
    iso_threshold = np.percentile(iso_scores, 20)  # Bottom 20% as anomalies
    lof_threshold = np.percentile(lof_scores, 80)   # Top 20% as anomalies
    
    iso_preds_corrected = (iso_scores <= iso_threshold)
    lof_preds_corrected = (lof_scores >= lof_threshold)
    
    print(f"CORRECTED THRESHOLDS:")
    print(f"Isolation Forest: {iso_threshold:.4f} (lower scores = anomalous)")
    print(f"LOF: {lof_threshold:.4f} (higher scores = anomalous)")
    
    # 1. OVERALL PERFORMANCE COMPARISON
    iso_detection_rate = np.mean(iso_preds_corrected)
    lof_detection_rate = np.mean(lof_preds_corrected)
    
    print(f"\n1. OVERALL DETECTION RATES")
    print("-" * 30)
    print(f"Isolation Forest: {iso_detection_rate:.3f} ({iso_detection_rate*100:.1f}%)")
    print(f"LOF: {lof_detection_rate:.3f} ({lof_detection_rate*100:.1f}%)")
    
    # Agreement analysis
    agreement_rate = np.mean(iso_preds_corrected == lof_preds_corrected)
    both_detect = np.sum(iso_preds_corrected & lof_preds_corrected)
    iso_only = np.sum(iso_preds_corrected & ~lof_preds_corrected)
    lof_only = np.sum(~iso_preds_corrected & lof_preds_corrected)
    neither = np.sum(~iso_preds_corrected & ~lof_preds_corrected)
    
    print(f"\nMODEL AGREEMENT:")
    print(f"Agreement Rate: {agreement_rate:.3f} ({agreement_rate*100:.1f}%)")
    print(f"Both Detect: {both_detect}")
    print(f"ISO Only: {iso_only}")
    print(f"LOF Only: {lof_only}")
    print(f"Neither: {neither}")
    
    # 2. ATTACK TYPE EFFECTIVENESS (CORRECTED)
    print(f"\n2. EFFECTIVENESS BY ATTACK TYPE")
    print("-" * 50)
    
    attack_types = attack_data['AttackType'].unique()
    effectiveness_analysis = {}
    
    for attack_type in attack_types:
        type_mask = attack_data['AttackType'] == attack_type
        iso_type_rate = np.mean(iso_preds_corrected[type_mask])
        lof_type_rate = np.mean(lof_preds_corrected[type_mask])
        
        effectiveness_analysis[attack_type] = {
            'iso_rate': iso_type_rate,
            'lof_rate': lof_type_rate,
            'better_model': 'Isolation Forest' if iso_type_rate > lof_type_rate else 'LOF',
            'difference': abs(iso_type_rate - lof_type_rate)
        }
        
        print(f"{attack_type.replace('_', ' ').title()}:")
        print(f"  Isolation Forest: {iso_type_rate:.3f} ({iso_type_rate*100:.0f}%)")
        print(f"  LOF: {lof_type_rate:.3f} ({lof_type_rate*100:.0f}%)")
        print(f"  Better Model: {effectiveness_analysis[attack_type]['better_model']}")
        print(f"  Performance Gap: {effectiveness_analysis[attack_type]['difference']:.3f}")
    
    # 3. STATISTICAL ANALYSIS
    print(f"\n3. STATISTICAL ANALYSIS")
    print("-" * 50)
    
    print(f"Score Distribution Analysis:")
    print(f"Isolation Forest - Mean: {np.mean(iso_scores):.4f}, Std: {np.std(iso_scores):.4f}")
    print(f"LOF - Mean: {np.mean(lof_scores):.4f}, Std: {np.std(lof_scores):.4f}")
    
    # Correlation analysis
    correlation = np.corrcoef(iso_scores, lof_scores)[0,1]
    print(f"Score Correlation: {correlation:.4f}")
    
    if abs(correlation) > 0.7:
        print("- Strong correlation: Models have similar anomaly rankings")
    elif abs(correlation) > 0.3:
        print("- Moderate correlation: Some agreement in anomaly detection")
    else:
        print("- Weak correlation: Models use fundamentally different detection criteria")
    
    # 4. REALISTIC PERFORMANCE ASSESSMENT
    print(f"\n4. REALISTIC PERFORMANCE ASSESSMENT")
    print("-" * 50)
    
    # Calculate more realistic metrics
    total_attacks = len(attack_data)
    
    print(f"Dataset Characteristics:")
    print(f"Total Scenarios: {total_attacks}")
    print(f"Attack Types: {len(attack_types)}")
    print(f"All samples are attacks (synthetic test set)")
    
    print(f"\nModel Suitability Analysis:")
    if iso_detection_rate > 0.7:
        print("Isolation Forest: Good detection rate for synthetic attacks")
    elif iso_detection_rate > 0.4:
        print("Isolation Forest: Moderate detection rate, may need threshold tuning")
    else:
        print("Isolation Forest: Low detection rate, requires significant tuning")
    
    if lof_detection_rate > 0.7:
        print("LOF: Good detection rate for synthetic attacks")
    elif lof_detection_rate > 0.4:
        print("LOF: Moderate detection rate, acceptable performance")
    else:
        print("LOF: Low detection rate, may not suit this attack pattern")
    
    # 5. VISUALIZATION (Corrected)
    plt.figure(figsize=(15, 8))
    
    # Overall detection comparison
    plt.subplot(2, 3, 1)
    models = ['Isolation\nForest', 'LOF']
    rates = [iso_detection_rate, lof_detection_rate]
    colors = ['skyblue', 'lightcoral']
    bars = plt.bar(models, rates, color=colors, alpha=0.8)
    plt.ylabel('Detection Rate')
    plt.title('Corrected Detection Rates')
    plt.ylim(0, 1)
    for bar, rate in zip(bars, rates):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02, 
                f'{rate:.2f}', ha='center', va='bottom', fontweight='bold')
    plt.grid(True, alpha=0.3)
    
    # Agreement analysis
    plt.subplot(2, 3, 2)
    categories = ['Both\nDetect', 'ISO\nOnly', 'LOF\nOnly', 'Neither']
    counts = [both_detect, iso_only, lof_only, neither]
    colors_pie = ['green', 'skyblue', 'lightcoral', 'gray']
    plt.pie(counts, labels=categories, colors=colors_pie, autopct='%1.0f', startangle=90)
    plt.title('Detection Agreement')
    
    # Score distributions
    plt.subplot(2, 3, 3)
    plt.hist(iso_scores, bins=20, alpha=0.6, label='ISO Scores', color='skyblue', density=True)
    plt.axvline(iso_threshold, color='blue', linestyle='--', label=f'ISO Threshold: {iso_threshold:.3f}')
    plt.xlabel('Isolation Forest Score')
    plt.ylabel('Density')
    plt.title('ISO Score Distribution')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    # LOF scores
    plt.subplot(2, 3, 4)
    plt.hist(lof_scores, bins=20, alpha=0.6, label='LOF Scores', color='lightcoral', density=True)
    plt.axvline(lof_threshold, color='red', linestyle='--', label=f'LOF Threshold: {lof_threshold:.1f}')
    plt.xlabel('LOF Score')
    plt.ylabel('Density')
    plt.title('LOF Score Distribution')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    # Attack type comparison
    plt.subplot(2, 3, 5)
    attack_names = [name.replace('_', '\n').title() for name in attack_types]
    iso_rates = [effectiveness_analysis[attack]['iso_rate'] for attack in attack_types]
    lof_rates = [effectiveness_analysis[attack]['lof_rate'] for attack in attack_types]
    
    x_pos = np.arange(len(attack_types))
    width = 0.35
    plt.bar(x_pos - width/2, iso_rates, width, label='ISO Forest', alpha=0.8, color='skyblue')
    plt.bar(x_pos + width/2, lof_rates, width, label='LOF', alpha=0.8, color='lightcoral')
    plt.xlabel('Attack Type')
    plt.ylabel('Detection Rate')
    plt.title('Detection by Attack Type')
    plt.xticks(x_pos, attack_names, rotation=45, ha='right')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    # Score correlation
    plt.subplot(2, 3, 6)
    plt.scatter(iso_scores, lof_scores, alpha=0.6, s=30, color='purple')
    plt.xlabel('Isolation Forest Score')
    plt.ylabel('LOF Score')
    plt.title(f'Score Correlation\n(r={correlation:.3f})')
    plt.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    return {
        'iso_detection_rate': iso_detection_rate,
        'lof_detection_rate': lof_detection_rate,
        'agreement_rate': agreement_rate,
        'effectiveness_by_type': effectiveness_analysis,
        'correlation': correlation,
        'corrected_thresholds': {
            'iso_threshold': iso_threshold,
            'lof_threshold': lof_threshold
        }
    }

# Run corrected analysis
if 'iso_context_results' in locals() and 'lof_context_results' in locals():
    corrected_analysis = comprehensive_model_analysis_corrected(
        iso_context_results, lof_context_results, dynamic_attacks
    )
    
    print("\nCORRECTED ANALYSIS COMPLETED")
    print("Models now compared with appropriate thresholds")
else:
    print("Missing test results. Run previous cells first.")

Missing test results. Run previous cells first.


## cell 10: Save All Required Components

In [None]:
import pickle
import os

def save_everything():
    """Save all components needed for deployment"""
    
    os.makedirs('saved_models', exist_ok=True)
    
    # Check what variables exist
    required_vars = {
        'iso_forest_model': locals().get('iso_forest_model') or globals().get('iso_forest_model'),
        'lof_model': locals().get('lof_model') or globals().get('lof_model'), 
        'scaler': locals().get('scaler') or globals().get('scaler'),
        'feature_list': locals().get('feature_list') or globals().get('feature_list'),
        'optimal_threshold': locals().get('optimal_threshold') or globals().get('optimal_threshold', -0.1)
    }
    
    print("Checking available variables:")
    for name, obj in required_vars.items():
        if obj is not None:
            print(f"{name}: Available")
        else:
            print(f"{name}: Missing")
    
    # Save each component
    try:
        # 1. Save Isolation Forest
        with open('saved_models/isolation_forest_model.pkl', 'wb') as f:
            pickle.dump(required_vars['iso_forest_model'], f)
        print("Isolation Forest saved")
        
        # 2. Save LOF  
        with open('saved_models/lof_model.pkl', 'wb') as f:
            pickle.dump(required_vars['lof_model'], f)
        print("LOF model saved")
        
        # 3. Save Scaler
        with open('saved_models/feature_scaler.pkl', 'wb') as f:
            pickle.dump(required_vars['scaler'], f) 
        print("Feature scaler saved")
        
        # 4. Save metadata
        metadata = {
            'feature_list': required_vars['feature_list'],
            'iso_threshold': required_vars['optimal_threshold'],
            'lof_threshold': 1.5,  # Standard LOF threshold
            'n_features': len(required_vars['feature_list'])
        }
        
        with open('saved_models/model_metadata.pkl', 'wb') as f:
            pickle.dump(metadata, f)
        print("Metadata saved")
        
        print(f"\nAll models saved successfully!")
        return True
        
    except Exception as e:
        print(f"Error saving models: {e}")
        return False

# Run the save function
save_everything()

Checking available variables:
iso_forest_model: Available
lof_model: Available
scaler: Available
feature_list: Available
optimal_threshold: Available
Isolation Forest saved
LOF model saved
Feature scaler saved
Metadata saved

All models saved successfully!


True

# cell 11: Verify Saved Models

In [None]:
import pickle

def verify_saved_models():
    """Verify all models can be loaded and work"""
    
    try:
        # Load everything
        with open('saved_models/isolation_forest_model.pkl', 'rb') as f:
            loaded_iso = pickle.load(f)
            
        with open('saved_models/lof_model.pkl', 'rb') as f:
            loaded_lof = pickle.load(f)
            
        with open('saved_models/feature_scaler.pkl', 'rb') as f:
            loaded_scaler = pickle.load(f)
            
        with open('saved_models/model_metadata.pkl', 'rb') as f:
            loaded_metadata = pickle.load(f)
        
        print("All files loaded successfully")
        print(f"Feature count: {loaded_metadata['n_features']}")
        print(f"ISO threshold: {loaded_metadata['iso_threshold']}")
        print(f"LOF threshold: {loaded_metadata['lof_threshold']}")
        
        # Test with sample data
        sample_features = [[50, 45, 5000, 4500, 10.0, 1000, 0.1, 0.01, 0.001, 0.9, 0.9, 100, 100, 1, 0, 0]]
        
        # Test scaling
        scaled_features = loaded_scaler.transform(sample_features)
        
        # Test predictions
        iso_score = loaded_iso.decision_function(scaled_features)[0]
        lof_score = -loaded_lof.score_samples(scaled_features)[0]
        
        print(f"Test prediction - ISO: {iso_score:.3f}, LOF: {lof_score:.3f}")
        print("All models working correctly!")
        
        return True
        
    except Exception as e:
        print(f"Error verifying models: {e}")
        return False

verify_saved_models()

All files loaded successfully
Feature count: 16
ISO threshold: 0.022451200840145374
LOF threshold: 1.5
Test prediction - ISO: -0.015, LOF: 4.896
All models working correctly!


True