# WHIS Deep Learning Experiments

Advanced ML playground for neural network-based anomaly detection and threat classification.

## Objectives
- Implement autoencoder for unsupervised anomaly detection
- Build LSTM models for sequential attack pattern detection
- Experiment with transformer architectures for log analysis
- Compare deep learning vs traditional ML performance

## Models
- **Autoencoder** - Reconstruction-based anomaly detection
- **LSTM** - Time series pattern recognition
- **CNN** - Feature extraction from log text *(planned; not implemented in this notebook)*
- **Transformer** - Attention-based sequence modeling *(planned; not implemented in this notebook)*


In [None]:
# Deep learning imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Framework detection: prefer TensorFlow/Keras, fall back to PyTorch.
# Both flags are initialised up front so that every later cell can test either
# one; previously TORCH_AVAILABLE only existed when TensorFlow was missing, so
# referencing it with TensorFlow installed raised a NameError.
TF_AVAILABLE = False
TORCH_AVAILABLE = False

# Try to import TensorFlow/Keras
try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers, Model, optimizers, losses
    from tensorflow.keras.utils import plot_model
except ImportError:
    print("⚠️  TensorFlow not available - using PyTorch fallback")
else:
    print(f"✅ TensorFlow {tf.__version__} available")
    TF_AVAILABLE = True

# Try PyTorch as fallback (only attempted when TensorFlow could not be imported)
if not TF_AVAILABLE:
    try:
        import torch
        import torch.nn as nn
        import torch.nn.functional as F
        from torch.utils.data import DataLoader, TensorDataset
    except ImportError:
        print("❌ Neither TensorFlow nor PyTorch available")
        print("   Install with: pip install tensorflow or pip install torch")
    else:
        print(f"✅ PyTorch {torch.__version__} available")
        TORCH_AVAILABLE = True

# Standard ML imports
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, roc_curve

print("🧠 WHIS Deep Learning Lab - Neural Network Playground")
print("=" * 55)

## 1. Data Preparation for Deep Learning

In [None]:
# Read the three event tables from the parquet feature store
feature_store_dir = Path("../feature_store/tables")

auth_df, process_df, admin_df = (
    pd.read_parquet(feature_store_dir / f"{table_name}.parquet")
    for table_name in ("auth_events", "process_events", "admin_events")
)

# Report how many rows each table contributed
print(f"📊 Data loaded for deep learning:")
for table_label, table_frame in (
    ("Auth", auth_df),
    ("Process", process_df),
    ("Admin", admin_df),
):
    print(f"  • {table_label} events: {len(table_frame):,} rows")

def prepare_deep_learning_data(df, table_type, sequence_length=10):
    """Build a scaled feature matrix and sliding-window sequences for one table.

    Args:
        df: Event table. Must contain an ``is_suspicious`` column and the
            feature columns selected by ``table_type``. It is not modified.
        table_type: ``"auth_events"``, ``"process_events"`` or
            ``"admin_events"``; selects which columns become features.
        sequence_length: Number of consecutive rows in each sequence window.

    Returns:
        A tuple ``(X_scaled, y, X_sequences, y_sequences, feature_names, scaler)``:

        * ``X_scaled`` - min-max scaled feature matrix, one row per event.
        * ``y`` - integer labels taken from ``df['is_suspicious']``.
        * ``X_sequences`` - array of shape ``(n_windows, sequence_length,
          n_features)``; window ``k`` holds rows ``k .. k+sequence_length-1``.
        * ``y_sequences`` - label of the row immediately *after* each window
          (row ``k+sequence_length``), so there are
          ``max(len(df) - sequence_length, 0)`` windows.
        * ``feature_names`` - column names matching the feature axis.
        * ``scaler`` - the fitted ``MinMaxScaler``.

    Raises:
        ValueError: If ``table_type`` is not recognised or
            ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError(f"sequence_length must be >= 1, got {sequence_length!r}")

    if table_type == "auth_events":
        feature_cols = ['hour_of_day', 'is_weekend', 'is_off_hours', 'fail_count_1h',
                        'success_after_fail_15m', 'is_admin']

        # One-hot encode asset_class
        asset_dummies = pd.get_dummies(df['asset_class'], prefix='asset')
        feature_df = pd.concat([df[feature_cols], asset_dummies], axis=1)

    elif table_type == "process_events":
        feature_cols = ['hour_of_day', 'cmd_len', 'cmd_entropy', 'has_encoded',
                        'signed_parent', 'rare_parent_child_7d']
        feature_df = df[feature_cols]

    elif table_type == "admin_events":
        feature_cols = ['off_hours', 'recent_4625s_actor_1h']

        # One-hot encode method
        method_dummies = pd.get_dummies(df['method'], prefix='method')
        feature_df = pd.concat([df[feature_cols], method_dummies], axis=1)

    else:
        # Previously an unknown table fell through and failed later with an
        # UnboundLocalError on feature_df.
        raise ValueError(
            f"Unknown table_type {table_type!r}; expected 'auth_events', "
            "'process_events' or 'admin_events'"
        )

    # Convert boolean columns to int. astype() returns a new frame, so the
    # caller's DataFrame is never written to (the process_events branch used
    # to assign into a column subset of df).
    bool_cols = feature_df.select_dtypes(include='bool').columns
    feature_df = feature_df.astype({col: int for col in bool_cols})

    # Fill missing values with the column mean; a column that is entirely NaN
    # has no mean, so fall back to 0 instead of leaving NaNs in the matrix.
    feature_df = feature_df.fillna(feature_df.mean()).fillna(0)

    # Scale features
    # NOTE(review): the scaler is fitted on the full table, before any
    # train/test split done by the caller - consider fitting on train only.
    scaler = MinMaxScaler()  # Better for neural networks
    X_scaled = scaler.fit_transform(feature_df)

    # Labels
    y = df['is_suspicious'].astype(int).values

    # Create sequences for LSTM (simple sliding window over row order).
    # NOTE(review): assumes df rows are already in chronological order - confirm.
    n_rows, n_features = X_scaled.shape
    if n_rows > sequence_length:
        X_sequences = np.stack([
            X_scaled[end - sequence_length:end]
            for end in range(sequence_length, n_rows)
        ])
    else:
        # Too few rows for a single window: keep a well-formed 3-D shape so
        # downstream code can still read .shape[1] and .shape[2].
        X_sequences = np.empty((0, sequence_length, n_features), dtype=X_scaled.dtype)

    # Each window is labelled with the row that follows it.
    y_sequences = y[sequence_length:].copy()

    return X_scaled, y, X_sequences, y_sequences, feature_df.columns.tolist(), scaler

# Run the preparation pipeline on the authentication events table
auth_prepared = prepare_deep_learning_data(auth_df, "auth_events")
X_auth, y_auth, X_auth_seq, y_auth_seq, auth_features, auth_scaler = auth_prepared

# Summarise what was produced (only the first five feature names are listed)
auth_feature_preview = ', '.join(auth_features[:5])
print(f"\n🔧 Deep learning data prepared:")
print(f"  • Feature matrix shape: {X_auth.shape}")
print(f"  • Sequence data shape: {X_auth_seq.shape}")
print(f"  • Features: {len(auth_features)} ({auth_feature_preview}...)")
print(f"  • Positive class rate: {y_auth.mean():.1%}")

## 2. Autoencoder for Anomaly Detection

In [None]:
if not TF_AVAILABLE:
    print("⚠️  Skipping autoencoder - TensorFlow not available")
    autoencoder = None

else:
    print("🧠 Building TensorFlow Autoencoder...")

    # Stratified 70/30 hold-out split of the flat (non-sequential) features
    X_train, X_test, y_train, y_test = train_test_split(
        X_auth, y_auth, test_size=0.3, random_state=42, stratify=y_auth
    )

    # The autoencoder is fitted on rows labelled normal (0) only
    X_train_normal = X_train[y_train == 0]
    print(f"Training autoencoder on {len(X_train_normal)} normal samples")

    # Layer sizes: bottleneck is a third of the input width, but at least 4
    input_dim = X_train.shape[1]
    encoding_dim = max(4, input_dim // 3)  # Compressed representation

    # Encoder: input -> 2*encoding_dim -> dropout -> encoding_dim
    input_layer = keras.Input(shape=(input_dim,))
    encoded = layers.Dense(encoding_dim * 2, activation='relu')(input_layer)
    encoded = layers.Dropout(0.2)(encoded)
    encoded = layers.Dense(encoding_dim, activation='relu')(encoded)

    # Decoder: mirror of the encoder, sigmoid output to match min-max scaled inputs
    decoded = layers.Dense(encoding_dim * 2, activation='relu')(encoded)
    decoded = layers.Dropout(0.2)(decoded)
    decoded = layers.Dense(input_dim, activation='sigmoid')(decoded)

    # Assemble and compile the end-to-end model
    autoencoder = Model(input_layer, decoded)
    autoencoder.compile(optimizer='adam', loss='mse', metrics=['mae'])

    print(f"\n🏗️  Autoencoder Architecture:")
    print(f"  • Input: {input_dim} features")
    print(f"  • Encoding: {encoding_dim} compressed features")
    print(f"  • Compression ratio: {input_dim/encoding_dim:.1f}:1")

    # Fit the model to reproduce its own input (target == input)
    history = autoencoder.fit(
        X_train_normal, X_train_normal,
        epochs=50,
        batch_size=32,
        validation_split=0.2,
        verbose=0,
        shuffle=True
    )

    # Side-by-side training curves: (history key, short name, y-axis label)
    plt.figure(figsize=(12, 4))
    ae_training_curves = [
        ('loss', 'Loss', 'MSE Loss'),
        ('mae', 'MAE', 'Mean Absolute Error'),
    ]
    for panel_idx, (metric_key, short_name, y_label) in enumerate(ae_training_curves, start=1):
        plt.subplot(1, 2, panel_idx)
        plt.plot(history.history[metric_key], label=f'Training {short_name}')
        plt.plot(history.history[f'val_{metric_key}'], label=f'Validation {short_name}')
        plt.title(f'Autoencoder Training {short_name}')
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()
        plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("✅ Autoencoder training completed!")

## 3. Autoencoder Anomaly Detection

In [None]:
if TF_AVAILABLE and autoencoder is not None:
    print("🔍 Testing Autoencoder Anomaly Detection")
    print("=" * 40)

    # Per-row reconstruction error (mean squared error over features) on the test set
    X_test_pred = autoencoder.predict(X_test, verbose=0)
    reconstruction_errors = np.mean(np.square(X_test - X_test_pred), axis=1)

    # Calculate threshold (95th percentile of normal samples).
    # The threshold is calibrated on the *training* normal samples: deriving it
    # from test rows selected by their test labels would leak label
    # information into the reported precision/recall.
    X_train_normal_pred = autoencoder.predict(X_train_normal, verbose=0)
    train_normal_errors = np.mean(np.square(X_train_normal - X_train_normal_pred), axis=1)
    threshold = np.percentile(train_normal_errors, 95)

    # Test-set errors of normal rows; used for the histogram below only
    normal_errors = reconstruction_errors[y_test == 0]

    print(f"Reconstruction error threshold: {threshold:.4f}")

    # Flag every test row whose error exceeds the threshold
    y_pred_ae = (reconstruction_errors > threshold).astype(int)

    # AUC is threshold-independent: it ranks rows by raw reconstruction error
    auc_ae = roc_auc_score(y_test, reconstruction_errors)

    print(f"\n📊 Autoencoder Performance:")
    print(f"  AUC: {auc_ae:.3f}")
    print("\n", classification_report(y_test, y_pred_ae, target_names=['Normal', 'Suspicious']))

    # Visualize reconstruction errors
    plt.figure(figsize=(12, 5))

    # Error distribution per class, with the decision threshold marked
    plt.subplot(1, 2, 1)
    plt.hist(normal_errors, bins=30, alpha=0.7, label='Normal', color='blue', density=True)
    plt.hist(reconstruction_errors[y_test == 1], bins=30, alpha=0.7, label='Suspicious', color='red', density=True)
    plt.axvline(threshold, color='black', linestyle='--', label=f'Threshold ({threshold:.3f})')
    plt.xlabel('Reconstruction Error')
    plt.ylabel('Density')
    plt.title('Reconstruction Error Distribution')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # ROC curve
    plt.subplot(1, 2, 2)
    fpr_ae, tpr_ae, _ = roc_curve(y_test, reconstruction_errors)
    plt.plot(fpr_ae, tpr_ae, label=f'Autoencoder (AUC = {auc_ae:.3f})', color='purple', linewidth=2)
    plt.plot([0, 1], [0, 1], 'k--', alpha=0.5)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve: Autoencoder Anomaly Detection')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Kept for the later comparison cells
    autoencoder_results = {
        'auc': auc_ae,
        'threshold': threshold,
        'reconstruction_errors': reconstruction_errors
    }

    print("✅ Autoencoder evaluation completed!")
else:
    autoencoder_results = None
    print("⚠️  Skipping autoencoder evaluation")

## 4. LSTM for Sequential Pattern Detection

In [None]:
if TF_AVAILABLE:
    print("🔄 Building LSTM for Sequential Anomaly Detection")
    print("=" * 50)

    # Prepare sequence data (stratified 70/30 split).
    # NOTE(review): consecutive windows are shifted by one row and therefore
    # overlap; a shuffled split puts near-identical windows in both train and
    # test. Consider a chronological split for an unbiased estimate.
    X_seq_train, X_seq_test, y_seq_train, y_seq_test = train_test_split(
        X_auth_seq, y_auth_seq, test_size=0.3, random_state=42, stratify=y_auth_seq
    )

    print(f"Sequential data shapes:")
    print(f"  • Training: {X_seq_train.shape}")
    print(f"  • Test: {X_seq_test.shape}")
    print(f"  • Sequence length: {X_seq_train.shape[1]}")
    print(f"  • Features per timestep: {X_seq_train.shape[2]}")

    # Build LSTM model: two stacked LSTM layers followed by a small dense head.
    # An explicit Input layer replaces the deprecated `input_shape=` argument.
    lstm_model = keras.Sequential([
        keras.Input(shape=(X_seq_train.shape[1], X_seq_train.shape[2])),
        layers.LSTM(64, return_sequences=True),
        layers.Dropout(0.3),
        layers.LSTM(32, return_sequences=False),
        layers.Dropout(0.3),
        layers.Dense(16, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(1, activation='sigmoid')
    ])

    # Precision/Recall are passed as metric objects: the bare strings
    # 'precision'/'recall' are not resolvable in every tf.keras version.
    # The explicit names keep the history keys ('precision', 'val_recall', ...).
    lstm_model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=[
            'accuracy',
            keras.metrics.Precision(name='precision'),
            keras.metrics.Recall(name='recall'),
        ]
    )

    print(f"\n🏗️  LSTM Architecture:")
    lstm_model.summary()

    # Train LSTM
    print("\n🚀 Training LSTM model...")

    # Handle class imbalance
    from sklearn.utils.class_weight import compute_class_weight

    seq_classes = np.unique(y_seq_train)
    class_weights = compute_class_weight(
        'balanced',
        classes=seq_classes,
        y=y_seq_train
    )
    # Key the weights by the actual class labels; enumerate() silently
    # mislabels them whenever the labels present are not exactly 0..k-1.
    class_weight_dict = {
        int(label): float(weight)
        for label, weight in zip(seq_classes, class_weights)
    }

    print(f"Class weights: {class_weight_dict}")

    # Train with early stopping: stop after 10 epochs without val_loss
    # improvement and roll back to the best weights seen
    early_stopping = keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True
    )

    lstm_history = lstm_model.fit(
        X_seq_train, y_seq_train,
        epochs=50,
        batch_size=32,
        validation_split=0.2,
        class_weight=class_weight_dict,
        callbacks=[early_stopping],
        verbose=1
    )

    print("✅ LSTM training completed!")

else:
    print("⚠️  Skipping LSTM - TensorFlow not available")
    lstm_model = None
    lstm_history = None

## 5. LSTM Model Evaluation

In [None]:
if not TF_AVAILABLE or lstm_model is None:
    lstm_results = None
    print("⚠️  Skipping LSTM evaluation")
else:
    print("📊 Evaluating LSTM Performance")
    print("=" * 35)

    # Predicted probabilities as a flat vector, then hard labels at 0.5
    y_pred_lstm_prob = lstm_model.predict(X_seq_test, verbose=0).flatten()
    y_pred_lstm = (y_pred_lstm_prob > 0.5).astype(int)

    # Threshold-independent ranking quality
    auc_lstm = roc_auc_score(y_seq_test, y_pred_lstm_prob)

    print(f"\n🎯 LSTM Performance:")
    print(f"  AUC: {auc_lstm:.3f}")
    print("\n", classification_report(y_seq_test, y_pred_lstm, target_names=['Normal', 'Suspicious']))

    # 2x2 dashboard: training curves on top, score distribution and ROC below
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Top row: (axis, history key, short name, y-axis label)
    lstm_history_panels = (
        (axes[0, 0], 'loss', 'Loss', 'Binary Crossentropy'),
        (axes[0, 1], 'accuracy', 'Accuracy', 'Accuracy'),
    )
    for curve_ax, history_key, short_name, y_label in lstm_history_panels:
        curve_ax.plot(lstm_history.history[history_key], label=f'Training {short_name}')
        curve_ax.plot(lstm_history.history[f'val_{history_key}'], label=f'Validation {short_name}')
        curve_ax.set_title(f'LSTM Training {short_name}')
        curve_ax.set_xlabel('Epoch')
        curve_ax.set_ylabel(y_label)
        curve_ax.legend()
        curve_ax.grid(True, alpha=0.3)

    # Bottom-left: predicted probability histogram per true class
    dist_ax = axes[1, 0]
    for class_value, class_label, class_color in ((0, 'Normal', 'blue'), (1, 'Suspicious', 'red')):
        dist_ax.hist(
            y_pred_lstm_prob[y_seq_test == class_value],
            bins=30, alpha=0.7, label=class_label, color=class_color, density=True,
        )
    dist_ax.axvline(0.5, color='black', linestyle='--', label='Threshold (0.5)')
    dist_ax.set_xlabel('Predicted Probability')
    dist_ax.set_ylabel('Density')
    dist_ax.set_title('LSTM Prediction Distribution')
    dist_ax.legend()
    dist_ax.grid(True, alpha=0.3)

    # Bottom-right: ROC curve, with the autoencoder overlaid when it was evaluated
    roc_ax = axes[1, 1]
    fpr_lstm, tpr_lstm, _ = roc_curve(y_seq_test, y_pred_lstm_prob)
    roc_ax.plot(fpr_lstm, tpr_lstm, label=f'LSTM (AUC = {auc_lstm:.3f})', color='green', linewidth=2)

    if autoencoder_results:
        # The autoencoder curve is drawn from its own (non-sequential) test split
        fpr_ae, tpr_ae, _ = roc_curve(y_test, autoencoder_results['reconstruction_errors'])
        roc_ax.plot(fpr_ae, tpr_ae, label=f'Autoencoder (AUC = {autoencoder_results["auc"]:.3f})', color='purple', linewidth=2)

    roc_ax.plot([0, 1], [0, 1], 'k--', alpha=0.5)
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('ROC Curves: Deep Learning Models')
    roc_ax.legend()
    roc_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Kept for the comparison cell
    lstm_results = {
        'auc': auc_lstm,
        'predictions': y_pred_lstm_prob
    }

    print("✅ LSTM evaluation completed!")

## 6. Model Comparison & Insights

In [None]:
print("📋 DEEP LEARNING EXPERIMENT RESULTS")
print("=" * 40)

# Collect the AUCs of whichever deep learning models were evaluated.
# float() keeps the values plain Python floats so they serialise to JSON.
dl_results = {}

if autoencoder_results:
    dl_results['Autoencoder'] = float(autoencoder_results['auc'])

if lstm_results:
    dl_results['LSTM'] = float(lstm_results['auc'])

# Traditional ML baselines for comparison.
# NOTE: these are hard-coded approximations from previous runs, not values
# computed in this notebook, so they are not measured on the same test split.
# (The former try/except around this literal could never trigger and its
# bare `except:` has been removed.)
traditional_results = {
    'Isolation Forest': 0.745,  # Approximate from previous runs
    'One-Class SVM': 0.680,
    'Local Outlier Factor': 0.665
}

# Compare all models
all_results = {**traditional_results, **dl_results}

if all_results:
    print("\n🏆 Model Performance Ranking (AUC):")
    sorted_results = sorted(all_results.items(), key=lambda x: x[1], reverse=True)

    for i, (model, auc) in enumerate(sorted_results, 1):
        model_type = "🧠 Deep Learning" if model in dl_results else "⚙️  Traditional ML"
        print(f"  {i}. {model}: {auc:.3f} {model_type}")

    # Visualize comparison (bars keep insertion order, colour encodes model family)
    plt.figure(figsize=(12, 6))

    models = list(all_results.keys())
    aucs = list(all_results.values())
    colors = ['lightblue' if model in traditional_results else 'lightcoral' for model in models]

    bars = plt.bar(range(len(models)), aucs, color=colors, alpha=0.8)
    plt.xticks(range(len(models)), models, rotation=45, ha='right')
    plt.ylabel('AUC Score')
    plt.title('Anomaly Detection Model Comparison: Traditional ML vs Deep Learning')
    plt.grid(True, alpha=0.3, axis='y')

    # Add value labels on bars
    for bar, auc in zip(bars, aucs):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                 f'{auc:.3f}', ha='center', va='bottom')

    # Legend
    from matplotlib.patches import Patch
    legend_elements = [Patch(facecolor='lightblue', label='Traditional ML'),
                       Patch(facecolor='lightcoral', label='Deep Learning')]
    plt.legend(handles=legend_elements)

    plt.tight_layout()
    plt.show()

else:
    print("\n⚠️  No model results available for comparison")

print("\n🔍 Key Insights:")
if dl_results:
    # Head-to-head only when both deep learning models were evaluated
    if autoencoder_results and lstm_results:
        if autoencoder_results['auc'] > lstm_results['auc']:
            print(f"  • Autoencoder outperforms LSTM ({autoencoder_results['auc']:.3f} vs {lstm_results['auc']:.3f})")
            print("  • Reconstruction-based anomaly detection works well for this data")
        else:
            print(f"  • LSTM outperforms Autoencoder ({lstm_results['auc']:.3f} vs {autoencoder_results['auc']:.3f})")
            print("  • Sequential patterns are important for anomaly detection")

    best_dl_model = max(dl_results.items(), key=lambda x: x[1])
    print(f"  • Best deep learning model: {best_dl_model[0]} (AUC: {best_dl_model[1]:.3f})")

    if traditional_results:
        best_traditional = max(traditional_results.items(), key=lambda x: x[1])
        if best_dl_model[1] > best_traditional[1]:
            # Relative AUC gain of the best DL model over the best baseline
            improvement = ((best_dl_model[1] / best_traditional[1]) - 1) * 100
            print(f"  • Deep learning improves over traditional ML by {improvement:.1f}%")
        else:
            print("  • Traditional ML still competitive with deep learning")
            print("  • Consider computational cost vs performance trade-off")
else:
    print("  • Deep learning models not available - install TensorFlow or PyTorch")
    print("  • Traditional ML models provide good baseline performance")

print("\n💡 Recommendations:")
print("  1. Use ensemble of top 2-3 models for production")
print("  2. Autoencoder good for unsupervised anomaly detection")
print("  3. LSTM valuable for time-series attack pattern detection")
print("  4. Consider transformer models for log text analysis")
print("  5. Implement model monitoring for performance drift")

print("\n🚀 Next Steps:")
print("  • Implement ensemble voting classifier")
print("  • Add transformer model for log text processing")
print("  • Create automated hyperparameter tuning")
print("  • Deploy best models to production API")

# Save results ('best_model' is a (name, auc) pair and is written as a JSON list)
experiment_results = {
    'timestamp': datetime.now().isoformat(),
    'deep_learning_results': dl_results,
    'traditional_ml_results': traditional_results,
    'best_model': max(all_results.items(), key=lambda x: x[1]) if all_results else None,
    'tensorflow_available': TF_AVAILABLE
}

results_path = Path('../results/deep_learning_experiments.json')
# parents=True so a missing intermediate directory does not abort the save
results_path.parent.mkdir(parents=True, exist_ok=True)

with open(results_path, 'w', encoding='utf-8') as f:
    json.dump(experiment_results, f, indent=2)

print(f"\n💾 Results saved to: {results_path}")
print("\n🧠 Deep learning experiments completed!")