# 🧠 Model Training Notebook

Train and evaluate models for network anomaly detection.

In [None]:
# Notebook setup: make the project importable, pull in the analysis stack,
# and configure plotting.
import sys
# Put the parent directory first on the module search path; this must run
# before the `src` imports below (presumably `src/` lives one level up from
# this notebook -- confirm against the repository layout).
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import warnings
# Silences every warning for the whole session, including deprecation and
# convergence warnings; remove this line while debugging.
warnings.filterwarnings('ignore')

from src.data import DataLoader, Preprocessor
from src.models import BaselineModels, AnomalyDetector

# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline
# NOTE(review): the 'seaborn-v0_8-*' style names were introduced in
# matplotlib 3.6; older releases call this style 'seaborn-darkgrid' -- confirm
# the pinned matplotlib version.
plt.style.use('seaborn-v0_8-darkgrid')

## 1. Load and Preprocess Data

In [None]:
# Build the loader and the preprocessor from the same project config file.
config_path = '../config.yaml'
loader = DataLoader(config_path)
preprocessor = Preprocessor(config_path)

# First choice: the CICIDS2017 loader with a 10% sample to keep training quick.
# Any failure is reported and the sample CSV under ../data/raw is read instead,
# so the rest of the notebook can still run.
try:
    df = loader.load_cicids2017(sample_ratio=0.1)
except Exception as e:
    print(f'Error loading data: {e}')
    print('Using sample data...')
    df = pd.read_csv('../data/raw/sample_data.csv')

In [None]:
# Clean the raw frame, then encode its labels.
df = preprocessor.encode_labels(preprocessor.clean_data(df))

# Derive the feature matrix and target, then scale the features.
# NOTE(review): scaling runs on the full matrix before the train/val/test
# split in the next cell; if scale_features fits its scaler here, statistics
# from the test rows leak into training -- confirm against Preprocessor.
X, y = preprocessor.prepare_features(df)
X = preprocessor.scale_features(X)

# Report both shapes, one per line.
print(f'X shape: {X.shape}', f'y shape: {y.shape}', sep='\n')

In [None]:
# Partition features and labels into train / validation / test subsets.
# split_data is expected to yield six items in exactly this order.
splits = preprocessor.split_data(X, y)
X_train, X_val, X_test, y_train, y_val, y_test = splits

## 2. Handle Class Imbalance

In [None]:
# Rebalance the training split only (method='smote'); the validation and test
# splits are not passed in and therefore keep their original distribution.
balanced = preprocessor.handle_imbalance(X_train, y_train, method='smote')
X_train_balanced, y_train_balanced = balanced

# Show how many training rows exist before and after rebalancing.
print(
    f'Before SMOTE: {len(X_train)}',
    f'After SMOTE: {len(X_train_balanced)}',
    sep='\n',
)

## 3. Train XGBoost Model

In [None]:
# Model registry built from the project configuration.
models = BaselineModels('../config.yaml')

# Fit XGBoost on the rebalanced training data; the untouched validation split
# is handed to train() as the trailing two positional arguments.
xgb_inputs = (X_train_balanced, y_train_balanced, X_val, y_val)
models.train('xgboost', *xgb_inputs)

In [None]:
# Class labels in the label encoder's own order, used to name report rows.
class_names = [*preprocessor.label_encoder.classes_]

# Score XGBoost on the held-out test split.
results = models.evaluate('xgboost', X_test, y_test, class_names)

## 4. Confusion Matrix

In [None]:
# Heatmap of the XGBoost confusion matrix stored in the evaluation results;
# both axes are ticked with the class names.
cm = results['confusion_matrix']

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix - XGBoost')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
fig.tight_layout()
plt.show()

## 5. Feature Importance

In [None]:
# Plot the highest-ranked XGBoost feature importances as horizontal bars.
top_n = 20  # how many features to show; must be >= 1

# Coerce to an ndarray so the index-array lookup below also works when the
# model wrapper returns a plain list (an ndarray passes through unchanged).
importance = np.asarray(models.get_feature_importance('xgboost'))
feature_names = preprocessor.feature_columns

# argsort is ascending, so the tail of the ordering holds the largest values.
# barh draws the first entry at the bottom, which leaves the most important
# feature at the top of the chart.
indices = np.argsort(importance)[-top_n:]
positions = range(len(indices))

plt.figure(figsize=(10, 8))
plt.barh(positions, importance[indices], color='steelblue')
plt.yticks(positions, [feature_names[i] for i in indices])
plt.xlabel('Feature Importance')
# Title reports the number of bars actually drawn: this is smaller than top_n
# when the model has fewer features than that.
plt.title(f'Top {len(indices)} Most Important Features')
plt.tight_layout()
plt.show()

## 6. Train Random Forest (Comparison)

In [None]:
# Comparison model: fit a random forest on the same rebalanced training data
# (no validation split is passed here) and score it on the same test split.
rf_name = 'random_forest'
models.train(rf_name, X_train_balanced, y_train_balanced)
rf_results = models.evaluate(rf_name, X_test, y_test, class_names)

## 7. Model Comparison

In [None]:
# One row per model, built from the two evaluation result dicts; the mapping
# order (XGBoost first) fixes the row order of the table.
scores = {'XGBoost': results, 'Random Forest': rf_results}
comparison = pd.DataFrame({
    'Model': list(scores),
    'Accuracy': [res['accuracy'] for res in scores.values()],
    'F1 Score': [res['f1_score'] for res in scores.values()],
})

# Print the table without the integer index column.
print('Model Comparison:')
print(comparison.to_string(index=False))

In [None]:
# Grouped bar chart: for every model, an accuracy bar and an F1 bar side by
# side, each shifted half a bar width off the model's tick position.
fig, ax = plt.subplots(figsize=(8, 5))
x = np.arange(len(comparison))
width = 0.35
half = width / 2

bars1 = ax.bar(x - half, comparison['Accuracy'], width,
               label='Accuracy', color='steelblue')
bars2 = ax.bar(x + half, comparison['F1 Score'], width,
               label='F1 Score', color='coral')

ax.set(xlabel='Model', ylabel='Score', title='Model Comparison')
ax.set_xticks(x)
ax.set_xticklabels(comparison['Model'])
# Both metrics are plotted on a fixed 0-1 axis.
ax.set_ylim([0, 1])
ax.legend()

fig.tight_layout()
plt.show()

## 8. Save Best Model

In [None]:
# Pick the winner by test-set F1. The comparison is strict, so a tie goes to
# the random forest.
if results['f1_score'] > rf_results['f1_score']:
    best_model = 'xgboost'
else:
    best_model = 'random_forest'
print(f'Best model: {best_model}')

# Persist the chosen model and the preprocessing artifacts to the same folder.
output_dir = '../models'
models.save_model(best_model, output_dir)
preprocessor.save_artifacts(output_dir)