<a href="https://colab.research.google.com/github/hananbahtiti/Hybrid-Intrusion-detection-Systems/blob/main/Host_IDS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ========== IMPORT LIBRARIES ==========
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Scikit-learn for preprocessing and evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, f1_score, classification_report, confusion_matrix

# TensorFlow and Keras for deep learning
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Keep notebook output readable: only TensorFlow log messages at ERROR level or above are shown
tf.get_logger().setLevel('ERROR')

# ========== SETTINGS ==========
# Root folder (under the Google Drive mount point) for everything this notebook writes
BASIC_FOLDER = '/content/drive/MyDrive/hybrid_IDS/host'

# Input CSV; the loading step expects it to contain a 'Label' column
DATA_PATH = '/content/drive/MyDrive/hybrid_IDS/dataset/all/balanced_attack.csv'

# All artefacts (results CSV, saved model, plots) live in one directory.
# PLOT_DIR keeps its trailing slash, so file names are appended to it directly.
PLOT_DIR = f'{BASIC_FOLDER}/model/'
OUTPUT_CSV = f'{PLOT_DIR}anomaly_detection_results.csv'
MODEL_PATH = f'{PLOT_DIR}autoencoder_model.keras'

# Make sure the artefact directory exists before anything is written into it
os.makedirs(PLOT_DIR, exist_ok=True)

# ========== LOAD & PREPARE DATA ==========
# Load dataset
data = pd.read_csv(DATA_PATH)

# Split into features (X) and labels (y)
X = data.drop(columns=['Label'])
y = data['Label']

# The autoencoder is trained on 'normal' samples only (label == 0).
# Split their row positions *before* scaling so the scaler can be fitted on
# the training rows alone; the split itself (same size, same random_state)
# selects the same rows as splitting the already-scaled normal matrix would.
normal_idx = np.flatnonzero((y == 0).to_numpy())
train_idx, val_idx = train_test_split(normal_idx, test_size=0.2, random_state=42)

# Fit the normalisation statistics on normal *training* rows only.
# Fitting on the full dataset would leak information from the attack rows and
# from the held-out validation rows into the preprocessing step.
scaler = StandardScaler()
scaler.fit(X.iloc[train_idx])

# Apply that single fitted transform to every row, so training, validation
# and later scoring all share one feature scale.
X_scaled = scaler.transform(X)

# Scaled views used downstream: all normal rows, plus the train/validation split
X_normal = X_scaled[normal_idx]
X_train = X_scaled[train_idx]
X_val = X_scaled[val_idx]

# ========== AUTOENCODER MODEL ==========
# One input unit per feature column of the training matrix
input_dim = X_train.shape[1]
input_layer = Input(shape=(input_dim,))

# Encoder: three ReLU layers narrowing 64 -> 32 -> 16 (the 16-unit layer is the bottleneck)
encoded = input_layer
for units in (64, 32, 16):
    encoded = Dense(units, activation='relu')(encoded)

# Decoder: widen to 32 units, then project back to the input width.
# The output layer is linear so reconstructed values are not clipped or squashed.
decoded = Dense(32, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

# Assemble the model and train it to minimise mean squared reconstruction error
autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# Callbacks, both driven by validation loss:
#   - stop after 2 epochs without improvement and roll back to the best weights
#   - write the model to MODEL_PATH whenever validation loss improves
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
model_checkpoint = ModelCheckpoint(MODEL_PATH, monitor='val_loss', save_best_only=True, verbose=1)

# ========== TRAINING ==========
# The model learns to reproduce its own input, so the training matrix is passed
# as both features and target (and likewise for the validation split).
training_callbacks = [early_stop, model_checkpoint]
history = autoencoder.fit(
    x=X_train,
    y=X_train,
    validation_data=(X_val, X_val),
    batch_size=32,
    epochs=100,
    shuffle=True,
    callbacks=training_callbacks,
    verbose=1,
)
print(f"[✓] Model saved to: {MODEL_PATH}")




Epoch 1/100
144185/225000 ━━━━━━━━━━━━━━━━━━━━ 3:04 2ms/step - loss: 0.0032

In [None]:
# Mount Google Drive at /content/drive, the root of every path this notebook reads or writes.
# NOTE(review): this cell sits after the cell that already loads the dataset from
# that location; it presumably has to be executed first — confirm the intended cell order.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
# ========== PREDICTION ==========
# Run every sample (normal and attack alike) through the trained autoencoder
X_pred = autoencoder.predict(X_scaled)

# Per-sample reconstruction error: squared residuals averaged over the feature axis
mse = ((X_pred - X_scaled) ** 2).mean(axis=1)

# The cut-off is the 95th percentile of the errors of known-normal rows, so
# roughly 5% of normal samples end up above it by construction.
# NOTE(review): these normal rows include the ones the model was trained on —
# confirm whether calibrating on held-out normal data is wanted instead.
normal_errors = mse[y == 0]
threshold = np.percentile(normal_errors, 95)

# 1 = anomaly (error strictly above the cut-off), 0 = normal
predictions = (mse > threshold).astype(int)

# ========== SAVE PREDICTIONS ==========
# Copy of the loaded dataset with the error and the predicted flag added as two columns
output_df = data.assign(reconstruction_error=mse, predicted=predictions)

# Persist the annotated dataset without the pandas index column
output_df.to_csv(OUTPUT_CSV, index=False)
print(f"[✓] Predictions saved to: {OUTPUT_CSV}")

In [None]:
# ========== METRICS ==========
# Ground-truth labels as a plain NumPy array
y_true = y.to_numpy()

# AUC is computed from the raw reconstruction error (threshold-free ranking);
# F1 is computed from the thresholded 0/1 predictions.
auc = roc_auc_score(y_true, mse)
f1 = f1_score(y_true, predictions)

# Report both scalar scores, then the per-class precision/recall breakdown
for metric_name, metric_value in (("AUC", auc), ("F1", f1)):
    print(f"{metric_name} Score: {metric_value:.4f}")
report = classification_report(y_true, predictions)
print("\nClassification Report:\n", report)

# ========== CONFUSION MATRIX HEATMAP ==========
# Tick names for class 0 and class 1, in that order
class_names = ['Normal', 'Anomaly']

# Pin the label order to [0, 1] so the matrix is always 2x2 and its rows and
# columns line up with the tick names, even if one class is absent from the
# predictions.
conf_mat = confusion_matrix(y_true, predictions, labels=[0, 1])

# Draw the matrix as an annotated heatmap (rows = true class, columns = predicted class)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
# Same layout pass the loss and error-distribution figures use, so the axis
# labels are not clipped in the saved PNG.
fig.tight_layout()

# Save confusion matrix plot, then display it and release the figure
conf_path = os.path.join(PLOT_DIR, 'confusion_matrix.png')
fig.savefig(conf_path)
print(f"[✓] Confusion matrix plot saved to: {conf_path}")
plt.show()
plt.close(fig)

# ========== PLOT TRAINING LOSS ==========
# One curve per metric recorded during fitting: training loss and validation loss per epoch
fig, ax = plt.subplots(figsize=(8, 4))
for history_key, curve_label in (('loss', 'Train Loss'), ('val_loss', 'Validation Loss')):
    ax.plot(history.history[history_key], label=curve_label)
ax.set_title('Autoencoder Training Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
ax.grid(True)
fig.tight_layout()

# Write the figure next to the other artefacts, then display it and release it
loss_path = os.path.join(PLOT_DIR, 'training_loss.png')
fig.savefig(loss_path)
print(f"[✓] Loss plot saved to: {loss_path}")
plt.show()
plt.close(fig)

# ========== PLOT RECONSTRUCTION ERROR DISTRIBUTION ==========
# Pair each sample's reconstruction error with its true label so the two classes can be plotted separately
error_df = pd.DataFrame({'reconstruction_error': mse, 'true_label': y_true})

# Overlay one density-normalised histogram per class on shared axes
fig, ax = plt.subplots(figsize=(8, 4))
for class_value, class_color, class_label in ((0, 'blue', 'Normal'), (1, 'red', 'Attack')):
    class_errors = error_df.loc[error_df['true_label'] == class_value, 'reconstruction_error']
    sns.histplot(class_errors, bins=50, color=class_color, label=class_label,
                 stat='density', ax=ax)

# Dashed vertical line marks the anomaly cut-off used for the predictions
ax.axvline(threshold, color='black', linestyle='--', label='Threshold')
ax.set_title('Reconstruction Error Distribution')
ax.set_xlabel('Reconstruction Error')
ax.set_ylabel('Density')
ax.legend()
ax.grid(True)
fig.tight_layout()

# Write the figure next to the other artefacts, then display it and release it
error_path = os.path.join(PLOT_DIR, 'reconstruction_error_distribution.png')
fig.savefig(error_path)
print(f"[✓] Error distribution plot saved to: {error_path}")
plt.show()
plt.close(fig)