In [None]:
# =============================
# Step 0: Import Required Libraries
# =============================
import os
import glob
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [None]:
# =============================
# Step 1: Data Ingestion - Combine Multiple CSV Files (if applicable)
# =============================
# Set the directory containing your CSV files (change the path as needed)
data_dir = "path/to/cicids_csvs"  # Update with your directory path

# Collect the CSV paths. sorted() is used because glob returns matches in an
# arbitrary, OS-dependent order and the combined row order should be the same
# on every run (the later seeded split depends on it).
csv_files = sorted(glob.glob(os.path.join(data_dir, "*.csv")))

# Fail early with an actionable message instead of an IndexError further down
# when the directory is wrong or contains no CSV files.
if not csv_files:
    raise FileNotFoundError(
        f"No CSV files found in {data_dir!r}; update data_dir to point at your dataset."
    )

# Read every file into its own DataFrame.
# Optional: add a column to indicate the source file or attack type, if needed
df_list = [pd.read_csv(csv_path) for csv_path in csv_files]

# A single concat also covers the one-file case and yields a fresh 0..n-1 index.
df = pd.concat(df_list, ignore_index=True)

print("Combined dataset shape:", df.shape)


In [None]:
# =============================
# Step 2: Data Preprocessing
# =============================
# Order matters here: the data is split FIRST, and the scaler and SMOTE are
# fitted on the training portion only. Fitting either on the full dataset leaks
# test-set statistics into training and puts synthetic samples into the test set,
# which inflates the reported metrics.

# 2.1 Drop Irrelevant Columns (adjust as per your dataset)
# Header names are stripped first so a stray leading/trailing space in a CSV
# header cannot make the drop (errors='ignore') or the 'Label' lookup miss silently.
# Working on a new frame keeps `df` untouched, so this cell can be re-run safely.
columns_to_drop = ['Flow ID', 'Source IP', 'Destination IP', 'Timestamp']
df_clean = (
    df.rename(columns=lambda name: name.strip() if isinstance(name, str) else name)
      .drop(columns=columns_to_drop, errors='ignore')
)

if 'Label' not in df_clean.columns:
    raise KeyError(
        "Expected a 'Label' column; available columns: "
        + ", ".join(map(str, df_clean.columns))
    )

# 2.2 Handle Missing Values
# +/-inf is converted to NaN first: fillna() does not touch infinities and
# StandardScaler refuses non-finite input.
df_clean = df_clean.replace([np.inf, -np.inf], np.nan)
# Fill missing values for numeric columns only
numeric_medians = df_clean.select_dtypes(include=[np.number]).median()
df_clean = df_clean.fillna(numeric_medians)

# 2.3 Encode the Label Column
# label_encoder.classes_ keeps the original class names for later reporting.
label_encoder = LabelEncoder()
y = pd.Series(
    label_encoder.fit_transform(df_clean['Label']),
    index=df_clean.index,
    name='Label',
)

# 2.4 Separate Features and Labels
X = df_clean.drop(columns='Label')

# 2.5 Split Data into Training and Testing Sets
# stratify=y keeps every class represented in both parts despite class imbalance.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 2.6 Feature Scaling: statistics come from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# 2.7 Balance the TRAINING set using SMOTE (the test set stays real and untouched)
# k_neighbors is capped by the rarest class so that SMOTE still works when the
# split leaves a class with fewer than the default 5 neighbours available.
smallest_class = int(y_train_raw.value_counts().min())
smote = SMOTE(random_state=42, k_neighbors=max(1, min(5, smallest_class - 1)))
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train_raw)

# Shuffle the resampled rows: the oversampler's output is not guaranteed to be
# mixed (synthetic rows are typically grouped after the originals), and a later
# positional hold-out such as Keras' validation_split takes the tail of the data.
shuffled_order = np.random.default_rng(42).permutation(len(X_resampled))
X_train = np.asarray(X_resampled)[shuffled_order]
y_train = pd.Series(np.asarray(y_resampled)[shuffled_order], name='Label')

print("Preprocessing complete. Training samples:", X_train.shape[0], 
      "Test samples:", X_test.shape[0])

In [None]:
# =============================
# Step 3: DL Model for Feature Extraction using Autoencoder
# =============================
# A small dense autoencoder is trained to reconstruct its own input; the
# bottleneck activations are then used as features for the classifier.

# Seed TensorFlow so weight initialisation and batch shuffling are repeatable,
# matching the fixed seed (42) used by the other stochastic steps.
tf.random.set_seed(42)

# Define Autoencoder architecture
input_dim = X_train.shape[1]  # number of input features
encoding_dim = 32  # Dimension of the latent (encoded) space

# Build the autoencoder model
input_layer = Input(shape=(input_dim,))
# Encoder part: input_dim -> 64 -> encoding_dim
encoded = Dense(64, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
# Decoder part: encoding_dim -> 64 -> input_dim
decoded = Dense(64, activation='relu')(encoded)
# Linear output because the reconstruction target is the unbounded input matrix
decoded = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='mse')

print("Training Autoencoder for Feature Extraction...")
# Input and target are the same matrix (reconstruction); labels are not used here.
autoencoder.fit(X_train, X_train, 
                epochs=20, 
                batch_size=32, 
                validation_split=0.1, 
                verbose=1)

# Extract encoder model to generate latent features.
# It shares the trained layers with `autoencoder`, so no further training is needed.
encoder = Model(inputs=input_layer, outputs=encoded)

# Generate latent (encoded) features for training and testing data
X_train_encoded = encoder.predict(X_train)
X_test_encoded = encoder.predict(X_test)

print("Encoded training features shape:", X_train_encoded.shape)
print("Encoded testing features shape:", X_test_encoded.shape)

In [None]:
# =============================
# Step 4: ML Classifier on Encoded Features
# =============================

# Fit a 100-tree Random Forest (fixed seed) on the latent features.
# fit() returns the estimator itself, so construction and training are chained.
rf_clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(
    X_train_encoded, y_train
)

# Class predictions for the encoded test samples
y_pred_rf = rf_clf.predict(X_test_encoded)

In [None]:
# =============================
# Step 5: Evaluation
# =============================

# Report every class known to the label encoder, shown by its original name
# rather than its integer code. Passing `labels` explicitly also pins the row and
# column order of the confusion matrix, even if a class is absent from the test data.
class_ids = np.arange(len(label_encoder.classes_))
class_names = [str(name) for name in label_encoder.classes_]

print("\nRandom Forest Classification Report on Encoded Features:")
print(classification_report(y_test, y_pred_rf,
                            labels=class_ids, target_names=class_names))
# Rows are true classes, columns are predicted classes, both in class_ids order.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf, labels=class_ids))