In [8]:
import pandas as pd
import numpy as np
from pathlib import Path
import os, glob
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, Conv1D, MaxPooling1D, Flatten, LSTM, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
def load_dataset_from_structure(root_path):
    """Load every CSV stored as ``<root_path>/<category>/<attack>/*.csv`` into one DataFrame.

    Three label columns are appended to the rows of each file:

    * ``category`` - name of the grandparent directory of the file.
    * ``attack``   - name of the parent directory of the file.
    * ``class``    - ``'Benign'`` when ``category`` equals ``BENIGN``
      (case-insensitive), otherwise ``'Attack'``.

    Parameters
    ----------
    root_path : str or pathlib.Path
        Directory that contains the ``<category>/<attack>/`` sub-folders.

    Returns
    -------
    pandas.DataFrame
        All readable files concatenated with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no CSV file under ``root_path`` could be read.
    """
    root = Path(root_path)  # accept plain strings as well as Path objects
    frames = []

    # glob() yields files in filesystem order; sort so the row order of the result
    # (and therefore any later seeded shuffle/split) is the same on every machine.
    for file in sorted(root.glob('*/*/*.csv')):
        try:
            frame = pd.read_csv(file)
        except Exception as e:
            # Best effort: report the unreadable file and keep loading the rest.
            print(f"[ERROR] Failed to read file {file}: {e}")
            continue

        category = file.parents[1].name  # DDoS, DoS, etc.
        attack = file.parent.name        # DDoS ICMP, DoS TCP, etc.

        frame['category'] = category
        frame['attack'] = attack
        frame['class'] = 'Benign' if category.upper() == 'BENIGN' else 'Attack'
        frames.append(frame)

    if not frames:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(
            f"No readable CSV files found under {root} "
            "(expected layout: <category>/<attack>/*.csv)"
        )
    return pd.concat(frames, ignore_index=True)

# Locate the train and test split directories, then load each split.
train_root = Path('../../../Data/CICIoMT2024/train')
test_root = Path('../../../Data/CICIoMT2024/test')
train_df = load_dataset_from_structure(train_root)
test_df = load_dataset_from_structure(test_root)

# Summary check: shape and row count per (category, attack, class) for each split.
label_columns = ['category', 'attack', 'class']
for heading, split_df in (("Train set:", train_df), ("\nTest set:", test_df)):
    print(heading, split_df.shape)
    print(split_df[label_columns].value_counts())

# Merge train and test into a single DataFrame.
df = pd.concat([train_df, test_df], ignore_index=True)

Train set: (7160831, 48)
category  attack              class 
DDoS      DDoS UDP            Attack    1635956
          DDoS ICMP           Attack    1537476
          DDoS TCP            Attack     804465
          DDoS SYN            Attack     801962
DoS       DoS UDP             Attack     566950
          DoS SYN             Attack     441903
          DoS ICMP            Attack     416292
          DoS TCP             Attack     380384
BENIGN    BENIGN              Benign     192732
MQTT      DDoS Connect Flood  Attack     173036
RECON     Port Scan           Attack      83981
MQTT      DoS Publish Flood   Attack      44376
          DDoS Publish Flood  Attack      27623
RECON     OS Scan             Attack      16832
SPOOFING  SPOOFING            Attack      16047
MQTT      DoS Connect Flood   Attack      12773
          Malformed Data      Attack       5130
RECON     Recon VulScan       Attack       2173
          Ping Sweep          Attack        740
Name: count, dtype: int64


In [22]:
# Encode the 'class' label as integers, then one-hot encode it for the softmax heads.
le = LabelEncoder()
y_encoded = le.fit_transform(df['class'])
y_categorical = to_categorical(y_encoded)

# Keep only the numeric feature columns (the label/metadata columns are dropped first).
X = df.drop(['class', 'category', 'attack'], axis=1, errors='ignore').select_dtypes(include=[np.number])

# Fit the scaler on TRAINING rows only. Fitting on every row would leak the
# mean/variance of the held-out rows into the features the models are trained on.
# train_test_split derives its permutation from (n_samples, test_size, random_state)
# alone, so splitting a plain index array with the same arguments as the split cell
# (test_size=0.2, random_state=42) yields exactly the rows that end up in the
# training set there. Keep these two values in sync with the split cell.
train_idx, holdout_idx = train_test_split(np.arange(len(X)), test_size=0.2, random_state=42)
scaler = StandardScaler()
scaler.fit(X.iloc[train_idx])
X_scaled = scaler.transform(X)

# Prepare data for CNN/LSTM (reshape to 3D: samples, timesteps, features_per_step).
# Floor division means any trailing columns that do not fill a whole timestep are
# dropped from the sequence view (X_scaled itself keeps every column).
timesteps = 3
features_per_step = X_scaled.shape[1] // timesteps
X_seq = X_scaled[:, :timesteps * features_per_step].reshape((-1, timesteps, features_per_step))


In [24]:
# Split data: a single call applies one shuffled 80/20 permutation to the flat
# features, the sequence features and the labels, so the ANN-style and
# CNN/LSTM-style inputs stay row-aligned with the same y_train / y_test.
# NOTE(review): the split is not stratified although the printed class counts show
# Benign is a small minority - consider whether stratification is wanted.
(
    X_train_ann, X_test_ann,
    X_train_seq, X_test_seq,
    y_train, y_test,
) = train_test_split(X_scaled, X_seq, y_categorical, test_size=0.2, random_state=42)

In [26]:
# Shapes shared by every architecture below.
n_features = X_train_ann.shape[1]
n_classes = y_categorical.shape[1]
seq_shape = (timesteps, features_per_step)


def _build_classifier(input_shape, hidden_layers):
    """Return a compiled Sequential: Input -> hidden_layers -> softmax output.

    Every model in this cell uses the same output layer (one unit per one-hot
    label column) and the same compile settings, so both live here.
    """
    net = Sequential([
        Input(shape=input_shape),
        *hidden_layers,
        Dense(n_classes, activation='softmax'),
    ])
    net.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return net


# ANN model: two dense layers with dropout on the flat feature vector.
ann = _build_classifier((n_features,), [
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
])

# CNN model: one 1D convolution + pooling over the (timesteps, features_per_step) view.
cnn = _build_classifier(seq_shape, [
    Conv1D(64, kernel_size=2, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu'),
])

# LSTM model: a single recurrent layer over the sequence view.
lstm = _build_classifier(seq_shape, [
    LSTM(64),
])

# DNN model (deeper than the ANN).
dnn = _build_classifier((n_features,), [
    Dense(512, activation='relu'),
    Dropout(0.4),
    Dense(256, activation='relu'),
    Dropout(0.4),
    Dense(128, activation='relu'),
    Dropout(0.3),
])

# MLP model: three dense layers with a single dropout in the middle.
mlp = _build_classifier((n_features,), [
    Dense(256, activation='relu'),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
])

# BiLSTM model: the LSTM layer wrapped in Bidirectional.
bilstm = _build_classifier(seq_shape, [
    Bidirectional(LSTM(64)),
])

# Register every model as name -> (model, train inputs, test inputs).
flat_inputs = (X_train_ann, X_test_ann)
seq_inputs = (X_train_seq, X_test_seq)
models = {
    "ANN": (ann, *flat_inputs),
    "DNN": (dnn, *flat_inputs),
    "MLP": (mlp, *flat_inputs),
    "CNN": (cnn, *seq_inputs),
    "LSTM": (lstm, *seq_inputs),
    "BiLSTM": (bilstm, *seq_inputs),
}