In [1]:
import pandas as pd
import numpy as np
import os, glob

from pathlib import Path

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
def load_dataset_from_structure(root_path):
    data = []
    for file in root_path.glob('*/*/*.csv'):
        try:
            df = pd.read_csv(file)

            category = file.parents[1].name  # DDoS, DoS, etc.
            attack = file.parent.name        # DDoS ICMP, DoS TCP, etc.
            label_class = 'Benign' if category.upper() == 'BENIGN' else 'Attack'

            df['category'] = category
            df['attack'] = attack
            df['class'] = label_class

            data.append(df)
        except Exception as e:
            print(f"[ERROR] Failed to read file {file}: {e}")
    return pd.concat(data, ignore_index=True)

# Load train
train_root = Path('../../Data/CICIoMT2024/train')
train_df = load_dataset_from_structure(train_root)

# Load test
test_root = Path('../../Data/CICIoMT2024/test')
test_df = load_dataset_from_structure(test_root)

# Cek ringkasan
print("Train set:", train_df.shape)
print(train_df[['category', 'attack', 'class']].value_counts())
print("\nTest set:", test_df.shape)
print(test_df[['category', 'attack', 'class']].value_counts())
# Gabungkan train dan test menjadi satu DataFrame
df = pd.concat([train_df, test_df], ignore_index=True)

ValueError: No objects to concatenate

In [None]:
# Label encoding
le = LabelEncoder()
y_encoded = le.fit_transform(df['attack'])


# Fitur numerik + scaling
X = df.drop(['class', 'category', 'attack'], axis=1, errors='ignore').select_dtypes(include=[np.number])
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

In [None]:


models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),

    'AdaBoost': AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=3, class_weight='balanced'),
        n_estimators=100,
        algorithm='SAMME',
        random_state=42
    ),

    'XGBoost': XGBClassifier(
        use_label_encoder=False,
        objective='multi:softmax',
        eval_metric='mlogloss',
        num_class=len(le.classes_),
        random_state=42
    ),

    'LightGBM': LGBMClassifier(
        objective='multiclass',
        num_class=len(le.classes_),
        random_state=42
    ),

    'CatBoost': CatBoostClassifier(
        verbose=0,
        iterations=100,
        depth=6,
        loss_function='MultiClass',
        random_seed=42
    ),

    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=42),
}


In [None]:
models['Voting Classifier'] = VotingClassifier(
    estimators=[
        ('rf', models['Random Forest']),
        ('xgb', models['XGBoost']),
        ('lgbm', models['LightGBM']),
    ],
    voting='hard'
)


In [None]:
results = {}

for name, model in models.items():
    print(f"\n===== {name} =====")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Inverse ke label asli
    y_pred_labels = le.inverse_transform(y_pred)
    y_true_labels = le.inverse_transform(y_test)

    # Classification Report
    report = classification_report(
        y_true_labels, y_pred_labels, digits=5,
        output_dict=True, zero_division=0
    )
    report_df = pd.DataFrame(report).transpose()
    print(report_df)

    # Confusion Matrix
    cm = confusion_matrix(y_true_labels, y_pred_labels, labels=le.classes_)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=le.classes_, yticklabels=le.classes_)
    plt.title(f"Confusion Matrix: {name}")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.show()

    results[name] = {
        'report': report_df,
        'confusion_matrix': cm
    }


In [None]:
performance_data = []

for name, result in results.items():
    report = result['report']
    accuracy = report.loc['accuracy']['precision'] if 'accuracy' in report.index else None
    macro = report.loc['macro avg']
    performance_data.append({
        'Model': name,
        'Accuracy': accuracy,
        'Precision (Macro Avg)': macro['precision'],
        'Recall (Macro Avg)': macro['recall'],
        'F1-Score (Macro Avg)': macro['f1-score']
    })

performance_df = pd.DataFrame(performance_data)
performance_df.sort_values(by='F1-Score (Macro Avg)', ascending=False, inplace=True)
performance_df.reset_index(drop=True, inplace=True)
display(performance_df)


In [None]:
bar_width = 0.2
index = np.arange(len(performance_df))

plt.figure(figsize=(12, 6))
plt.bar(index, performance_df['Accuracy'], bar_width, label='Accuracy')
plt.bar(index + bar_width, performance_df['Precision (Macro Avg)'], bar_width, label='Precision')
plt.bar(index + 2*bar_width, performance_df['Recall (Macro Avg)'], bar_width, label='Recall')
plt.bar(index + 3*bar_width, performance_df['F1-Score (Macro Avg)'], bar_width, label='F1-Score')

plt.xlabel('Model')
plt.ylabel('Score')
plt.title('Performance Comparison of Ensemble Models')
plt.xticks(index + 1.5 * bar_width, performance_df['Model'], rotation=45)
plt.ylim(0, 1.05)
plt.legend()
plt.tight_layout()
plt.grid(True, axis='y')
plt.show()
