In [None]:
import kagglehub

# Fetch the latest published version of the dataset through kagglehub.
# `path` ends up holding the directory that kagglehub reports for the files.
dataset_handle = "azalhowaide/iot-dataset-for-intrusion-detection-systems-ids"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/azalhowaide/iot-dataset-for-intrusion-detection-systems-ids?dataset_version_number=2...


100%|██████████| 525M/525M [00:13<00:00, 41.4MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/azalhowaide/iot-dataset-for-intrusion-detection-systems-ids/versions/2


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import os

# Load dataset from the directory returned by kagglehub (`path`)
csv_file_path = os.path.join(path, 'BoTNeTIoT-L01-v2.csv')
df = pd.read_csv(csv_file_path)

# Preprocessing: discard every row that has a missing value in any column
df = df.dropna()

# Separate features and labels
X = df.drop(columns=['Attack', 'Attack_subType'])  # Features
# Binary labels: 0 for 'Normal', 1 for 'gafgyt' or 'mirai'
y_binary = df['Attack'].map({'Normal': 0, 'gafgyt': 1, 'mirai': 1})
y_category = df['Attack_subType']  # Multiclass labels

# `map` yields NaN for any 'Attack' value that is not a key of the dict above.
# Drop those rows from X and from BOTH label series using one shared mask:
# filtering only X and y_binary would leave y_category longer than X and make
# the later train_test_split(X, y_category) fail with a length mismatch.
labelled = y_binary.notna()
X = X[labelled]
y_category = y_category[labelled]
y_binary = y_binary[labelled].astype(int)  # NaN forces float dtype; restore ints

# Identify categorical columns for one-hot encoding
categorical_features = X.select_dtypes(include=['object']).columns

# Create a column transformer for one-hot encoding; non-categorical columns
# are passed through unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='passthrough')

# Whale Optimization Algorithm (simplified)
def whale_optimize(X, y, n_iter=5, n_whales=5, val_size=0.2, random_state=None):
    """Pick RandomForest hyper-parameters by random search on a hold-out split.

    Despite the name, every candidate is drawn independently at random;
    ``n_iter * n_whales`` candidates are evaluated in total.

    Each candidate is fitted on one part of ``X``/``y`` and scored on the
    remaining ``val_size`` fraction. Scoring on the same rows used for fitting
    would reward overfitting and almost always select the deepest forest.

    Relies on the module-level ``preprocessor``; that shared object is refit
    for every candidate because ``Pipeline`` fits its steps in place.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix accepted by ``preprocessor``.
    y : array-like
        Target labels aligned with ``X``.
    n_iter : int, default 5
        Number of outer iterations.
    n_whales : int, default 5
        Number of candidates evaluated per iteration.
    val_size : float, default 0.2
        Fraction of the rows held out for scoring candidates.
    random_state : int or None, default None
        Seed for the hold-out split, the candidate sampling and the forests.
        ``None`` keeps using NumPy's global random state.

    Returns
    -------
    dict
        ``{'n_estimators': ..., 'max_depth': ...}`` of the best candidate, or
        the built-in default if no candidate scores above 0.
    """
    # With no seed, keep drawing from the global NumPy generator so that an
    # earlier np.random.seed(...) call still controls the search.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    X_fit, X_val, y_fit, y_val = train_test_split(
        X, y, test_size=val_size, random_state=random_state)

    best_params = {'n_estimators': 30, 'max_depth': 5}
    best_score = 0
    for _ in range(n_iter * n_whales):
        params = {
            'n_estimators': rng.randint(20, 50),  # upper bound is exclusive: 20..49
            'max_depth': rng.randint(2, 7)        # upper bound is exclusive: 2..6
        }
        # Create a pipeline with preprocessing and the model
        clf_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier(**params, random_state=random_state))])

        clf_pipeline.fit(X_fit, y_fit)
        score = clf_pipeline.score(X_val, y_val)  # accuracy on unseen rows
        if score > best_score:
            best_score = score
            best_params = params
    return best_params

# Split data ONCE for both tasks. Splitting X twice with the same seed selects
# the same rows anyway, but materialises a second copy of the feature matrix.
(X_train_b, X_test_b,
 y_train_b, y_test_b,
 y_train_c, y_test_c) = train_test_split(
    X, y_binary, y_category, test_size=0.3, random_state=42)
# Keep the per-task names available; they refer to the same feature frames.
X_train_c, X_test_c = X_train_b, X_test_b

# Optimize and train binary classifier (seeded so the reported accuracy is reproducible)
params_b = whale_optimize(X_train_b, y_train_b)
clf_b_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', RandomForestClassifier(**params_b, random_state=42))])
clf_b_pipeline.fit(X_train_b, y_train_b)
acc_b = accuracy_score(y_test_b, clf_b_pipeline.predict(X_test_b))

# Optimize and train category classifier
params_c = whale_optimize(X_train_c, y_train_c)
clf_c_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', RandomForestClassifier(**params_c, random_state=42))])
clf_c_pipeline.fit(X_train_c, y_train_c)
acc_c = accuracy_score(y_test_c, clf_c_pipeline.predict(X_test_c))

# Plot comparison bar chart. The two bars are placed side by side around a
# single 'AC' tick; drawing both at the same x position would let one bar
# cover the other.
# NOTE(review): the legend text says "IoTID20" but the file loaded in this
# notebook is BoTNeTIoT-L01-v2.csv -- confirm which dataset name is intended.
fig, ax = plt.subplots(figsize=(6, 5))
bar_width = 0.35
bars = ax.bar([-bar_width / 2], [acc_b * 100], width=bar_width, color='blue',
              label='Binary Classification of IoTID20')
bars2 = ax.bar([bar_width / 2], [acc_c * 100], width=bar_width, color='orange',
               label='Category Classification of IoTID20', alpha=0.7)
ax.set_xticks([0])
ax.set_xticklabels(['AC'])
ax.set_xlim(-1, 1)
ax.set_ylim(0, 100)
ax.set_ylabel('Percentage (%)')
ax.set_title('Classification Accuracy Comparison')
ax.legend(loc='upper right')
fig.tight_layout()
fig.savefig('classification_comparison.png', dpi=300)
plt.show()

print(f"Binary Accuracy: {acc_b:.2%}")
print(f"Category Accuracy: {acc_c:.2%}")

In [None]:
# Sanity check on the labels: list the distinct raw 'Attack' values, then
# count the mapped binary labels (NaN included) to spot unmapped categories.
attack_values = df['Attack'].unique()
print(attack_values)

binary_label_counts = y_binary.value_counts(dropna=False)
print(binary_label_counts)

['gafgyt' 'mirai' 'Normal']
Attack
NaN    6506674
0.0     555932
Name: count, dtype: int64


In [None]:
import os

import joblib  # For potential parallelization
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

# Load dataset from the directory kagglehub returned in `path` (instead of a
# hard-coded /kaggle/input location that only exists on Kaggle), and discard
# rows with any missing value.
df = pd.read_csv(os.path.join(path, 'BoTNeTIoT-L01-v2.csv')).dropna()

# Preprocessing - split features from the two label columns
X = df.drop(columns=['Attack', 'Attack_subType'])
# Binary labels: 0 for 'Normal', 1 for 'gafgyt' or 'mirai'
y_binary = df['Attack'].map({'Normal': 0, 'gafgyt': 1, 'mirai': 1})
y_category = df['Attack_subType']

# `map` yields NaN for any 'Attack' value missing from the dict above, and the
# classifier cannot be fitted on NaN targets. Remove such rows from the
# features and both label series with one mask so all three stay aligned.
labelled = y_binary.notna()
X, y_category = X[labelled], y_category[labelled]
y_binary = y_binary[labelled].astype(int)  # NaN forces float dtype; restore ints

# Fast OneHot Encoding setup: object-dtype columns are one-hot encoded into a
# dense array, all other columns are passed through unchanged
categorical_cols = X.select_dtypes(include=['object']).columns
preprocessor = ColumnTransformer(
    [('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)],
    remainder='passthrough'
)

# Simplified Whale Optimization (Reduced iterations)
def quick_whale_optimize(X, y, n_iter=3, n_whales=3, val_size=0.2, random_state=None):
    """Pick RandomForest hyper-parameters by a short random search.

    ``n_iter * n_whales`` candidates are drawn independently at random, so the
    ``n_whales`` argument is honoured rather than ignored. Each candidate is
    fitted on one part of ``X``/``y`` and scored on the held-out ``val_size``
    fraction; scoring on the fitting rows would favour the deepest forest
    regardless of how well it generalises.

    Relies on the module-level ``preprocessor``; that shared object is refit
    for every candidate because the pipeline fits its steps in place.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix accepted by ``preprocessor``.
    y : array-like
        Target labels aligned with ``X``.
    n_iter : int, default 3
        Number of outer iterations.
    n_whales : int, default 3
        Number of candidates evaluated per iteration.
    val_size : float, default 0.2
        Fraction of the rows held out for scoring candidates.
    random_state : int or None, default None
        Seed for the hold-out split, the candidate sampling and the forests.
        ``None`` keeps using NumPy's global random state.

    Returns
    -------
    dict
        ``{'n_estimators': ..., 'max_depth': ...}`` of the best candidate, or
        the built-in default if no candidate scores above 0.
    """
    # With no seed, keep drawing from the global NumPy generator so that an
    # earlier np.random.seed(...) call still controls the search.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    X_fit, X_val, y_fit, y_val = train_test_split(
        X, y, test_size=val_size, random_state=random_state)

    best_params = {'n_estimators': 30, 'max_depth': 5}
    best_score = 0
    for _ in range(n_iter * n_whales):
        params = {
            'n_estimators': rng.randint(20, 50),  # upper bound is exclusive: 20..49
            'max_depth': rng.randint(3, 10)       # upper bound is exclusive: 3..9
        }
        model = make_pipeline(
            preprocessor,
            RandomForestClassifier(**params, n_jobs=-1,  # Parallel trees
                                   random_state=random_state))
        model.fit(X_fit, y_fit)
        score = model.score(X_val, y_val)  # accuracy on unseen rows
        if score > best_score:
            best_score, best_params = score, params
    return best_params

# Train-test split, done ONCE for both tasks. Splitting X twice with the same
# seed selects the same rows anyway, but copies the feature matrix a second time.
(X_train_b, X_test_b,
 y_train_b, y_test_b,
 y_train_c, y_test_c) = train_test_split(
    X, y_binary, y_category, test_size=0.2, random_state=42)
# Keep the per-task names available; they refer to the same feature frames.
X_train_c, X_test_c = X_train_b, X_test_b

# Train models with optimized params (forests seeded so results are reproducible)
binary_clf = make_pipeline(preprocessor,
                           RandomForestClassifier(**quick_whale_optimize(X_train_b, y_train_b),
                                                  n_jobs=-1, random_state=42))
binary_clf.fit(X_train_b, y_train_b)

category_clf = make_pipeline(preprocessor,
                             RandomForestClassifier(**quick_whale_optimize(X_train_c, y_train_c),
                                                    n_jobs=-1, random_state=42))
category_clf.fit(X_train_c, y_train_c)

# Evaluate
binary_acc = accuracy_score(y_test_b, binary_clf.predict(X_test_b))
category_acc = accuracy_score(y_test_c, category_clf.predict(X_test_c))

# Visualization
accuracies_pct = [binary_acc * 100, category_acc * 100]
fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(['Binary', 'Multiclass'], accuracies_pct, color=['skyblue', 'salmon'])
ax.set_ylabel('Accuracy (%)')
ax.set_title('Model Performance Comparison')
# Leave headroom above 100 %: the value labels sit one unit above each bar and
# would otherwise be drawn outside the axes when accuracy is close to 100 %.
ax.set_ylim(0, 110)
for i, v in enumerate(accuracies_pct):
    ax.text(i, v + 1, f"{v:.1f}%", ha='center')
fig.tight_layout()
fig.savefig('quick_comparison.png', dpi=450)
plt.show()

print(f"Binary Accuracy: {binary_acc:.2%} | Multiclass Accuracy: {category_acc:.2%}")