In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import learning_curve
import numpy as np

In [None]:
# Read dataset
df = pd.read_csv(r"Enter the path of the CICIDS2017 dataset")

# Keep only the rows labelled 'BENIGN' or 'DoS'.
# .copy() yields an independent frame so the column assignments below do not
# write into a slice of the raw frame (avoids chained-assignment warnings).
df = df[df['Label'].isin(['BENIGN', 'DoS'])].copy()
if df.empty:
    # Fail early with a readable message instead of a cryptic error at split time.
    raise ValueError("No rows with Label 'BENIGN' or 'DoS' found; check the dataset path and label names.")

# Select columns by numeric dtype (rather than "not object") so that string,
# categorical or boolean columns can never end up in the arithmetic below.
numeric_features = df.select_dtypes(include = 'number').columns

# An infinite value would become the column maximum and squash every finite
# value of that column to 0 after scaling; treat +/-inf as missing instead.
df[numeric_features] = df[numeric_features].replace([np.inf, -np.inf], np.nan)

# Min-max normalization (constant columns give 0/0 = NaN and are zero-filled below)
# NOTE(review): min/max are computed over all rows, i.e. before the
# train/validation/test split, so the held-out rows influence the scaling.
col_min = df[numeric_features].min()
col_range = df[numeric_features].max() - col_min
df[numeric_features] = (df[numeric_features] - col_min) / col_range

# Fill empty values by 0
df = df.fillna(0)

# Drop low variance features (variance is measured on the scaled values)
VARIANCE_THRESHOLD = 0.00001
df1 = df.drop('Label', axis = 1)
variances = df1[numeric_features].var()
features = variances[variances < VARIANCE_THRESHOLD].index.tolist()
df = df.drop(features, axis = 1)

# Label encoding for 'Label' column
labelencoder = LabelEncoder()
df['Label'] = labelencoder.fit_transform(df['Label'])

# Split dataset into features (X) and labels (y)
X = df.drop(['Label'], axis = 1)
y = df['Label']

# Split data into 70% training, 20% validation, and 10% testing sets.
# First hold out 30%, then give one third of that hold-out (10% overall) to the
# test set; a literal 0.33 would yield roughly 20.1% / 9.9% instead.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size = 0.3, random_state = 0, stratify = y)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size = 1 / 3, random_state = 0, stratify = y_holdout)

In [None]:
# Define individual classifiers
# A fixed seed makes the stochastic models (bagging, boosting, random tie-breaks
# in tree splits) reproducible across kernel restarts; 0 matches the seed used
# for the train/validation/test split. K-NN has no random component.
RANDOM_STATE = 0
classifiers = [
    ('K-Nearest Neighbours', KNeighborsClassifier()),
    ('Random Forest', RandomForestClassifier(random_state = RANDOM_STATE)),
    ('Decision Tree', DecisionTreeClassifier(random_state = RANDOM_STATE)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state = RANDOM_STATE)),
    ('XGB', xgb.XGBClassifier(random_state = RANDOM_STATE)),
    ('Extra Trees', ExtraTreesClassifier(random_state = RANDOM_STATE))
]

# Train each classifier on the training split and score it on the validation split.
# `results` collects (name, fitted estimator, row-normalised confusion matrix in %)
# for the plotting cell further down.
results = []
for name, clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)

    accuracy = accuracy_score(y_val, y_pred)
    # Precision/recall use scikit-learn's binary default, i.e. they are reported
    # for the class encoded as 1 (presumably 'DoS', since LabelEncoder sorts labels).
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    cm = confusion_matrix(y_val, y_pred)
    # Normalise each row (true class) so that it sums to 100 %.
    cm_percentage = cm / cm.sum(axis = 1, keepdims = True) * 100

    results.append((name, clf, cm_percentage))

    print(f"Classifier: {name}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print('================================================')

In [None]:
# Create a voting classifier (hard voting = majority vote over predicted labels)
voting_clf = VotingClassifier(estimators = classifiers, voting = 'hard')

# Train the voting classifier on training data
voting_clf.fit(X_train, y_train)

# Evaluate the voting classifier on testing data
# NOTE(review): the individual classifiers are scored on the validation split,
# while the ensemble is scored on the test split, so the numbers and confusion
# matrices are not computed on the same rows.
y_pred_voting = voting_clf.predict(X_test)

accuracy_voting = accuracy_score(y_test, y_pred_voting)
precision_voting = precision_score(y_test, y_pred_voting)
recall_voting = recall_score(y_test, y_pred_voting)
cm_voting = confusion_matrix(y_test, y_pred_voting)
# Normalise each row (true class) so that it sums to 100 %.
cm_voting_percentage = cm_voting / cm_voting.sum(axis = 1, keepdims = True) * 100

# Drop any entry left over from a previous run of this cell before appending,
# so re-running the cell does not add duplicate panels to the results plot.
results = [entry for entry in results if entry[0] != 'Voting Classifier']
results.append(('Voting Classifier', voting_clf, cm_voting_percentage))

print("Voting Classifier:")
print(f"Accuracy: {accuracy_voting}")
print(f"Precision: {precision_voting}")
print(f"Recall: {recall_voting}")

In [None]:
# Plot confusion matrices for all classifiers
# Size the grid from the number of results instead of hard-coding 4x2, so no
# result is silently dropped and no empty frame is drawn.
n_cols = 2
n_rows = max(1, (len(results) + n_cols - 1) // n_cols)  # ceiling division
fig, axes = plt.subplots(nrows = n_rows, ncols = n_cols, figsize = (15, 5 * n_rows),
                         squeeze = False)
axes = axes.flatten()
for ax, (name, _, cm_percentage) in zip(axes, results):
    # Pre-format the cell labels with a percent sign; passing them via `annot`
    # with an empty `fmt` avoids re-parsing the rendered annotation text.
    cell_labels = np.array([[f"{value:.2f}%" for value in row] for row in cm_percentage])
    sns.heatmap(cm_percentage, annot = cell_labels, fmt = '', cmap = 'Blues', ax = ax,
                cbar = False, annot_kws = {"size": 10})
    ax.set_title(f'Confusion Matrix for {name}')
    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')

# Hide the axes that received no confusion matrix (odd number of results).
for unused_ax in axes[len(results):]:
    unused_ax.set_axis_off()

plt.tight_layout()
plt.show()

In [None]:
# Learning curve of the voting classifier: accuracy versus number of training
# examples, using 5-fold cross-validation on the training split.
fig, ax = plt.subplots(figsize = (10, 6))
train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
    voting_clf,
    X_train,
    y_train,
    train_sizes = np.linspace(.1, 1.0, 10),
    cv = 5,
    scoring = 'accuracy',
    n_jobs = -1,
    return_times = True,
)

# Aggregate over the CV folds (axis 1) for every training-set size.
train_scores_mean = train_scores.mean(axis = 1)
train_scores_std = train_scores.std(axis = 1)
test_scores_mean = test_scores.mean(axis = 1)
test_scores_std = test_scores.std(axis = 1)
ax.grid()

# (mean, std, colour, legend label) for each curve
curves = [
    (train_scores_mean, train_scores_std, "r", "Training score"),
    (test_scores_mean, test_scores_std, "g", "Cross-validation score"),
]

# Shaded bands: mean +/- one standard deviation across folds.
for curve_mean, curve_std, curve_color, _ in curves:
    ax.fill_between(train_sizes, curve_mean - curve_std, curve_mean + curve_std,
                    alpha = 0.1, color = curve_color)

# Mean score lines on top of the bands.
for curve_mean, _, curve_color, curve_label in curves:
    ax.plot(train_sizes, curve_mean, 'o-', color = curve_color, label = curve_label)

ax.legend(loc = "best")
ax.set_xlabel("Training examples")
ax.set_ylabel("Score")
ax.set_title("Learning Curve (Voting Classifier)")
plt.show()