# Data Preparation and EDA for IoT23 dataset

In [1]:
import os 
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict
import csv
from numpy import array
from numpy import argmax
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler,LabelEncoder

In [2]:
data= pd.read_csv('preprocessed_iot23.csv', low_memory=False)

In [3]:
data.head()  

Unnamed: 0.1,Unnamed: 0,ts,id.orig_p,id.resp_p,duration,orig_bytes,resp_bytes,local_orig,local_resp,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,proto,conn_state,label
0,0,1536227000.0,17576.0,8081.0,3e-06,0.0,0.0,0,0,0.0,2.0,80.0,0.0,0.0,1,6,POHScan
1,1,1536227000.0,17576.0,8081.0,2e-06,0.0,0.0,0,0,0.0,2.0,80.0,0.0,0.0,1,6,POHScan
2,2,1536227000.0,17576.0,8081.0,2e-06,0.0,0.0,0,0,0.0,2.0,80.0,0.0,0.0,1,6,POHScan
3,3,1536227000.0,17576.0,8081.0,2e-06,0.0,0.0,0,0,0.0,2.0,80.0,0.0,0.0,1,6,POHScan
4,4,1536227000.0,17576.0,8081.0,2e-06,0.0,0.0,0,0,0.0,2.0,80.0,0.0,0.0,1,6,POHScan


In [4]:
del data['Unnamed: 0']

In [5]:
data.tail()

Unnamed: 0,ts,id.orig_p,id.resp_p,duration,orig_bytes,resp_bytes,local_orig,local_resp,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,proto,conn_state,label
1215130,1562165000.0,68.0,67.0,90.034713,3300.0,0.0,0,0,0.0,11.0,3608.0,0.0,0.0,2,6,Benign
1215131,1562165000.0,143.0,0.0,90.39997,340.0,0.0,0,0,0.0,9.0,844.0,0.0,0.0,0,0,Benign
1215132,1562165000.0,135.0,136.0,89.82403,72.0,0.0,0,0,0.0,3.0,216.0,0.0,0.0,0,0,Benign
1215133,1562165000.0,143.0,0.0,45.215915,200.0,0.0,0,0,0.0,8.0,648.0,0.0,0.0,0,0,Benign
1215134,1562165000.0,133.0,134.0,44.242223,0.0,0.0,0,0,0.0,2.0,96.0,0.0,0.0,0,0,Benign


In [6]:
data.shape

(1215135, 16)

In [7]:
print(data.columns)

Index(['ts', 'id.orig_p', 'id.resp_p', 'duration', 'orig_bytes', 'resp_bytes',
       'local_orig', 'local_resp', 'missed_bytes', 'orig_pkts',
       'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'proto', 'conn_state',
       'label'],
      dtype='object')


In [8]:
#we drop port numbers because they can cause overfitting of the model
df= data.drop(columns=['id.orig_p','id.resp_p'])

In [9]:
#separate the X variables
X=df.iloc[:,:-1]

In [10]:
#separate labels (y)
y=data['label']

In [11]:
X.describe()

Unnamed: 0,ts,duration,orig_bytes,resp_bytes,local_orig,local_resp,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,proto,conn_state
count,1215135.0,1215135.0,1215135.0,1215135.0,1215135.0,1215135.0,1215135.0,1215135.0,1215135.0,1215135.0,1215135.0,1215135.0,1215135.0
mean,1544026000.0,0.4160833,1744.161,284.5327,0.0,0.0,0.01682529,209.7919,7640.348,0.2780588,299.2224,1.094919,4.509022
std,15862410.0,98.90378,992844.9,305301.4,0.0,0.0,8.52265,78829.27,2652670.0,217.4025,317190.9,0.314191,2.660314
min,1525880000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1533077000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,40.0,0.0,0.0,1.0,0.0
50%,1536227000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,40.0,0.0,0.0,1.0,6.0
75%,1545398000.0,2e-06,0.0,0.0,0.0,0.0,0.0,2.0,80.0,0.0,0.0,1.0,6.0
max,1569018000.0,78840.33,962347200.0,336516400.0,0.0,0.0,7363.0,66027350.0,1914793000.0,239484.0,349618700.0,2.0,12.0


In [12]:
X.isnull().sum()

ts               0
duration         0
orig_bytes       0
resp_bytes       0
local_orig       0
local_resp       0
missed_bytes     0
orig_pkts        0
orig_ip_bytes    0
resp_pkts        0
resp_ip_bytes    0
proto            0
conn_state       0
dtype: int64

In [13]:
X = np.asarray(X).astype(np.float32)

In [14]:
y

0          POHScan
1          POHScan
2          POHScan
3          POHScan
4          POHScan
            ...   
1215130     Benign
1215131     Benign
1215132     Benign
1215133     Benign
1215134     Benign
Name: label, Length: 1215135, dtype: object

In [15]:
y.value_counts()

label
Okiru      374621
DDoS       374204
POHScan    313027
Benign     145058
C&C          8225
Name: count, dtype: int64

In [16]:
le=LabelEncoder()

In [17]:
y=le.fit_transform(y)

In [18]:
y

array([4, 4, 4, ..., 0, 0, 0])

In [19]:
#the labels are encoded in alphabetic order
#0---->Benign
#1---->Cnc
#2---->DDoS
#3---->Okiru
#4---->POHScan


In [20]:
le.inverse_transform(y)

array(['POHScan', 'POHScan', 'POHScan', ..., 'Benign', 'Benign', 'Benign'],
      dtype=object)

In [21]:
y.shape

(1215135,)

=======================================================================

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import neighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import auc,roc_auc_score,roc_curve,precision_score,recall_score,f1_score
from sklearn.model_selection import train_test_split
import time as timer

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, random_state=124, test_size=0.20, shuffle=True)

In [24]:
std_scale = preprocessing.StandardScaler().fit(X_train)

In [25]:
X_train_std = std_scale.transform(X_train)

In [26]:
X_test_std  = std_scale.transform(X_test)

In [27]:
X_train.shape

(972108, 13)

# ADABoost

In [28]:
import time
from sklearn.ensemble import AdaBoostClassifier

start_time = time.time()
adaboost_model = AdaBoostClassifier()
adaboost_model.fit(X_train_std, Y_train)
end_time = time.time()

elapsed_time = end_time - start_time
print("Finished training within {:.2f} seconds".format(elapsed_time))


Finished training within 47.92 seconds


In [29]:

# Predict with AdaBoost model
y_adaboost = adaboost_model.predict(X_test_std)
y_adaboost_prob = adaboost_model.predict_proba(X_test_std)

# Classification report for AdaBoost
print("Classification report for AdaBoost: \n{}".format(classification_report(Y_test, y_adaboost)))

# Confusion matrix for AdaBoost
print("Confusion matrix for AdaBoost: \n{}".format(confusion_matrix(Y_test, y_adaboost)))

# Accuracy score for AdaBoost
accuracy_adaboost = accuracy_score(Y_test, y_adaboost)
print("Accuracy score for AdaBoost: {:.4f}".format(accuracy_adaboost))

# Precision score for AdaBoost
prec_adaboost = precision_score(Y_test, y_adaboost,  average='weighted')
print("Precision score for AdaBoost: {:.4f}".format(prec_adaboost))

# Recall score for AdaBoost
rec_adaboost = recall_score(Y_test, y_adaboost,  average='weighted')
print("Recall score for AdaBoost: {:.4f}".format(rec_adaboost))

# F1 score for AdaBoost
f1_adaboost = f1_score(Y_test, y_adaboost,  average='weighted')
print("F1 score for AdaBoost: {:.4f}".format(f1_adaboost))

# Calculate sensitivity, specificity, and AUC for AdaBoost
sens_adaboost, spec_adaboost = calc_sens_spec(Y_test, y_adaboost)
adaboost_fpr, adaboost_tpr, _ = roc_curve(Y_test, y_adaboost_prob[:,1])
auc_adaboost = roc_auc_score(Y_test, y_adaboost_prob[:,1])
print("Sensitivity score for AdaBoost: {:.2f}".format(sens_adaboost))
print("Specificity score for AdaBoost: {:.2f}".format(spec_adaboost))
print("AUC score for AdaBoost: {:.2f}".format(auc_adaboost))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification report for AdaBoost: 
              precision    recall  f1-score   support

           0       0.30      0.90      0.45     28695
           1       0.00      0.00      0.00      1626
           2       0.02      0.00      0.00     74607
           3       1.00      0.80      0.89     75258
           4       0.64      0.98      0.78     62841

    accuracy                           0.61    243027
   macro avg       0.39      0.54      0.42    243027
weighted avg       0.52      0.61      0.53    243027

Confusion matrix for AdaBoost: 
[[25801     0   254     2  2638]
 [    1     0     0     0  1625]
 [59491     0     6     0 15110]
 [   10     0     3 60206 15039]
 [ 1179     0     0     0 61662]]
Accuracy score for AdaBoost: 0.6076
Precision score for AdaBoost: 0.5178


  _warn_prf(average, modifier, msg_start, len(result))


Recall score for AdaBoost: 0.6076
F1 score for AdaBoost: 0.5289


NameError: name 'calc_sens_spec' is not defined

In [30]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import time

# Assuming you have defined X_train_std, Y_train, X_test_std, Y_test
# Also, assuming you have a LabelEncoder named le

# Create a base classifier (you can use any classifier here, e.g., DecisionTreeClassifier)
base_classifier = DecisionTreeClassifier(max_depth=1)

# Create the AdaBoostClassifier
adaboost_model = AdaBoostClassifier(base_classifier, n_estimators=50, random_state=42)

# Training the AdaBoostClassifier
start_time_adaboost = time.time()
adaboost_model.fit(X_train_std, Y_train)
end_time_adaboost = time.time()

elapsed_time_adaboost = end_time_adaboost - start_time_adaboost
print("Finished training AdaBoost Classifier within {:.2f} seconds".format(elapsed_time_adaboost))

# Prediction with the AdaBoost Classifier
y_adaboost = adaboost_model.predict(X_test_std)

# Calculate the confusion matrix for AdaBoost predictions
cf_adaboost = confusion_matrix(Y_test, y_adaboost)

# Calculate the accuracy for each class
accuracies_adaboost = cf_adaboost.diagonal() / cf_adaboost.sum(axis=1)

# Print the accuracy for each class
classes_adaboost = le.classes_  # Assuming le is your LabelEncoder
for class_label, accuracy in zip(classes_adaboost, accuracies_adaboost):
    print(f"Accuracy for class {class_label}: {accuracy:.4f}")


Finished training AdaBoost Classifier within 46.19 seconds
Accuracy for class Benign: 0.8991
Accuracy for class C&C: 0.0000
Accuracy for class DDoS: 0.0001
Accuracy for class Okiru: 0.8000
Accuracy for class POHScan: 0.9812


# Catboost

In [None]:
from catboost import CatBoostClassifier

In [None]:
start = timer.time()

# Create a CatBoostClassifier
catboost_model = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, random_seed=124, verbose=200)

# Fit the model to the training data
catboost_model.fit(X_train_std, Y_train)

end = timer.time()
print("Finished training within {:.2f} seconds".format(end - start))


In [None]:
# Predict the labels for the test set using the trained CatBoost model
y_catboost = catboost_model.predict(X_test_std)

# Predict the class probabilities for the test set
y_catboost_prob = catboost_model.predict_proba(X_test_std)

# The 'y_catboost' variable contains the predicted labels, and 'y_catboost_prob' contains the class probabilities.


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Calculate predictions using the CatBoost model
y_catboost = catboost_model.predict(X_test_std)

# Calculate the predicted class probabilities
y_catboost_prob = catboost_model.predict_proba(X_test_std)

# Convert predicted labels to integers
y_catboost = y_catboost.astype(int)

# Calculate and print the classification report
classification_rep = classification_report(Y_test, y_catboost, target_names=le.classes_)
print("Classification report for CatBoost:\n", classification_rep)

# Calculate and print the confusion matrix
confusion_mtx = confusion_matrix(Y_test, y_catboost)
print("Confusion matrix for CatBoost:\n", confusion_mtx)

# Calculate and print the accuracy score
accuracy = accuracy_score(Y_test, y_catboost)
print("Accuracy score for CatBoost: {:.4f}".format(accuracy))

# Calculate and print precision, recall, and F1 scores
prec_catboost = precision_score(Y_test, y_catboost, average='weighted')
rec_catboost = recall_score(Y_test, y_catboost, average='weighted')
f1_catboost = f1_score(Y_test, y_catboost, average='weighted')
print("Precision score for CatBoost: {:.4f}".format(prec_catboost))

recall_catboost = recall_score(Y_test, y_catboost, average='weighted')
print("Recall score for CatBoost: {:.4f}".format(recall_catboost))

# Calculate and print F1 score
f1_catboost = f1_score(Y_test, y_catboost, average='weighted')
print("F1 score for CatBoost: {:.4f}".format(f1_catboost))

In [None]:
from sklearn.metrics import accuracy_score

# Calculate predictions using the CatBoost model
y_catboost = catboost_model.predict(X_test_std)

# Calculate the confusion matrix for CatBoost predictions
cf_catboost = confusion_matrix(Y_test, y_catboost)

# Calculate the accuracy for each class
accuracies = cf_catboost.diagonal() / cf_catboost.sum(axis=1)

# Now, 'accuracies' saves the accuracy for each class

# Print the accuracy for each class
classes = le.classes_  # Assuming le is your LabelEncoder
for class_label, accuracy in zip(classes, accuracies):
    print(f"Accuracy for class {class_label}: {accuracy:.4f}")


In [None]:
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Calculate predictions using the CatBoost model
y_catboost = catboost_model.predict(X_test_std)

# Calculate the confusion matrix for CatBoost predictions
cf_catboost = confusion_matrix(Y_test, y_catboost)

# Calculate the accuracy for each class
accuracies = cf_catboost.diagonal() / cf_catboost.sum(axis=1)

# Get class labels from the LabelEncoder
classes = le.classes_

# Create a pie chart to visualize the accuracy for each class
plt.figure(figsize=(8, 8))
plt.pie(accuracies, labels=classes, autopct='%1.1f%%', startangle=140)
plt.title('Accuracy for Each Class')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.savefig(r'C:\Users\Junaid Abbas\Desktop\Research Project\Iot_23\pictures\accuracy_pie_chart.png')

# Show the pie chart
plt.show()


# FNNs

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense

# Define the FNN model
model = keras.Sequential([
    Dense(64, activation='relu', input_shape=(X_train_std.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the FNN model
start = timer.time()
model.fit(X_train_std, Y_train, epochs=10, batch_size=32, verbose=1)
end = timer.time()
print("Finished training within {:.2f} seconds".format(end - start))


In [None]:
# Predict with the FNN model
y_fnn_prob = model.predict(X_test_std)

# Convert predicted probabilities to binary class 
y_fnn = (y_fnn_prob > 0.5).astype(int)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Predict with the FNN model
y_fnn_prob = model.predict(X_test_std)

# Convert predicted probabilities to binary class labels (0 or 1)
y_fnn = (y_fnn_prob > 0.5).astype(int)

# Classification report, confusion matrix, and accuracy score for FNN
print("Classification report for FNN: \n{}".format(classification_report(Y_test, y_fnn)))
print("Confusion matrix for FNN: \n{}".format(confusion_matrix(Y_test, y_fnn)))
print("Accuracy score for FNN: {:.4f}".format(accuracy_score(Y_test, y_fnn)))

# Calculate precision, recall, and F1 scores for FNN
prec_fnn = precision_score(Y_test, y_fnn, average='weighted')
rec_fnn = recall_score(Y_test, y_fnn, average='weighted')
f1_fnn = f1_score(Y_test, y_fnn, average='weighted')
print("Precision score for FNN: {:.4f}".format(prec_fnn))
print("Recall score for FNN: {:.4f}".format(rec_fnn))
print("F1 score for FNN: {:.4f}".format(f1_fnn))



In [None]:
import numpy as np

# Calculate the confusion matrix for FNN
cf = confusion_matrix(Y_test, y_fnn)

# Calculate the accuracy for each class
class_accuracy = np.diag(cf) / cf.sum(axis=1)

# Print the array of class accuracies
print(class_accuracy)

In [None]:
# Get the class labels
class_labels = le.classes_  # Assuming le is the LabelEncoder you used

# Create a variable to store class accuracies
class_accuracies = {}

for label, accuracy in zip(class_labels, class_accuracy):
    class_accuracies[label] = accuracy

# Print the class accuracies
for label, accuracy in class_accuracies.items():
    print(f"Class: {label}, Accuracy: {accuracy:.4f}")


In [None]:
import matplotlib.pyplot as plt

# Assuming you already have the class_accuracies dictionary

# Get class labels and accuracies
class_labels = list(class_accuracies.keys())
accuracies = list(class_accuracies.values())

# Create a pie chart to visualize the accuracies for each class
plt.figure(figsize=(8, 8))
plt.pie(accuracies, labels=class_labels, autopct='%1.1f%%', startangle=140)
plt.title('Accuracy for Each Class')
plt.axis('equal')  # Equal aspect ratio ensures that the pie chart is circular.

# Save the pie chart as an image
plt.savefig(r'C:\Users\Junaid Abbas\Desktop\Research Project\Iot_23\pictures\class_accuracy_pie_chart.png', bbox_inches='tight', dpi=300)

# Show the pie chart
plt.show()


# Linear Discriminant Analysis (LDA) 

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Create an LDA model
lda_model = LinearDiscriminantAnalysis()

start = timer.time()

# Fit the LDA model to the training data
lda_model.fit(X_train_std, Y_train)

end = timer.time()
print("Finished training within {:.2f} seconds".format(end - start))


In [None]:
# Predict using the LDA model
y_lda = lda_model.predict(X_test_std)

# Get class probabilities
y_lda_prob = lda_model.predict_proba(X_test_std)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Calculate predictions using the LDA model
y_lda = lda_model.predict(X_test_std)

# Generate a classification report
classification_rep_lda = classification_report(Y_test, y_lda)

# Calculate the confusion matrix for LDA predictions
cf_lda = confusion_matrix(Y_test, y_lda)

# Calculate the accuracy score for LDA predictions
accuracy_lda = accuracy_score(Y_test, y_lda)

# Calculate precision, recall, and F1 scores for LDA
prec_lda = precision_score(Y_test, y_lda, average='weighted')
rec_lda = recall_score(Y_test, y_lda, average='weighted')
f1_lda = f1_score(Y_test, y_lda, average='weighted')

print("Classification report for LDA: \n{}".format(classification_rep_lda))
print("Confusion matrix for LDA: \n{}".format(cf_lda))
print("Accuracy score for LDA: {:.4f}".format(accuracy_lda))
print("Precision score for LDA: {:.4f}".format(prec_lda))
print("Recall score for LDA: {:.4f}".format(rec_lda))
print("F1 score for LDA: {:.4f}".format(f1_lda))


In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

# Perform LDA
lda = LinearDiscriminantAnalysis(n_components=2)
X_train_lda = lda.fit_transform(X_train_std, Y_train)
X_test_lda = lda.transform(X_test_std)

# Create a Logistic Regression classifier
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train_lda, Y_train)

# Make predictions on the test set
y_pred = logistic_regression.predict(X_test_lda)

# Calculate the confusion matrix
confusion = confusion_matrix(Y_test, y_pred)

# Calculate the accuracy for each class
class_accuracy = confusion.diagonal() / confusion.sum(axis=1)

# Print the array of class accuracies
print("Class Accuracies:", class_accuracy)


In [None]:
import matplotlib.pyplot as plt

# Assuming you have already calculated and stored class_accuracy as a list
# The class names are stored in the "class_names" list
class_names = ["Benign", "DDoS", "C&C", "POHScan", "Okiru"]
# Create a pie chart to visualize the accuracy for each class
plt.figure(figsize=(8, 8))
plt.pie(class_accuracy, labels=class_names, autopct='%1.1f%%', startangle=140)
plt.title('Accuracy for Each Class')
plt.axis('equal')  # Equal aspect ratio ensures that the pie chart is circular

plt.savefig(r'C:\Users\Junaid Abbas\Desktop\Research Project\Iot_23\pictures\LDA_class_accuracy_pie_chart.png', bbox_inches='tight', dpi=300)

# Show the pie chart
plt.show()


# Stacking Classifier 

In stacking, you train multiple models and use a meta-model to combine their predictions. Here, I'll demonstrate stacking with a base logistic regression model and a meta-model, typically a simpler model like another logistic regression or any other classifier. You can extend this example with more diverse base models based on your specific problem.

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import time

# Assuming you have X_train_std, Y_train prepared

# Split the data into training and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(X_train_std, Y_train, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

base_model = LogisticRegression(solver='sag', max_iter=300, multi_class='multinomial')

meta_model = LogisticRegression()

# Create the stacking classifier
stacking_model = StackingClassifier(
    estimators=[('base_model', base_model)],
    final_estimator=meta_model,
    cv=5  # You can adjust the number of folds for cross-validation
)

# Train the stacking model
start = time.time()
stacking_model.fit(X_train_scaled, Y_train)
end = time.time()
print("Finished training stacking model within {:.2f} seconds".format(end - start))

# Predict on the validation set
y_val_pred = stacking_model.predict(X_val_scaled)

# Evaluate the stacking model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Classification report for Stacking: \n{}".format(classification_report(Y_val, y_val_pred)))
print("Confusion matrix for Stacking: \n{}".format(confusion_matrix(Y_val, y_val_pred)))
print("Accuracy score for Stacking: {:.4f}".format(accuracy_score(Y_val, y_val_pred)))


In [None]:
import numpy as np

# Assuming you have the confusion matrix for stacking model cf_stacking
cf_stacking = np.array([[20987, 0, 102, 149, 2082],
                        [1, 0, 0, 0, 1287],
                        [0, 0, 47787, 0, 11912],
                        [0, 0, 9, 48048, 11959],
                        [304, 0, 0, 0, 49795]])

# Calculate accuracy for each class in stacking model
class_accuracy_stacking = cf_stacking.diagonal() / cf_stacking.sum(axis=1)

# Print accuracy for each class in stacking model
for i, acc in enumerate(class_accuracy_stacking):
    print(f'Accuracy for Class {i}: {acc:.4f}')


# Bagging Classifier

In [None]:
from sklearn.ensemble import BaggingClassifier

# Create a base Gaussian Naive Bayes model
gnb_model = GaussianNB()

# Create a BaggingClassifier using the Gaussian Naive Bayes model
bagging_model = BaggingClassifier(base_estimator=gnb_model, n_estimators=10, random_state=42)

# Train the BaggingClassifier
start = timer.time()
bagging_model.fit(X_train_std, Y_train)
end = timer.time()
print("Finished training within {:.2f} seconds".format(end - start))

# Predictions
bagging_predictions = bagging_model.predict(X_test_std)

# Model evaluation
print("\nBagging Classifier with Gaussian Naive Bayes:")
print("Accuracy:", accuracy_score(Y_test, bagging_predictions))
print("Classification Report:\n", classification_report(Y_test, bagging_predictions))


In [None]:
# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, y, random_state=124, test_size=0.20, shuffle=True)

# Standardize the features
std_scale = preprocessing.StandardScaler().fit(X_train)
X_train_std = std_scale.transform(X_train)
X_test_std = std_scale.transform(X_test)
print("X_train_std shape:", X_train_std.shape)
print("Y_train shape:", Y_train.shape)


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np

# Number of base CatBoost models in the ensemble
num_models = 4  # You can adjust this number

# List to store individual CatBoost models
catboost_models = []

# Train individual CatBoost models
for i in range(num_models):
    indices = np.random.choice(len(X_train_std), len(X_train_std), replace=True)
    catboost_model = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, random_seed=124, verbose=200)
    catboost_model.fit(X_train_std[indices], Y_train[indices])
    catboost_models.append(catboost_model)

# Make predictions using individual models
individual_predictions = [model.predict(X_test_std) for model in catboost_models]

# Create an ensemble prediction by majority voting
ensemble_predictions = np.round(np.mean(individual_predictions, axis=0))

# Calculate the confusion matrix for ensemble predictions
cf_ensemble = confusion_matrix(Y_test, ensemble_predictions)

# Calculate the accuracy for each class
accuracies_ensemble = cf_ensemble.diagonal() / cf_ensemble.sum(axis=1)

# Print the accuracy for each class
classes = le.classes_  # Assuming le is your LabelEncoder
for class_label, accuracy in zip(classes, accuracies_ensemble):
    print(f"Accuracy for class {class_label}: {accuracy:.4f}")
