In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

In [None]:
# Load the raw dataset. low_memory=False makes pandas parse the file in a single
# pass, so each column's dtype is inferred from the whole column.
df = pd.read_csv('EdgeIIoT Lab + Test.csv', low_memory=False)

In [None]:
# Column names and (rows, columns) size of the raw frame.
print(df.columns)
print(df.shape)

In [None]:
# Row count per 'Attack_type' value in the raw data.
print(df['Attack_type'].value_counts())

In [None]:
from sklearn.utils import shuffle
# Columns to discard from the frame.
drop_columns = [
    "frame.time", "ip.src_host", "ip.dst_host",
    "arp.src.proto_ipv4", "arp.dst.proto_ipv4",
    "http.file_data", "http.request.full_uri", "icmp.transmit_timestamp",
    "http.request.uri.query", "tcp.options", "tcp.payload", "tcp.srcport",
    "tcp.dstport", "udp.port", "mqtt.msg",
]

# Remove the listed columns, then every row with a missing value, then
# duplicate rows (the first occurrence is kept).
df = (
    df.drop(columns=drop_columns)
      .dropna(axis=0, how='any')
      .drop_duplicates(subset=None, keep="first")
)
print(df['Attack_type'].value_counts())

In [None]:
# Row count per 'Attack_label' value after the cleaning steps.
print(df['Attack_label'].value_counts())

In [None]:
# Shuffle the rows with a fixed seed so the row order (and everything derived
# from it, e.g. the saved CSV) is reproducible across runs. The seed matches
# the random_state=42 used by the sampler and the splits in this notebook.
df = shuffle(df, random_state=42)
# Sanity check: no missing value may be left after cleaning.
assert df.isnull().sum().sum() == 0

In [None]:
def encode_text_dummy(df, name):
    """One-hot encode column ``name`` of ``df`` in place.

    For every distinct value of the column a new indicator column named
    ``"<name>-<value>"`` is appended to ``df``; the source column is then
    dropped. The frame is modified in place and nothing is returned.
    """
    indicators = pd.get_dummies(df[name])
    for value, column in indicators.items():
        df[f"{name}-{value}"] = column
    df.drop(columns=[name], inplace=True)
    
# One-hot encode each of these columns in place, in this order.
for column_name in (
    'http.request.method',
    'http.referer',
    "http.request.version",
    "dns.qry.name.len",
    "mqtt.conack.flags",
    "mqtt.protoname",
    "mqtt.topic",
):
    encode_text_dummy(df, column_name)

In [None]:
# index=False: do not write the (shuffled) row index. Otherwise read_csv brings
# it back as an extra "Unnamed: 0" column, which would then be treated as a
# model feature.
df.to_csv('preprocessed_dataset.csv', encoding='utf-8', index=False)

In [None]:
# Reload the frame from the CSV file written by the preprocessing step.
df = pd.read_csv('preprocessed_dataset.csv', low_memory=False) 

In [None]:
# Integer code assigned to each attack class name.
attacks = {'Normal': 0 ,'Backdoor' :1, 'DDoS_HTTP':2,  'DDoS_ICMP':3, 'DDoS_TCP':4, 'DDoS_UDP':5, 
           'Fingerprinting':6, 'MITM':7, 'Password':8, 'Port_Scanning':9, 'Ransomware':10, 
           'SQL_injection':11, 'Uploading':12, 'Vulnerability_scanner':13, 'XSS':14}

attack_codes = df['Attack_type'].map(attacks)
# Series.map yields NaN for any value missing from the dictionary (and for
# every row if this cell is re-run on already encoded values). Fail loudly
# instead of letting NaN targets flow into the later steps.
unmapped = df.loc[attack_codes.isna(), 'Attack_type'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unmapped Attack_type values: {list(unmapped)}")
df['Attack_type'] = attack_codes

In [None]:
# Every column name except the 'Attack_type' target. Note that 'Attack_label'
# is still a column of df at this point and is therefore part of feat_cols.
feat_cols = list(df.columns)
label_col = "Attack_type"
feat_cols.remove(label_col)

In [None]:
# Names of the columns in which every value is missing.
empty_cols = [col for col in df.columns if df[col].isnull().all()]

In [None]:
# Column names collected in a list so they can be referenced by position.
skip_list = ["icmp.unused", "http.tls_port", "dns.qry.type", "mqtt.msg_decoded_as"]

In [None]:
# Value frequencies of the column named by the fourth entry of skip_list.
df[skip_list[3]].value_counts()

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
# Two-panel overview of the class balance: counts (left) and shares (right).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 8))

# value_counts() orders the classes by descending frequency. Sizes AND labels
# are taken from that same Series so each wedge carries its own class label
# (unique() returns order of first appearance, which does not line up with
# the order of the counts).
class_counts = df[label_col].value_counts()
sizes = class_counts
labels = list(class_counts.index)
# Wedge offsets: the class shares in reverse order, so the largest wedges get
# the smallest offsets.
explode = list((class_counts / class_counts.sum()).to_numpy()[::-1])

ax2.pie(sizes, explode=explode, startangle=60, labels=labels, autopct='%1.0f%%', pctdistance=0.8)
# White disc in the centre turns the pie into a donut chart.
ax2.add_artist(plt.Circle((0, 0), 0.4, fc='white'))
sns.countplot(y=label_col, data=df, ax=ax1)
ax1.set_title("Количество атак по типам")
ax2.set_title("Процент по типам атак")
plt.show()

In [None]:
# Detach both target columns; pop() also removes them from df in place, so df
# no longer contains them afterwards.
attack_type = df.pop('Attack_type')
attack_label = df.pop('Attack_label')

In [None]:
# Translate the integer codes back to class names and count the rows per class.
code_to_name = {code: name for name, code in attacks.items()}
attack_counts = attack_type.map(code_to_name).value_counts()

# One line per class in dictionary order; a class without rows is reported as 0.
for attack, code in attacks.items():
    count = attack_counts.get(attack, 0)
    print(f'{code} - {attack} \t\t\t- {count}')

In [None]:
# Features and targets as plain NumPy arrays. No standardisation is applied
# here; scaling is done later in the notebook, after the train/test split.
X = df.values
y = attack_type.values

In [None]:
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler
# Random under-sampling with imblearn's default sampling strategy; the fixed
# random_state makes the selected subset reproducible.
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X, y)

In [None]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 splits, same seed for the under-sampled and the full data.
split_options = dict(test_size=0.2, random_state=42)
X_train_us, X_test_us, y_train_us, y_test_us = train_test_split(
    X_res, y_res, stratify=y_res, **split_options
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, **split_options
)

# The unsplit arrays are not needed any more; drop the names to free memory.
del X, X_res, y, y_res

print("Train:", X_train.shape, y_train.shape)
print("Test:", X_test.shape, y_test.shape)
print("Undersampling Train:", X_train_us.shape, y_train_us.shape)
print("Undersampling Test:", X_test_us.shape, y_test_us.shape)

In [None]:
# Number of samples per class in the under-sampled training targets (y_train_us).
unique, counts = np.unique(y_train_us, return_counts=True)
print("Train: ", dict(zip(unique, counts)))

# Same count for the under-sampled test targets (y_test_us).
unique, counts = np.unique(y_test_us, return_counts=True)
print("Test: ", dict(zip(unique, counts)))

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the targets as integer class indices.
label_encoder = LabelEncoder()
y_train =  label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)
# The same encoder object is re-fitted here, so from this point on
# label_encoder reflects the under-sampled training targets.
y_train_us =  label_encoder.fit_transform(y_train_us)
y_test_us = label_encoder.transform(y_test_us)

In [None]:
# Classes known to the encoder after its most recent fit.
label_encoder.classes_

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale the features with MinMaxScaler; each scaler fit uses training data only
# and the matching test set is transformed with those fitted parameters.
min_max_scaler = MinMaxScaler()
X_train =  min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)
# The scaler is re-fitted on the under-sampled training set, so the two
# datasets are scaled with independently fitted parameters and min_max_scaler
# afterwards holds the under-sampled fit.
X_train_us =  min_max_scaler.fit_transform(X_train_us)
X_test_us = min_max_scaler.transform(X_test_us)

In [None]:
# Shapes of the scaled feature arrays before the extra axis is added.
print(X_train.shape)
print(X_test.shape)
print(X_train_us.shape)
print(X_test_us.shape)

In [None]:
# Add a trailing axis of length 1 so each array has the 3-D layout that the
# Conv1D model in this notebook is built for. reshape() to an explicit
# (dim0, dim1, 1) target keeps this cell safe to re-run.
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
X_train_us = X_train_us.reshape(X_train_us.shape[0], X_train_us.shape[1], 1)
X_test_us = X_test_us.reshape(X_test_us.shape[0], X_test_us.shape[1], 1)

In [None]:
# Shapes of the feature arrays after the reshape.
print(X_train.shape)
print(X_test.shape)
print(X_train_us.shape)
print(X_test_us.shape)

In [None]:
# Per-sample input shape (all axes after the first) for each dataset.
input_shape_us = X_train_us.shape[1:]
input_shape = X_train.shape[1:]

In [None]:
# Array shapes and the derived per-sample input shapes, for a visual check.
print(X_train_us.shape, X_test_us.shape, X_train.shape, X_test.shape)
print(input_shape_us)
print(input_shape)

In [None]:
# Number of distinct classes in each training set; the bare last line makes
# the notebook display num_classes.
num_classes_us = len(np.unique(y_train_us))
num_classes = len(np.unique(y_train))
num_classes

In [None]:
from  tensorflow.keras.utils import to_categorical 

# One-hot encode every target vector. num_classes (taken from the full
# training set) is used as the width for the under-sampled targets as well.
y_train_us = to_categorical(y_train_us, num_classes=num_classes)
y_test_us = to_categorical(y_test_us, num_classes=num_classes)
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

In [None]:
# Shapes of the one-hot encoded targets.
print(y_train_us.shape, y_test_us.shape)
print(y_train.shape, y_test.shape)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense

# 1-D CNN: three Conv1D + MaxPooling1D stages followed by a dense classifier
# head with one softmax output per class.
model = Sequential()
# Only the first layer declares the input shape; every later layer infers its
# input from the layer before it.
model.add(Conv1D(32, 3, activation='relu', input_shape=input_shape))
model.add(MaxPooling1D(2))

model.add(Conv1D(64, 3, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Conv1D(128, 3, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.summary()

In [None]:
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Input, ZeroPadding1D
from tensorflow.keras.layers import MaxPooling1D, Add, AveragePooling1D
from tensorflow.keras.layers import Dense, BatchNormalization, Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.models import Model
from keras.initializers import glorot_uniform
import keras.backend as K
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

# Adam optimiser with categorical cross-entropy, matching the one-hot targets
# and the softmax output layer. The loss is given by its standard identifier
# rather than through the metrics namespace.
opt = Adam(learning_rate=0.001)
model.compile(optimizer=opt, loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# Stop training when val_loss has not improved for 10 epochs; halve the
# learning rate after 5 epochs without improvement.
early_stopping = EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=10)
lr_reduce = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, mode="min", verbose=1, min_lr=0)
call_backs = [early_stopping, lr_reduce]
EPOCHS_US = 30
BATCH_SIZE = 256

# Validation uses a 10% hold-out of the training data only. Passing
# validation_data=(X_test_us, y_test_us) as well would override
# validation_split and let the test set drive early stopping and the learning
# rate schedule, which makes the test metrics reported afterwards optimistic.
history = model.fit(X_train_us, y_train_us,
                    validation_split=0.1,
                    epochs=EPOCHS_US,
                    batch_size=BATCH_SIZE,
                    callbacks=call_backs,
                    verbose=1)

In [None]:
# Training and validation loss per epoch, plus the lowest validation loss
# reached during the run.
history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot();
print("Минимум функции потерь: {}".format(history_df['val_loss'].min()))

In [None]:
from matplotlib import pyplot as plt

# Training vs. validation accuracy per epoch for the most recent fit.
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Model outputs (one softmax score per class) for the under-sampled test set.
y_hat_us = model.predict(X_test_us)

In [None]:
# precision recall curve
from sklearn.metrics import precision_recall_curve
# One precision/recall curve per class: column i of the one-hot targets is
# compared against column i of the predicted scores.
precision, recall = {}, {}
for i in range(num_classes):
    curve = precision_recall_curve(y_test_us[:, i], y_hat_us[:, i])
    precision[i], recall[i] = curve[0], curve[1]
    plt.plot(recall[i], precision[i], lw=2, label='class {}'.format(i))

plt.xlabel("recall")
plt.ylabel("precision")
plt.legend(loc="best")
plt.title("precision vs. recall curve")
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc

# Per-class ROC data: false/true positive rates and the area under the curve.
fpr, tpr, roc_auc = {}, {}, {}
for i in range(num_classes):
    false_pos, true_pos, _ = roc_curve(y_test_us[:, i], y_hat_us[:, i])
    fpr[i], tpr[i] = false_pos, true_pos
    roc_auc[i] = auc(false_pos, true_pos)

# One figure per class, with the chance diagonal drawn for reference.
for i in range(num_classes):
    roc_fig, roc_ax = plt.subplots()
    roc_ax.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f)' % roc_auc[i])
    roc_ax.plot([0, 1], [0, 1], 'k--')
    roc_ax.set_xlim([0.0, 1.0])
    roc_ax.set_ylim([0.0, 1.05])
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('Receiver operating characteristic example')
    roc_ax.legend(loc="lower right")
    plt.show()

In [None]:
# Collapse the one-hot targets and the per-class scores to integer class ids.
y_true_us = np.argmax(y_test_us, axis=1)
# y_hat_us is overwritten with the class ids. The ndim guard keeps the cell
# safe to re-run: once y_hat_us is 1-D, another argmax over axis=1 would raise.
if np.ndim(y_hat_us) > 1:
    y_hat_us = np.argmax(y_hat_us, axis=1)

In [None]:
from sklearn.metrics import accuracy_score
def print_score(y_pred_us, y_real_us, labels=label_encoder):
    """Print the accuracy of ``y_pred_us`` measured against ``y_real_us``.

    ``labels`` is accepted for call compatibility but is not used.
    """
    score = accuracy_score(y_true=y_real_us, y_pred=y_pred_us)
    print("Точность: ", score)


print_score(y_hat_us, y_true_us, label_encoder)

In [None]:
# True class ids recovered from the one-hot test targets.
y_rounded_us=np.argmax(y_test_us, axis=1)

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Confusion matrix with true classes as rows and predicted classes as columns;
# display_labels=None leaves the tick labels as integer class indices.
cnn_cm_us = confusion_matrix(y_rounded_us, y_hat_us)
disp = ConfusionMatrixDisplay(confusion_matrix=cnn_cm_us, display_labels=None)
disp.plot()
plt.show()

In [None]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support on the under-sampled test set.
print("CNN Classification Report: ")
print(classification_report(y_rounded_us, y_hat_us))

In [None]:
EPOCHS = 12
BATCH_SIZE = 256
call_backs = [early_stopping, lr_reduce]
# NOTE(review): `model` is the same object that was already fitted on the
# under-sampled data, so this run continues from those weights - confirm that
# this is intended rather than training a fresh model.
#
# Validation uses a 10% hold-out of the training data only. Passing
# validation_data=(X_test, y_test) as well would override validation_split and
# let the test set drive early stopping and the learning rate schedule, which
# makes the test metrics reported afterwards optimistic.
history = model.fit(X_train, y_train,
                    validation_split=0.1,
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    callbacks=call_backs,
                    verbose=1)

In [None]:
# Model outputs (one softmax score per class) for the full-data test set.
y_hat = model.predict(X_test)

In [None]:
# Training and validation loss per epoch for the most recent fit, plus the
# lowest validation loss reached.
history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot();
print("Минимум функции потерь: {}".format(history_df['val_loss'].min()))

In [None]:
# Training vs. validation accuracy per epoch for the most recent fit.
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Collapse the one-hot targets and the per-class scores to integer class ids.
y_true = np.argmax(y_test, axis=1)
# y_hat is overwritten with the class ids. The ndim guard keeps the cell safe
# to re-run: once y_hat is 1-D, another argmax over axis=1 would raise.
if np.ndim(y_hat) > 1:
    y_hat = np.argmax(y_hat, axis=1)

In [None]:
from sklearn.metrics import accuracy_score
def print_score(y_pred, y_real, labels=label_encoder):
    """Print the accuracy of ``y_pred`` measured against ``y_real``.

    ``labels`` is accepted for call compatibility but is not used.
    """
    score = accuracy_score(y_true=y_real, y_pred=y_pred)
    print("Точность: ", score)


print_score(y_hat, y_true, label_encoder)

In [None]:
# True class ids recovered from the one-hot test targets.
y_rounded=np.argmax(y_test, axis=1)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Confusion matrix with true classes as rows and predicted classes as columns;
# display_labels=None leaves the tick labels as integer class indices.
cnn_cm = confusion_matrix(y_rounded, y_hat)
disp = ConfusionMatrixDisplay(confusion_matrix=cnn_cm, display_labels=None)
disp.plot()
plt.show()

In [None]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support on the full-data test set.
print("CNN Classification Report: ")
print(classification_report(y_rounded, y_hat))