Code for creating a dataset from the MSCAD .csv file

In [26]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import os

We will create a separate DataFrame holding the rows for each label, then do a training/validation split on each of these DataFrames, and finally combine the per-label splits back together. This ensures that the instances of every label are divided between the training and validation sets in the same proportion, so even the rarest labels appear in both sets.

In [21]:
# edit this to be the path to the MSCAD.csv file
csv_df = pd.read_csv("archive/MSCAD.csv")

val_split = 0.20  # fraction of each label's rows held out as validation data
random_seed = 42  # fixed seed so re-running the notebook reproduces the same split

# Label values exactly as they are spelled in the CSV's "Label" column
# ("Web_Crwling" is the spelling used by the data, not a typo here).
label_names = ["Brute_Force", "HTTP_DDoS", "ICMP_Flood", "Port_Scan", "Web_Crwling", "Normal"]

# One feature frame per label (the "Label" column removed) and a matching list
# that repeats the label name once per row of that frame.
features = [
    csv_df.loc[csv_df["Label"] == name].drop(columns="Label")
    for name in label_names
]
labels = [[name] * frame.shape[0] for name, frame in zip(label_names, features)]

# Per-label names, unpacked in the same order as label_names.
(brute_force_df, http_ddos_df, icmp_flood_df,
 port_scan_df, web_crawling_df, normal_df) = features
(brute_force_labels, http_ddos_labels, icmp_flood_labels,
 port_scan_labels, web_crawling_labels, normal_labels) = labels

features_train_list = []
labels_train_list = []
features_val_list = []
labels_val_list = []
# Split every label's frame on its own so that each label contributes the same
# val_split fraction of its rows to the validation set.
for label_features, label_values in zip(features, labels):
    features_train, features_val, labels_train, labels_val = train_test_split(
        label_features,
        label_values,
        test_size=val_split,
        random_state=random_seed,
    )
    features_train_list.append(features_train)
    labels_train_list.append(labels_train)
    features_val_list.append(features_val)
    labels_val_list.append(labels_val)

# Combine the per-label pieces with a single concat per split (rather than
# growing a frame inside a loop); row order is label by label, as in label_names.
features_train_df = pd.concat(features_train_list)
labels_train_df = [label for part in labels_train_list for label in part]
features_val_df = pd.concat(features_val_list)
labels_val_df = [label for part in labels_val_list for label in part]

# Features and labels must stay aligned row-for-row.
assert len(features_train_df) == len(labels_train_df)
assert len(features_val_df) == len(labels_val_df)



In [22]:
# ensure that each label is represented in both the training and validation data
unique, counts = np.unique(labels_train_df, return_counts=True)
unique2, counts2 = np.unique(labels_val_df, return_counts=True)
print(f"training, \nunique: {unique} \ncounts: {counts}")
print(f"validation, \nunique: {unique2} \ncounts: {counts2}")

# Fail loudly if a label is absent from one side of the split, instead of
# relying on someone spotting it in the printout above.
missing_from_val = sorted(set(unique.tolist()) - set(unique2.tolist()))
missing_from_train = sorted(set(unique2.tolist()) - set(unique.tolist()))
assert not missing_from_val, f"labels missing from validation data: {missing_from_val}"
assert not missing_from_train, f"labels missing from training data: {missing_from_train}"

training, 
unique: ['Brute_Force' 'HTTP_DDoS' 'ICMP_Flood' 'Normal' 'Port_Scan' 'Web_Crwling'] 
counts: [70801   512    36 22801  8864    22]
validation, 
unique: ['Brute_Force' 'HTTP_DDoS' 'ICMP_Flood' 'Normal' 'Port_Scan' 'Web_Crwling'] 
counts: [17701   129     9  5701  2217     6]


Convert the labels from strings to numbers 
0 = "Brute_Force"
1 = "HTTP_DDoS"
2 = "ICMP_Flood"
3 = "Port_Scan"
4 = "Web_Crwling"
5 = "Normal"

In [23]:
# Integer class id for every label string (same numbering as the table in the
# markdown cell above).
LABEL_TO_ID = {
    "Brute_Force": 0,
    "HTTP_DDoS": 1,
    "ICMP_Flood": 2,
    "Port_Scan": 3,
    "Web_Crwling": 4,
    "Normal": 5,
}


def encode_labels(label_strings):
    """Convert label strings to their integer class ids.

    Args:
        label_strings: iterable of label names; every entry must be a key of
            LABEL_TO_ID.

    Returns:
        A NumPy array with one id per input label, in input order. The input
        is not modified.

    Raises:
        ValueError: if any entry is not a known label. Silently keeping such
            an entry would mix strings and ints and yield a string array.
    """
    label_strings = list(label_strings)
    unknown = {label for label in label_strings if label not in LABEL_TO_ID}
    if unknown:
        raise ValueError(f"unknown labels: {sorted(map(str, unknown))}")
    return np.array([LABEL_TO_ID[label] for label in label_strings])


features_train_np = np.array(features_train_df)
labels_train_np = encode_labels(labels_train_df)

features_val_np = np.array(features_val_df)
labels_val_np = encode_labels(labels_val_df)

In [24]:
# Print the shape of each array, in the order: training features, training
# labels, validation features, validation labels.
for split_array in (features_train_np, labels_train_np, features_val_np, labels_val_np):
    print(split_array.shape)

(103036, 66)
(103036,)
(25763, 66)
(25763,)


Save the train and val features and labels as .npy files

In [27]:
train_path = "data/train"
val_path = "data/val"

# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that could race with another process.
os.makedirs(train_path, exist_ok=True)
os.makedirs(val_path, exist_ok=True)

# Each split directory gets a features.npy and a labels.npy file.
np.save(os.path.join(train_path, 'features.npy'), features_train_np)
np.save(os.path.join(train_path, 'labels.npy'), labels_train_np)
np.save(os.path.join(val_path, 'features.npy'), features_val_np)
np.save(os.path.join(val_path, 'labels.npy'), labels_val_np)