Data extraction and aggregation

In [None]:
import pandas as pd
import glob
import os
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

# Directory that holds the UNSW-NB15 CSV files; every read below is built from it.
path = r'./UNSW-NB15/'

# Feature-description table; its "Name" column supplies the column names below.
features = pd.read_csv(path + "NUSW-NB15_features.csv", encoding='cp1252')

# Read the four parts and stack them into one frame. Because `names=` is passed
# without `header=`, pandas treats the first row of each file as data.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# every part restarts at 0 and row labels are duplicated across parts.
data = pd.concat(
    [
        pd.read_csv(f"{path}UNSW-NB15_{part}.csv", low_memory=False, names=features.Name)
        for part in range(1, 5)
    ],
    ignore_index=True,
)



In [None]:
# Print a summary of the combined frame: columns, non-null counts, dtypes and memory usage.
data.info()

Let's analyze the class distribution.

In [None]:
# Number of rows for each value of the Label column, one row per value, sorted by value.
# groupby(...).size() names the result explicitly, so there is no need to patch
# the column Index in place (Index objects are meant to be immutable).
data_summary_attack = data.groupby("Label").size().rename("Occurrencies").to_frame()

# Share of each class over the total; stored as a fraction in [0, 1], not multiplied by 100.
data_summary_attack["Percentage"] = (
    data_summary_attack["Occurrencies"] / data_summary_attack["Occurrencies"].sum()
)
data_summary_attack

The dataset is highly imbalanced. Now let's look at the attack categories.

In [None]:
# Number of rows for each attack category. groupby drops rows whose key is NaN,
# so rows without an attack_cat value are excluded from the counts and from the
# total used for the share below.
data_summary_classes = data.groupby("attack_cat").size().rename("Occurrencies").to_frame()

# Share of each category among the rows that have a category (fraction in [0, 1]).
data_summary_classes["Percentage"] = (
    data_summary_classes["Occurrencies"] / data_summary_classes["Occurrencies"].sum()
)
data_summary_classes

Even the attack categories are highly imbalanced. Let's plot this data,
starting with the correlation matrix.

In [None]:
# Correlation is only defined for numeric data. Selecting the numeric/boolean
# columns explicitly keeps this cell working on pandas >= 2.0, where
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# on string columns instead.
numeric_corr = data.select_dtypes(include=["number", "bool"]).corr()

f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(numeric_corr, annot=True, linewidths=.5, fmt='.2f', ax=ax)
plt.show()

Histograms of the data. I'll plot only the most relevant features, because plotting every column is too heavy for my computer.

In [None]:
# Only three categorical columns are plotted (plotting every column was too heavy).
plt_data = data[['proto', 'service', 'attack_cat']].copy()
for att in plt_data.columns:
    # Count on the string-cast series so that missing values form their own
    # "nan" category. Counting on the raw column would drop NaN and leave one
    # bar too few, merging two categories together.
    counts = plt_data[att].astype(str).value_counts()

    # A bar chart puts exactly one bar on each category tick; a histogram with
    # evenly spaced bins over the category positions drifts away from the ticks.
    fig, ax = plt.subplots()
    ax.bar(counts.index, counts.values)
    ax.set_ylabel('Occurrences')
    ax.set_xlabel(att)
    ax.set_title(f'histogram of {att} attribute')
    ax.tick_params(axis='x', labelrotation=90)  # keep long category names legible

In [None]:
# Number of rows per protocol. groupby drops rows whose proto is NaN, so no
# separate dropna step is needed.
data_summary_protocol = data.groupby("proto").size().rename("Occurrencies").to_frame()

# Share of each protocol over all counted rows (fraction in [0, 1]).
data_summary_protocol["Percentage"] = (
    data_summary_protocol["Occurrencies"] / data_summary_protocol["Occurrencies"].sum()
)

# Display the most frequent protocols first; the stored frame itself stays ordered by protocol name.
data_summary_protocol.sort_values('Occurrencies', ascending=False)

We can see that the protocol occurrences are dominated by tcp and udp. Let's now focus only on the rows labelled as attacks.

In [None]:
# Keep only the rows labelled as attacks (Label == 1).
data_filtered = data[data.Label == 1]

# Number of attack rows per protocol, plus each protocol's share (fraction in [0, 1]).
# groupby drops rows whose proto is NaN, so no separate dropna step is needed.
data_summary_protocol_attack = (
    data_filtered.groupby("proto").size().rename("Occurrencies").to_frame()
)
data_summary_protocol_attack["Percentage"] = (
    data_summary_protocol_attack["Occurrencies"]
    / data_summary_protocol_attack["Occurrencies"].sum()
)

# One bar per protocol, most frequent first. Drawing bars from the counts keeps
# each bar on its tick and still works when the selection is empty, whereas a
# histogram with bins=0 raises.
protocol_counts = data_summary_protocol_attack["Occurrencies"].sort_values(ascending=False)
fig, ax = plt.subplots()
ax.bar(protocol_counts.index, protocol_counts.values)
ax.set_ylabel('Occurrences')
ax.set_xlabel('Protocols')
ax.set_title('Protocol occurrences among attack records')
ax.tick_params(axis='x', labelrotation=90)  # keep protocol names legible

# Display the most frequent protocols first; the stored frame itself stays ordered by protocol name.
data_summary_protocol_attack.sort_values('Occurrencies', ascending=False)

The protocol does not appear to be essential, since its occurrences keep roughly the same proportions even when only the attacks are considered (so we might consider removing it).