# Data extraction and aggregation

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

# Directory that holds the UNSW-NB15 CSV files.
path = r'./UNSW-NB15/'

# The feature-description file supplies the column names (its `Name` column).
# NOTE(review): the "NUSW" spelling is kept exactly as it was; presumably it
# matches the file on disk - confirm before "fixing" it.
features = pd.read_csv(path + "NUSW-NB15_features.csv", encoding='cp1252')

# Read the four parts with the same column names and stack them.
# ignore_index=True gives the combined frame one unique RangeIndex instead of
# four overlapping 0..n-1 ranges, so label-based lookups stay unambiguous.
data = pd.concat(
    [
        pd.read_csv(path + f"UNSW-NB15_{part}.csv", low_memory=False, names=features.Name)
        for part in range(1, 5)
    ],
    ignore_index=True,
)

data.info()

# Features Conversion

## Features which are unusable in the real world are dropped

The dropped features are: `srcip`, `sport`, `dstip`.

IP addresses and ports can vary, and they can be faked (for example by using a VPN).

In [None]:
# Drop the address/port columns listed above.
# Rebinding `data` (instead of inplace=True) together with errors="ignore"
# keeps the cell re-runnable: a second execution no longer raises KeyError
# because the columns are already gone.
data = data.drop(columns=["srcip", "sport", "dstip"], errors="ignore")
data.info()

## Parsing `ct_ftp_cmd` to int

setting `-1` if the value was blank

In [None]:
# Convert `ct_ftp_cmd` to integers, using -1 as the "no value" marker.
# pd.to_numeric(errors="coerce") turns blanks of any width, NaN and other
# non-numeric entries into NaN in one vectorised pass; those are then mapped
# to -1. (The previous per-element lambda only recognised a single space and
# raised on NaN.)
# NOTE(review): unparsable text is now mapped to -1 silently instead of raising.
data["ct_ftp_cmd"] = (
    pd.to_numeric(data["ct_ftp_cmd"], errors="coerce")
    .fillna(-1)
    .astype(int)
)
data.info()

## Converting Hex to int

In [None]:
def _parse_port(value):
    """Return a `dsport` entry as an int, or -1 when it is missing.

    Handles the mixed content the column can hold after concatenation:
    values that are already numeric, decimal strings, hexadecimal strings
    (with or without a ``0x`` prefix), the ``"-"`` placeholder, blanks and NaN.

    A string is tried as decimal first and only read as base 16 when that
    fails, so ``"80"`` stays 80 rather than becoming 128.
    NOTE(review): assumes all-digit strings are decimal ports - confirm
    against the raw files.
    """
    if isinstance(value, str):
        text = value.strip()
        if text in ("", "-"):
            return -1
        try:
            return int(text)
        except ValueError:
            # Not a decimal literal: treat as hex ("0x1f" and "1f" both parse).
            return int(text, 16)
    if pd.isna(value):
        return -1
    # Already numeric; int(value, 16) would raise TypeError here.
    return int(value)


data["dsport"] = data["dsport"].apply(_parse_port)
data.info()

## Setting `NaN` to `-1`

In [None]:
# Replace missing values in these two columns with the -1 sentinel.
# The fill is applied on the frame itself: `data[col].fillna(..., inplace=True)`
# operates on a temporary Series and does not update `data` when pandas
# Copy-on-Write is active (it also emits a chained-assignment warning).
data = data.fillna({"ct_flw_http_mthd": -1, "is_ftp_login": -1})


## Converting the nominal features

TODO: create an enum for each feature.

In [None]:
nominal_features = ["proto", "state", "service"]

# For every encoded column, keep the array of original labels returned by
# pd.factorize: the integer code `i` stands for `nominal_categories[col][i]`.
# Without it the integer encoding cannot be mapped back to names.
nominal_categories = {}

for nominal_feature in nominal_features:
    # factorize assigns codes in order of first appearance; missing values get -1.
    codes, categories = pd.factorize(data[nominal_feature])
    data[nominal_feature] = codes
    nominal_categories[nominal_feature] = categories

data.info()

# Data reduction

## Binary classification distribution

In [None]:
# Count of rows per `Label` value and the share of the whole data set.
# The columns are named at construction time; writing into
# `columns.values[0]` mutates the Index's backing array behind pandas' back
# and is not guaranteed to work (or to keep lookups consistent).
label_counts = data.Label.value_counts()

data_summary_attack = pd.DataFrame({
    "Occurrencies": label_counts,
    # Stored as a fraction of all rows (0-1), not multiplied by 100.
    "Percentage": label_counts / len(data),
})

data_summary_attack

We have a very unbalanced dataset. Now let's look at the attack classes

## Multiclass classification distribution

In [None]:

# Count of rows per attack category and the share among attack rows.
# value_counts() skips missing `attack_cat` values.
class_counts = data.attack_cat.value_counts()

# Total of the `Label` column, used as the number of attack rows
# (assumes Label is 0/1 with 1 = attack - TODO confirm).
# Series.sum() is vectorised; the builtin sum() iterates element by element.
attack_len = data.Label.sum()

# Columns are named at construction instead of patching `columns.values`.
data_summary_classes = pd.DataFrame({
    "Occurrencies": class_counts,
    # Stored as a fraction (0-1) of attack_len.
    "Percentage": class_counts / attack_len,
})

data_summary_classes

The attack classes are also very unbalanced.

## Reduce the number of classes of attack

Let's first see the clusterability of the data set

In [None]:
import pyclustertend as pyct

# Estimate the clusterability of the feature columns with the Hopkins
# statistic: for each Hopkins sampling size, average the score over nine
# independent 20,000-row random samples of the data.

# Dropping the target columns does not depend on the loop variables, so it is
# done once instead of copying the full frame on every iteration.
feature_frame = data.drop(columns=["Label", "attack_cat"])

for sampling_size in range(10, 51, 10):
    sample_list = [
        pyct.hopkins(feature_frame.sample(20_000), sampling_size)
        for _ in range(9)
    ]
    print(f'Sampled {sampling_size}: {sum(sample_list) / len(sample_list)}')

The value is close to 0, so the data is very clusterable.

Data is clustered using k-means

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Cluster only rows that have an attack category, using the feature columns
# (targets removed). A fixed seed makes the 50,000-row subsample, and so the
# curves below, reproducible across runs.
to_plot = (
    data.dropna(subset=["attack_cat"])
    .drop(columns=["Label", "attack_cat"])
    .sample(50_000, random_state=10)
)

# NOTE(review): the features are clustered unscaled. Standardising them with
# StandardScaler().fit_transform(to_plot) was tried and left disabled; the
# import is kept for that experiment.

# Candidate numbers of clusters, defined once and shared by the loop and plots.
k_values = range(2, 10)

silhouette_list = []
inertia_list = []

for n_clusters in k_values:
    kmeans = KMeans(n_clusters=n_clusters, random_state=10, init="k-means++")
    y_pred = kmeans.fit_predict(to_plot)

    # Average silhouette of the resulting partition.
    silhouette_list.append(silhouette_score(to_plot, y_pred))

    # K-means loss (within-cluster sum of squares) of the fitted model.
    inertia_list.append(kmeans.inertia_)

# Plot silhouette and inertia trends w.r.t. the number of clusters.
# (The earlier stray plt.tight_layout() call, issued before any figure
# existed, only produced an empty figure and has been removed.)
fig, ax1 = plt.subplots()
ax1.set_xlabel('k')
ax1.set_ylabel('avg-silhouette', color='black')
ax1.plot(k_values, silhouette_list, '--ok')
ax1.tick_params(axis='y', labelcolor='black')
ax1.grid(axis='y')

# Second y-axis sharing the same x-axis for the loss curve.
ax2 = ax1.twinx()
ax2.set_ylabel('loss', color='red')
ax2.plot(k_values, inertia_list, '--or', alpha=0.2)
ax2.tick_params(axis='y', labelcolor='red')

plt.tight_layout()  # otherwise the right y-label is slightly clipped

Using the silhouette method we have that the optimal number of clusters is 7(?) (4-ish and 6-ish for only the attack categories), let's print the info of each cluster

In [None]:
# Build a 50,000-row working sample for clustering; rows without an attack
# category are labelled "NotAttack".
data_cluster = data.drop(columns="Label").sample(50_000)
data_cluster = data_cluster.fillna({'attack_cat': "NotAttack"})
# Normalise the category column to strings on the SAMPLE. This used to write
# to `data["attack_cat"]`, which turned every missing value in the full data
# set into the literal string "nan" and left `data_cluster` untouched.
data_cluster["attack_cat"] = data_cluster["attack_cat"].astype(str)
data_cluster.info()


In [None]:
# Draw a reproducible 50,000-row sample, label rows without an attack
# category as "NotAttack", and make the category column uniformly string-typed.
data_cluster = (
    data.drop(columns="Label")
    .sample(50_000, random_state=10)
    .fillna({"attack_cat": "NotAttack"})
    .astype({"attack_cat": str})
)

# Assign each sampled row to one of 7 k-means clusters, fitted on the feature
# columns only (the category label is excluded from the fit).
cluster = KMeans(n_clusters=7, random_state=10, init="k-means++")
data_cluster["cluster"] = cluster.fit_predict(data_cluster.drop(columns="attack_cat"))

# Contingency table: number of sampled rows per (attack category, cluster).
# The earlier pivot_table used `aggfunc=lambda x: 'count'`, which returns the
# literal string 'count' for every cell instead of counting rows.
table = pd.crosstab(data_cluster["attack_cat"], data_cluster["cluster"])
table

Then, from each cluster, we will select the predominant class of attack.

we can see that the classes of attack are reduced to ...

## Plot the features correlation

In [None]:
# Large canvas so that the per-cell annotations of the heatmap stay readable.
f, ax = plt.subplots(figsize=(40, 32))

data_without_label = data.drop(columns=['Label'])

# Correlate numeric columns only: `attack_cat` holds strings, and recent
# pandas versions raise on non-numeric columns in .corr() instead of
# silently skipping them.
correlation_matrix = data_without_label.select_dtypes(include="number").corr()

sns.heatmap(correlation_matrix, annot=True, linewidths=0.5, fmt='.2f', ax=ax)
plt.show()

Drop the over-correlated features

In [None]:
import numpy as np

abs_correlation_matrix = correlation_matrix.abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so that each
# feature pair is considered once. The mask is built with the builtin `bool`
# dtype; the `np.bool` alias no longer exists in current NumPy releases.
upper = abs_correlation_matrix.where(
    np.triu(np.ones(abs_correlation_matrix.shape, dtype=bool), k=1)
)

# Find features whose absolute correlation with an earlier feature exceeds 0.90
to_drop = [column for column in upper.columns if (upper[column] > 0.90).any()]

print(to_drop)

# Drop those features. errors="ignore" keeps the cell re-runnable: the columns
# are already gone on a second execution with the same correlation matrix.
data = data.drop(columns=to_drop, errors="ignore")

data.info()

## Information gain

Each feature is analyzed to determine which features are not useful.

# Analyze the predominant features distributions

Example: if `unas` is used only for attacks, there is a real-world problem, because every time `unas` is used the classifier will flag it as an attack.

Below we check the `proto` feature.

In [None]:
# Rows per `proto` value and each value's share of all rows with a protocol.
# value_counts() already skips missing values (the earlier dropna() call
# discarded its result and had no effect) and replaces the
# reset_index/groupby/count detour.
proto_counts = data.proto.value_counts()

# Columns are named at construction rather than by patching `columns.values`;
# the total comes from .sum() instead of the deprecated positional `sum()[0]`.
data_summary_protocol = pd.DataFrame({
    "Occurrencies": proto_counts,
    # Stored as a fraction (0-1), not multiplied by 100.
    "Percentage": proto_counts / proto_counts.sum(),
}).rename_axis("proto")

data_summary_protocol.sort_values('Occurrencies', ascending=False)

We can see that the protocol occurrences are mainly composed of tcp and udp; let's focus only on the rows corresponding to the attack label.

In [None]:
# Restrict to rows labelled as attacks (Label == 1).
data_filtered = data[(data.Label == 1)]

# Rows per `proto` value among the attack rows (missing values are skipped).
attack_proto_counts = data_filtered.proto.value_counts()

# One bar per protocol value. `proto` holds category codes, so a numeric
# histogram would spread the codes over equal-width bins that do not line up
# with individual protocols.
proto_fig, proto_ax = plt.subplots()
attack_proto_counts.plot.bar(ax=proto_ax)
proto_ax.set_ylabel('Occurrences')
proto_ax.set_xlabel('Protocols')
proto_ax.tick_params(axis='x', rotation=45)

# Summary table; columns are named at construction rather than by patching
# `columns.values`, and the total uses .sum() instead of positional `sum()[0]`.
data_summary_protocol_attack = pd.DataFrame({
    "Occurrencies": attack_proto_counts,
    # Stored as a fraction (0-1) of the attack rows that have a protocol.
    "Percentage": attack_proto_counts / attack_proto_counts.sum(),
}).rename_axis("proto")

data_summary_protocol_attack.sort_values('Occurrencies', ascending=False)
