In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost import plot_importance


In [4]:
#read data

In [5]:
df = pd.read_csv('./CICIDS_2018/IDS.csv',index_col=[0])


In [6]:
df

Unnamed: 0,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
5631664,2116.0,1.0,1.0,44.0,82.0,44.0,44.0,44.0,0.0,82.0,...,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3099750,1057.0,1.0,1.0,41.0,103.0,41.0,41.0,41.0,0.0,103.0,...,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1825967,66555644.0,35.0,46.0,3886.0,5154.0,583.0,0.0,111.028571,174.330735,1430.0,...,20.0,8191292.0,0.0,8191292.0,8191292.0,58302685.0,0.0,58302685.0,58302685.0,Benign
7113878,2462909.0,8.0,8.0,1032.0,1466.0,565.0,0.0,129.0,190.919579,1149.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1566689,322.0,1.0,1.0,47.0,63.0,47.0,47.0,47.0,0.0,63.0,...,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420640,18184,1,1,31,87,31,31,31,0,87,...,8,0,0,0,0,0,0,0,0,Infilteration
490836,14208808,11,8,674,5918,517,0,61.27272727,155.8294522,1460,...,20,3051885,0,3051885,3051885,6014723,0,6014723,6014723,Infilteration
537625,5124568,3,1,0,0,0,0,0,0,0,...,20,0,0,0,0,0,0,0,0,Infilteration
21838,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label


In [7]:
df.drop(df.loc[df['Label'] == 'Label'].index, inplace=True)

In [8]:
df.Label.value_counts()

Benign                      674154
DDOS attack-HOIC             34301
DDoS attacks-LOIC-HTTP       28810
DoS attacks-Hulk             23096
Bot                          14116
FTP-BruteForce                9668
SSH-Bruteforce                9379
Infilteration                 8096
DoS attacks-SlowHTTPTest      6994
DoS attacks-GoldenEye         2075
DoS attacks-Slowloris          550
DDOS attack-LOIC-UDP            86
Brute Force -Web                30
Brute Force -XSS                12
SQL Injection                    5
Name: Label, dtype: int64

In [9]:
df.keys()

Index(['Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts',
       'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min',
       'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max',
       'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s',
       'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Len',
       'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min',
       'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var',
       'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt',
       'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt',
       'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg',
   

In [10]:
len(df.keys())

77

### Preprocessing (normalization and padding values)

In [11]:
# Z-score normalization
features = df.dtypes[df.dtypes != 'object'].index
df[features] = df[features].apply(
    lambda x: (x - x.mean()) / (x.std()))
print(df[features])
# Fill empty values by 0
df = df.fillna(0)


Empty DataFrame
Columns: []
Index: [5631664, 3099750, 1825967, 7113878, 1566689, 3017384, 1493908, 7158902, 6684579, 6171372, 5326444, 5347845, 2105492, 7375223, 5748700, 1579014, 4678832, 2142754, 2305699, 7836771, 4585742, 6076334, 5665882, 6868035, 4267596, 3269305, 1183089, 6117937, 5262864, 4227676, 6951291, 1211081, 5456307, 1846999, 1650757, 4442592, 7612948, 6423481, 7492520, 7495873, 6531205, 3721740, 6852237, 6712439, 811817, 7471149, 1242482, 7116350, 7366450, 1406973, 4345670, 4111254, 6745164, 7015311, 2437330, 838989, 2926643, 2780769, 2018658, 7270385, 4634818, 5776739, 1835473, 7691565, 5541528, 5529714, 5947289, 1406076, 7218096, 3148913, 4800421, 6896166, 1141493, 7394795, 4297270, 3402602, 3932155, 4178470, 4152546, 4708787, 6641804, 4200069, 3447542, 5311178, 6407612, 5614790, 1243037, 1318612, 1222617, 7260229, 6455735, 6404094, 6734752, 2820393, 3300673, 1663437, 1410728, 2281388, 1270764, 2896832, ...]

[811372 rows x 0 columns]


In [12]:
print(df[features])

Empty DataFrame
Columns: []
Index: [5631664, 3099750, 1825967, 7113878, 1566689, 3017384, 1493908, 7158902, 6684579, 6171372, 5326444, 5347845, 2105492, 7375223, 5748700, 1579014, 4678832, 2142754, 2305699, 7836771, 4585742, 6076334, 5665882, 6868035, 4267596, 3269305, 1183089, 6117937, 5262864, 4227676, 6951291, 1211081, 5456307, 1846999, 1650757, 4442592, 7612948, 6423481, 7492520, 7495873, 6531205, 3721740, 6852237, 6712439, 811817, 7471149, 1242482, 7116350, 7366450, 1406973, 4345670, 4111254, 6745164, 7015311, 2437330, 838989, 2926643, 2780769, 2018658, 7270385, 4634818, 5776739, 1835473, 7691565, 5541528, 5529714, 5947289, 1406076, 7218096, 3148913, 4800421, 6896166, 1141493, 7394795, 4297270, 3402602, 3932155, 4178470, 4152546, 4708787, 6641804, 4200069, 3447542, 5311178, 6407612, 5614790, 1243037, 1318612, 1222617, 7260229, 6455735, 6404094, 6734752, 2820393, 3300673, 1663437, 1410728, 2281388, 1270764, 2896832, ...]

[811372 rows x 0 columns]


### Data sampling

In [13]:
labelencoder = LabelEncoder()
df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1])

In [14]:
df.Label.value_counts()

0     674154
4      34301
6      28810
8      23096
1      14116
11      9668
14      9379
12      8096
9       6994
7       2075
10       550
5         86
2         30
3         12
13         5
Name: Label, dtype: int64

In [15]:
df = df[~(df.isin([np.inf, -np.inf]).any(axis=1))]

In [16]:
# retain the minority class instances
df_minor = df[(df['Label'] == 0) | (df['Label'] == 4) | (df['Label'] == 6) | (df['Label'] == 8) | (
    df['Label'] == 1) | (df['Label'] == 11) | (df['Label'] == 14) | (df['Label'] == 12) | (df['Label'] == 9) | (df['Label'] == 7) ]
df_major = df.drop(df_minor.index)


In [17]:
X = df_major.drop(['Label'],axis=1)
X = X[X.replace([np.inf, -np.inf], np.nan).notnull().all(axis=1)]
X = X.astype(np.float64, copy=False)

In [18]:
y = df_major.iloc[:, -1].values.reshape(-1, 1)
y = np.ravel(y)
print(y)

[10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10

In [19]:
X[X > 1e308] = 0

In [20]:
print(np.any(np.isinf(X)))

False


In [21]:
# use k-means to cluster the data samples and select a proportion of data from each cluster
from sklearn.cluster import KMeans
# from default to the number of labels
range_n_clusters = [4,5,6,7,8,9]
silhouette_avg = []
for num_clusters in range_n_clusters:
    # initialise kmeans 
    kmeans = KMeans(init='k-means++', n_clusters=num_clusters)
    kmeans.fit(X)
    cluster_labels = kmeans.labels_
    # Compute the silhouette scores for each sample
    silhouette_avg.append(silhouette_score(X, cluster_labels))
    
for i in range(len(silhouette_avg)):
    print(
        "for n_clusters : ",
        range_n_clusters[i],
        ", the average silhouette_score is :",
        silhouette_avg[i]
    )
#best score is while n_clusters=8

for n_clusters :  4 , the average silhouette_score is : 0.8728500172565108
for n_clusters :  5 , the average silhouette_score is : 0.8922766082468729
for n_clusters :  6 , the average silhouette_score is : 0.9064962836244408
for n_clusters :  7 , the average silhouette_score is : 0.9160997257602449
for n_clusters :  8 , the average silhouette_score is : 0.9216679979587075
for n_clusters :  9 , the average silhouette_score is : 0.905529684736829


In [22]:
kmeans = KMeans(init='k-means++', n_clusters=8)
kmeans.fit_predict(X)

array([1, 1, 1, 1, 2, 0, 0, 1, 0, 1, 0, 0, 1, 0, 2, 0, 1, 0, 1, 2, 1, 0,
       6, 0, 0, 2, 1, 1, 2, 2, 2, 2, 0, 1, 1, 2, 1, 0, 6, 2, 0, 0, 2, 2,
       0, 0, 1, 1, 0, 1, 1, 1, 2, 1, 0, 1, 1, 2, 2, 1, 0, 1, 0, 1, 1, 1,
       2, 2, 2, 0, 1, 1, 1, 1, 0, 0, 4, 1, 1, 6, 1, 0, 0, 1, 2, 0, 6, 0,
       2, 0, 2, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 2, 0, 0, 0, 0, 2, 1, 0, 2,
       1, 0, 0, 2, 1, 1, 0, 1, 1, 1, 0, 0, 2, 2, 0, 1, 0, 2, 1, 1, 2, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 2, 0, 0, 1, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1,
       0, 1, 0, 0, 0, 1, 0, 5, 1, 0, 1, 2, 1, 5, 1, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 2, 0, 2, 1, 0, 1, 0, 0, 2,
       0, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 0, 0, 0, 6, 2, 0, 0,
       2, 1, 1, 2, 1, 1, 1, 0, 1, 0, 1, 2, 2, 0, 0, 1, 0, 0, 7, 1, 1, 2,
       1, 2, 2, 2, 0, 2, 1, 0, 1, 2, 0, 1, 0, 0, 0, 0, 1, 0, 2, 1, 1, 0,
       0, 0, 2, 1, 0, 0, 0, 1, 0, 2, 5, 2, 2, 2, 0, 5, 1, 1, 4, 0, 2, 6,
       0, 0, 0, 2, 0, 0, 6, 0, 2, 2, 1, 0, 1, 0, 0,

In [23]:
klabel = kmeans.labels_
df_major['klabel'] = klabel

In [24]:
df_major['klabel'].value_counts()

1    156
0    143
2     79
3     54
6     16
7      8
5      5
4      4
Name: klabel, dtype: int64

In [25]:
cols = list(df_major)
cols.insert(78, cols.pop(cols.index('Label')))
df_major = df_major.loc[:, cols]

In [26]:
df_major

Unnamed: 0,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,klabel,Label
45582,3019345,3,0,0,0,0,0,0.0,0.0,0,...,0.0,0.0,0,0,0.0,0.0,0,0,1,10
45980,3023856,3,0,0,0,0,0,0.0,0.0,0,...,0.0,0.0,0,0,0.0,0.0,0,0,1,10
43613,3027622,3,0,0,0,0,0,0.0,0.0,0,...,0.0,0.0,0,0,0.0,0.0,0,0,1,10
35702,3043562,3,0,0,0,0,0,0.0,0.0,0,...,0.0,0.0,0,0,0.0,0.0,0,0,1,10
30816,99999651,2,2,16,0,8,8,8.0,0.0,0,...,3.0,0.0,3,3,99999645.0,0.0,99999645,99999645,2,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,5005771,5,3,646,364,646,0,129.2,288.899983,364,...,0.0,0.0,0,0,0.0,0.0,0,0,1,2
1229,586,2,0,0,0,0,0,0.0,0.0,0,...,0.0,0.0,0,0,0.0,0.0,0,0,1,3
1075,137,2,0,0,0,0,0,0.0,0.0,0,...,0.0,0.0,0,0,0.0,0.0,0,0,1,3
1189,104,2,0,0,0,0,0,0.0,0.0,0,...,0.0,0.0,0,0,0.0,0.0,0,0,1,3


In [27]:
def typicalSampling(group):
    name = group.name
    frac = 100
    return group.sample(frac=frac, replace= True)

result = df_major.groupby('klabel', group_keys=False
).apply(typicalSampling)


In [28]:
result['Label'].value_counts()

10    37758
5      5507
2      2531
3       614
13       90
Name: Label, dtype: int64

In [29]:
result = result.drop(['klabel'], axis=1)
result = result.append(df_minor)

In [30]:
result

Unnamed: 0,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
44467,108157744,15,3,2530,0,230,0,168.666667,105.279673,0,...,32,6026774.5,2517471.967972,7806896,4246653,24026048.5,20588974.214381,53247620,6655677,10
46328,108175708,15,3,2380,0,238,0,158.666667,114.994824,0,...,32,6021255.5,2541167.11621,7818132,4224379,24033298.75,20579663.385453,53237413,6672155,10
48631,107358146,15,3,2530,0,230,0,168.666667,105.279673,0,...,32,6154189.5,2292457.862276,7775202,4533177,23762441.25,20651261.873011,53247471,6655999,10
39964,107678289,15,3,2530,0,230,0,168.666667,105.279673,0,...,32,6154751.0,2019564.84932,7582799,4726703,23842195.75,20643352.585955,53247670,6655969,10
33526,107260041,15,3,2530,0,230,0,168.666667,105.279673,0,...,32,6174286.0,2165678.566157,7705652,4642920,23727866.5,20656687.111012,53242903,6627081,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
469282,628,1,1,32,48,32,32,32.0,0.0,48,...,8,0.0,0.0,0,0,0.0,0.0,0,0,12
504000,1393190,8,7,1148,1581,677,0,143.5,228.1296624,1173,...,20,0.0,0.0,0,0,0.0,0.0,0,0,12
420640,18184,1,1,31,87,31,31,31,0,87,...,8,0,0,0,0,0,0,0,0,12
490836,14208808,11,8,674,5918,517,0,61.27272727,155.8294522,1460,...,20,3051885,0,3051885,3051885,6014723,0,6014723,6014723,12


In [31]:
print(result[features])

Empty DataFrame
Columns: []
Index: [44467, 46328, 48631, 39964, 33526, 36901, 50824, 39822, 47925, 52193, 37644, 47597, 37644, 40096, 51604, 36228, 41505, 53003, 36121, 33425, 50247, 53893, 51519, 37792, 39964, 45052, 34147, 38980, 37496, 38271, 39975, 51012, 33513, 42583, 38383, 47597, 50700, 50247, 39964, 36747, 51292, 51826, 33702, 54422, 36467, 39338, 36418, 38383, 47925, 39964, 40798, 35860, 36228, 33513, 42583, 50636, 35371, 49147, 52165, 34794, 53374, 50247, 41304, 39964, 48949, 37033, 50744, 36901, 52193, 33546, 50636, 43552, 39096, 44467, 36798, 45831, 39338, 38008, 50003, 44073, 37976, 37171, 45763, 33418, 33717, 38008, 35860, 44460, 36159, 34486, 51042, 33513, 34513, 47844, 52964, 52165, 50824, 34513, 38678, 36747, ...]

[852454 rows x 0 columns]


In [32]:
result.to_csv('./CICIDS_2018/CICIDS2018_cleaned.csv', index=0)

## Read the cleaned CICIDS2018 dataset
Due to the large size of this dataset, the sampled subsets of CICIDS2018 is used. The subsets are in the "CICIDS_2018" folder.

In [33]:
#Read dataset
df = pd.read_csv('./CICIDS_2018/CICIDS2018_cleaned.csv')

In [34]:
def typicalSampling(group):
    name = group.name
    frac = 0.5
    return group.sample(frac=frac, replace=True)


df_frac = df.apply(typicalSampling)

ValueError: cannot reindex from a duplicate axis

In [None]:
df.keys()

In [None]:
df.Label.value_counts()

In [None]:
df[df > 1e308] = 0

In [None]:
X = df.drop(['Label'], axis=1).values
print(X)
y = df.iloc[:, -1].values.reshape(-1, 1)
y = np.ravel(y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.7,
    test_size=0.3,
    random_state=0,
    stratify=y
)

## Feature engineering
### Feature selection by information gain

In [None]:
from sklearn.feature_selection import mutual_info_classif
importances = mutual_info_classif(X_train, y_train)

In [None]:
print(importances)


In [None]:
print(features)

In [None]:
# calculate the sum of importance scores
f_list = sorted(
    zip(map(lambda x: round(x, 4), importances),
    features), reverse=True)
Sum = 0
fs = []
print(f_list)
for i in range(0, len(f_list)):
    Sum = Sum + f_list[i][0]
    fs.append(f_list[i][1])

In [None]:
# select the important features from top to bottom until the accumulated importance reaches 95%
f_list2 = sorted(
    zip(map(lambda x: round(x, 4), importances/Sum), features), reverse=True)
Sum2 = 0
fs = []
for i in range(0, len(f_list2)):
    Sum2 = Sum2 + f_list2[i][0]
    fs.append(f_list2[i][1])
    if Sum2 >= 0.95:
        break

In [None]:
X_fs = df[fs].values

In [None]:
X_fs.shape