### Load Data

In [1]:
import numpy as np
import pandas as pd
import sklearn, imblearn
import time, os, warnings
warnings.filterwarnings('ignore')

In [2]:
# read files from list directory
files = os.listdir('MachineLearningCVE')
files.remove('Monday-WorkingHours.pcap_ISCX.csv')
files.remove('Tuesday-WorkingHours.pcap_ISCX.csv')
files.remove('Wednesday-workingHours.pcap_ISCX.csv')
display(files)
# set option to display all rows & columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
nRowsRead = None
# selected features from feature selection
cols = [' Bwd Packet Length Std',' PSH Flag Count',' min_seg_size_forward',' Min Packet Length',
        ' ACK Flag Count',' Bwd Packet Length Min',' Fwd IAT Std','Init_Win_bytes_forward',
        ' Flow IAT Max',' Bwd Packets/s',' URG Flag Count','Bwd IAT Total',' Label']
df_raw = pd.DataFrame()

for file in files:
    tmp_df = pd.read_csv(os.path.join('MachineLearningCVE/', file))
    df_raw = pd.concat([df_raw,tmp_df])
    df_raw = df_raw[cols]
    del tmp_df

['Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
 'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
 'Friday-WorkingHours-Morning.pcap_ISCX.csv',
 'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
 'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv']

In [3]:
# change data type for efficient computational cost
data = df_raw.copy()

for column in data.columns:
    if data[column].dtype == np.int64:
        maxVal = data[column].max()
        if maxVal < 120:
            data[column] = data[column].astype(np.int8)
        elif maxVal < 32767:
            data[column] = data[column].astype(np.int16)
        else:
            data[column] = data[column].astype(np.int32)
            
    if data[column].dtype == np.float64:
        maxVal = data[column].max()
        minVal = data[data[column]>0][column]
        if maxVal < 120 and minVal>0.01 :
            data[column] = data[column].astype(np.float16)
        else:
            data[column] = data[column].astype(np.float32)

attackType = data[' Label'].unique()
data[' Label'] = data[' Label'].astype('category')
data[' Label'] = data[' Label'].astype("category").cat.codes # encode target variables
attackType2 = data[' Label'].unique()

In [4]:
attack_df = pd.DataFrame()
attack_df['decoded'] = attackType
attack_df['encoded'] = attackType2
attack_df

Unnamed: 0,decoded,encoded
0,BENIGN,0
1,DDoS,2
2,PortScan,4
3,Bot,1
4,Infiltration,3
5,Web Attack � Brute Force,5
6,Web Attack � XSS,7
7,Web Attack � Sql Injection,6


In [5]:
# define x and y
y = data[' Label'].copy()
X = data.drop([' Label'],axis=1)

### Handle imbalanced data

In [6]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
# undersample major attacks & oversample minor attacks
und1 = RandomUnderSampler('majority')
X_und1, y_und1 = und1.fit_resample(X, y)

In [7]:
y_und1.value_counts()

4    158930
2    128027
1      1966
5      1507
7       652
3        36
0        21
6        21
Name:  Label, dtype: int64

In [8]:
# set 4 and 2 as major
# set other attacks as minor
df = X_und1
df[' Label'] = y_und1
minor = pd.DataFrame(df[(df[' Label']!=4) & (df[' Label']!=2)])
major = pd.DataFrame(df[(df[' Label']==4) | (df[' Label']==2)])
minor[' Label'].value_counts()

1    1966
5    1507
7     652
3      36
0      21
6      21
Name:  Label, dtype: int64

In [9]:
y_und2 =  minor[' Label']
X_und2 =  minor.drop([' Label'],axis=1)
strategy = {1:2000, 5:1600, 7:800, 3:300, 6:200, 0:200}
sm = SMOTE(sampling_strategy=strategy)
X_sm, y_sm = sm.fit_resample(X_und2, y_und2)
X_minor,y_minor = X_sm, y_sm 

In [10]:
major[' Label'].value_counts()

4    158930
2    128027
Name:  Label, dtype: int64

In [11]:
y_und2 =  major[' Label']
X_und2 =  major.drop([' Label'],axis=1)
strategy = {4:10000, 2:6000}
und3 = RandomUnderSampler(sampling_strategy=strategy)
X_und3, y_und3 = und3.fit_resample(X_und2, y_und2)
y_und3.value_counts()

4    10000
2     6000
Name:  Label, dtype: int64

In [12]:
X_major,y_major = X_und3, y_und3
X,y = pd.concat([X_major,X_minor]), pd.concat([y_major,y_minor])
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21100 entries, 0 to 5099
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0    Bwd Packet Length Std  21100 non-null  float32
 1    PSH Flag Count         21100 non-null  int8   
 2    min_seg_size_forward   21100 non-null  int8   
 3    Min Packet Length      21100 non-null  int16  
 4    ACK Flag Count         21100 non-null  int8   
 5    Bwd Packet Length Min  21100 non-null  int16  
 6    Fwd IAT Std            21100 non-null  float32
 7   Init_Win_bytes_forward  21100 non-null  int32  
 8    Flow IAT Max           21100 non-null  int32  
 9    Bwd Packets/s          21100 non-null  float32
 10   URG Flag Count         21100 non-null  int8   
 11  Bwd IAT Total           21100 non-null  int32  
dtypes: float32(3), int16(2), int32(3), int8(4)
memory usage: 824.2 KB


### ML Models

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import tree
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

scaler = StandardScaler()
# extract numerical attributes
cols = X.select_dtypes(include=['float32','float16','int32','int16','int8']).columns
# scale to have zero mean & unit variance  
train_X = scaler.fit_transform(X.select_dtypes(include=['float32','float16','int32','int16','int8']))
X_train,X_test,Y_train,Y_test = train_test_split(train_X,y,train_size=0.70, random_state=2)

# Train SVM Model
start_svm = time.time()
svm = SVC()
svm.fit(X_train, Y_train)
print ('Support Vector Classifier trained')
print('> Elapsed time: ',time.time()-start_svm)

# Train Decision Tree Model
start_dt = time.time()
dt = tree.DecisionTreeClassifier(criterion='gini', max_depth=33, random_state=20, max_features=12, splitter='random')
dt.fit(X_train, Y_train)
print ('Decision Tree Classifier trained')
print('> Elapsed time: ',time.time()-start_dt)

# Train Random Forest Model
start_rf = time.time()
rf = RandomForestClassifier(max_depth=40)
rf.fit(X_train, Y_train)
print ('Random Forest Classifier trained')
print('> Elapsed time: ',time.time()-start_rf)

# Train Gradient Boosting Model
start_gb = time.time()
gb = GradientBoostingClassifier()
gb.fit(X_train, Y_train)
print ('Gradient Boosting Classifier trained')
print('> Elapsed time: ',time.time()-start_gb)

# Train XGBoost Model
start_xgb = time.time()
xgb = XGBClassifier(eval_metric='mlogloss')
xgb.fit(X_train,Y_train)
print ('XGBoost Classifier trained')
print('> Elapsed time: ',time.time()-start_xgb)

Support Vector Classifier trained
> Elapsed time:  0.34607744216918945
Decision Tree Classifier trained
> Elapsed time:  0.008002281188964844
Random Forest Classifier trained
> Elapsed time:  0.5631263256072998
Gradient Boosting Classifier trained
> Elapsed time:  10.085254669189453
XGBoost Classifier trained
> Elapsed time:  1.4473233222961426


In [14]:
# model evaluation (train)
from sklearn import metrics

models = []
models.append(('Support Vector Classifier',svm))
models.append(('Decision Tree Classifier',dt))
models.append(('Random Forest Classifier',rf))
models.append(('Gradient Boosting Classifier',gb))
models.append(('XGBoost Classifier',xgb))

for i, v in models:
    Xpred =  v.predict(X_train)
    scores = cross_val_score(v, X_train, Y_train, cv=10) # k=10 fold cross validation
    accuracy = metrics.accuracy_score(Y_train, Xpred)
    confusion_matrix = metrics.confusion_matrix(Y_train, Xpred)
    classification = metrics.classification_report(Y_train, Xpred)
    print()
    print('========================== {} Model Evaluation =========================='.format(i))
    print()
    print ('Cross Validation Mean Score: ',scores.mean())
    print()
    print ('Model Accuracy: ',accuracy)
    print()
    print('Confusion matrix:' '\n',confusion_matrix)
    print()
    print('Classification report:' '\n',classification) 
    print()



Cross Validation Mean Score:  0.9532807015612471

Model Accuracy:  0.9549732547904395

Confusion matrix:
 [[ 129    1    0    6    1    1    1    1]
 [   0 1350    6    0    0    1    0    2]
 [   0    1 4231    0    0    0    0    0]
 [   1    1   18  177    0    0    0    0]
 [   3    0    0    0 7024    0    6    1]
 [   0    0    0    0    0 1103    2    0]
 [   2    0    0    0    0   78   62    0]
 [   0    0    0    0    0  527    5   28]]

Classification report:
               precision    recall  f1-score   support

           0       0.96      0.92      0.94       140
           1       1.00      0.99      1.00      1359
           2       0.99      1.00      1.00      4232
           3       0.97      0.90      0.93       197
           4       1.00      1.00      1.00      7034
           5       0.65      1.00      0.78      1105
           6       0.82      0.44      0.57       142
           7       0.88      0.05      0.09       560

    accuracy                      

In [15]:
# model evaluation (test)
for i, v in models:
    pred = v.predict(X_test)
    accuracy = metrics.accuracy_score(Y_test,pred)
    confusion_matrix = metrics.confusion_matrix(Y_test, pred)
    classification = metrics.classification_report(Y_test, pred)
    print()
    print('============================== {} Model Test Results =============================='.format(i))
    print()
    print ('Model Accuracy: ',accuracy)
    print()
    print('Confusion matrix:' '\n',confusion_matrix)
    print()
    print('Classification report:' '\n',classification) 
    print()    



Model Accuracy:  0.955141367872374

Confusion matrix:
 [[  53    1    0    4    0    2    0    0]
 [   0  640    0    0    0    1    0    0]
 [   0    0 1768    0    0    0    0    0]
 [   3    1   11   88    0    0    0    0]
 [   3    0    0    0 2961    1    1    0]
 [   0    0    0    0    0  492    3    0]
 [   1    0    0    0    0   25   32    0]
 [   0    0    0    0    1  223    3   13]]

Classification report:
               precision    recall  f1-score   support

           0       0.88      0.88      0.88        60
           1       1.00      1.00      1.00       641
           2       0.99      1.00      1.00      1768
           3       0.96      0.85      0.90       103
           4       1.00      1.00      1.00      2966
           5       0.66      0.99      0.79       495
           6       0.82      0.55      0.66        58
           7       1.00      0.05      0.10       240

    accuracy                           0.96      6331
   macro avg       0.91      0.

## Ensemble (Voting) Model

In [16]:
from sklearn.ensemble import VotingClassifier
# hard vote 5 classifiers
clf1 = SVC()
clf2 = DecisionTreeClassifier(criterion='gini', max_depth=33, random_state=20, max_features=12, splitter='random')
clf3 = RandomForestClassifier(criterion='gini', max_depth=40, random_state=20)
clf4 = GradientBoostingClassifier()
clf5 = XGBClassifier(eval_metric='mlogloss')

vt = VotingClassifier(estimators=[('svm',clf1), ('dt', clf2),('rf',clf3),
                                       ('gb',clf4),('xgb',clf5)],voting='hard',
                           weights=[1,2,2,1,1],flatten_transform=True)
vt.fit(X_train,Y_train)

VotingClassifier(estimators=[('svm', SVC()),
                             ('dt',
                              DecisionTreeClassifier(max_depth=33,
                                                     max_features=12,
                                                     random_state=20,
                                                     splitter='random')),
                             ('rf',
                              RandomForestClassifier(max_depth=40,
                                                     random_state=20)),
                             ('gb', GradientBoostingClassifier()),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None,...
                                            interaction_constraints=None,
     

In [17]:
pred = vt.predict(X_test)
accuracy = metrics.accuracy_score(Y_test,pred)
confusion_matrix = metrics.confusion_matrix(Y_test,pred)
classification = metrics.classification_report(Y_test,pred)
print()
print('============================== {} Model Test Results =============================='.format('Voting Classifier'))
print()
print ('Model Accuracy: ' '\n',accuracy)
print()
print('Confusion matrix:' '\n',confusion_matrix)
print()
print('Classification report:' '\n',classification) 
print()    



Model Accuracy: 
 0.9622492497235824

Confusion matrix:
 [[  57    0    0    1    0    2    0    0]
 [   0  641    0    0    0    0    0    0]
 [   0    1 1767    0    0    0    0    0]
 [   0    1    0  102    0    0    0    0]
 [   0    0    0    0 2965    1    0    0]
 [   0    0    0    0    0  416    2   77]
 [   0    0    0    0    0    1   57    0]
 [   0    0    0    0    1  149    3   87]]

Classification report:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97        60
           1       1.00      1.00      1.00       641
           2       1.00      1.00      1.00      1768
           3       0.99      0.99      0.99       103
           4       1.00      1.00      1.00      2966
           5       0.73      0.84      0.78       495
           6       0.92      0.98      0.95        58
           7       0.53      0.36      0.43       240

    accuracy                           0.96      6331
   macro avg       0.90      

### Save Voted Model

In [18]:
import pickle
pickle.dump(vt, open('voted_model.pkl','wb'))
pickle.dump(svm, open('svm_model.pkl','wb'))
pickle.dump(dt, open('dt_model.pkl','wb'))
pickle.dump(rf, open('rf_model.pkl','wb'))
pickle.dump(gb, open('gb_model.pkl','wb'))
pickle.dump(xgb, open('xgb_model.pkl','wb'))

In [19]:
#!jt -t monokai -f roboto -T -N
#!jt -r