In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report

In [2]:
# Load the two UNSW-NB15 CSV partitions; they are pooled here and re-split later.
test = pd.read_csv("UNSW_NB15_testing-set.csv", sep=',', header=0)
train = pd.read_csv("UNSW_NB15_training-set.csv", sep=',', header=0)

# ignore_index=True gives the pooled frame a fresh 0..n-1 index. Without it each
# partition keeps its own 0-based row labels, so the same label occurs twice and
# any label-based lookup or index-aligned operation becomes ambiguous.
# The 'id' column is dropped so it is not used as a feature.
combined_trainTest = pd.concat([train, test], ignore_index=True).drop(['id'], axis=1)

In [3]:
# Quick look at the first five pooled rows.
combined_trainTest.head(n=5)

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,254,...,1,2,0,0,0,1,2,0,Normal,0
1,8e-06,udp,-,INT,2,0,1762,0,125000.0003,254,...,1,2,0,0,0,1,2,0,Normal,0
2,5e-06,udp,-,INT,2,0,1068,0,200000.0051,254,...,1,3,0,0,0,1,3,0,Normal,0
3,6e-06,udp,-,INT,2,0,900,0,166666.6608,254,...,1,3,0,0,0,2,3,0,Normal,0
4,1e-05,udp,-,INT,2,0,2126,0,100000.0025,254,...,1,3,0,0,0,2,3,0,Normal,0


In [4]:
# Integer-encode the categorical (string) columns so the tree models can use them.
cols = ['proto', 'service', 'state']

# One encoder per column: a single shared LabelEncoder would be refit on every
# column in turn and only remember the last column's classes, making it
# impossible to map the codes of the other columns back to their labels.
encoders = {col: preprocessing.LabelEncoder() for col in cols}
for col, encoder in encoders.items():
    combined_trainTest[col] = encoder.fit_transform(combined_trainTest[col])

# Kept for backward compatibility: previously `le` ended up fitted on the last
# column in `cols`.
le = encoders[cols[-1]]

combined_trainTest.head()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1.1e-05,119,0,5,2,0,496,0,90909.0902,254,...,1,2,0,0,0,1,2,0,Normal,0
1,8e-06,119,0,5,2,0,1762,0,125000.0003,254,...,1,2,0,0,0,1,2,0,Normal,0
2,5e-06,119,0,5,2,0,1068,0,200000.0051,254,...,1,3,0,0,0,1,3,0,Normal,0
3,6e-06,119,0,5,2,0,900,0,166666.6608,254,...,1,3,0,0,0,2,3,0,Normal,0
4,1e-05,119,0,5,2,0,2126,0,100000.0025,254,...,1,3,0,0,0,2,3,0,Normal,0


In [5]:
# Features: every column except the binary target and the attack-category column.
X = combined_trainTest.drop(columns=['label', 'attack_cat'])
# Target kept as a single-column DataFrame.
y = combined_trainTest[['label']]

# Hold out 20% of the pooled rows for testing; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
X_train.head(9)

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
98716,0.30219,113,5,4,10,6,902,268,49.637644,254,...,1,1,1,1,0,0,1,1,1,0
37646,0.004814,119,0,2,4,4,568,312,1454.092271,31,...,3,1,1,2,0,0,0,5,2,0
49822,0.552641,113,5,4,10,10,854,1422,34.380368,62,...,1,1,1,2,0,0,1,3,1,0
168484,9e-06,119,2,5,2,0,114,0,111111.1072,254,...,18,17,17,33,0,0,0,18,33,0
156346,9e-06,119,2,5,2,0,114,0,111111.1072,254,...,8,8,8,33,0,0,0,8,33,0
45128,7e-06,119,0,5,2,0,168,0,142857.1409,254,...,3,1,1,1,0,0,0,1,6,0
53325,1.934736,113,9,4,24,20,9008,1872,22.225254,62,...,1,1,1,1,0,0,0,1,1,0
128547,3e-06,119,2,5,2,0,114,0,333333.3215,254,...,17,17,17,29,0,0,0,18,29,0
155807,8e-06,119,2,5,2,0,114,0,125000.0003,254,...,7,7,7,32,0,0,0,8,32,0


In [6]:
# Recursive feature elimination: keep the n features a decision tree ranks best.
n = 40
# n_features_to_select is passed by keyword because scikit-learn >= 1.0 rejects
# it as a positional argument. The fixed random_state makes the selection
# repeatable across runs.
rfe = RFE(DecisionTreeClassifier(random_state=1), n_features_to_select=n).fit(X_train, y_train)

# rfe.support_ is a boolean mask over X_train's columns; di holds the positions
# of the retained features.
di = np.flatnonzero(rfe.support_)
# Named selected_features rather than `list`, which would shadow the builtin.
selected_features = X_train.columns[di]
X_train_RFE, X_test_RFE = X_train[selected_features], X_test[selected_features]
print('new shape', X_train_RFE.shape)



new shape (206138, 40)


In [7]:
# Hyper-parameter grid for the decision tree (5 * 4 * 6 = 120 combinations).
params = {'max_depth': [2, 4, 6, 8, 10],
          'min_samples_split': [2, 3, 4, 5],
          'min_samples_leaf': [1, 2, 4, 6, 8, 10]}

# Seeded so that cross-validation scores and the final model are repeatable.
clf = DecisionTreeClassifier(random_state=1)
gs = GridSearchCV(estimator=clf, param_grid=params, scoring="accuracy",
                cv=10, return_train_score=True, verbose=1)
gs.fit(X_train_RFE, y_train)

# With the default refit=True, GridSearchCV has already retrained
# best_estimator_ on the whole training set, so it is used directly instead of
# being fitted a second time.
y_pred = gs.best_estimator_.predict(X_test_RFE)
y_true = y_test



Fitting 10 folds for each of 120 candidates, totalling 1200 fits


In [8]:
# Fraction of held-out rows whose predicted label matches the true label.
test_accuracy = accuracy_score(y_test, y_pred)
print("Test accuracy:", test_accuracy)

Test accuracy: 0.9317745221693995


In [9]:
# Per-class precision / recall / F1 on the held-out rows.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.90      0.92      0.91     18668
           1       0.95      0.94      0.95     32867

    accuracy                           0.93     51535
   macro avg       0.92      0.93      0.93     51535
weighted avg       0.93      0.93      0.93     51535

