In [56]:
# import libraries
#import boto3, re, sys, math, json, os, sagemaker, 
import urllib.request
import numpy as np                                
import pandas as pd  
import matplotlib
import matplotlib.pyplot as plt  
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime       
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn import neighbors
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.mixture import GaussianMixture
#from sagemaker.predictor import csv_serializer  
#from sagemaker import get_execution_role

# Global variables
Data_Download_Completed = True
Use_Small_DataSet = False
Small_Data_Already_Split = True
Dominate_Component_Number = 10
Feature_Weight_Step_Threshold = 0.01

In [57]:
# Grab the necessary dataset files from the origin sources
data_dir="/home/will/Desktop/CSC8515/"
unsw_nb15_mainurl = 'https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/'
unsw_nb15_training_file = 'UNSW_NB15_training-set.csv'
unsw_nb15_feature_file = 'NUSW-NB15_features.csv' # misspelling in the origin data file name
unsw_nb15_test_file = 'UNSW_NB15_testing-set.csv'
download_training_url = unsw_nb15_mainurl + unsw_nb15_training_file
download_feature_url = unsw_nb15_mainurl + unsw_nb15_feature_file
download_test_url = unsw_nb15_mainurl + unsw_nb15_test_file

if(Data_Download_Completed !=True):
    try:
      urllib.request.urlretrieve (download_feature_url, unsw_nb15_feature_file)
      print('Success: ' + unsw_nb15_feature_file)
    except Exception as e:
      print('Data load error: ',e)

    try:
      urllib.request.urlretrieve (download_training_url, unsw_nb15_training_file)
      print('Success: ' + unsw_nb15_training_file)
    except Exception as e:
      print('Data load error: ',e)


In [58]:
# Load data
# Must declare data_dir as the directory of training and test files
raw_data_filename = data_dir + unsw_nb15_training_file
if(Use_Small_DataSet !=True):
    print('Loading raw data')
    try:
        raw_data = pd.read_csv(raw_data_filename, index_col=0, header=0)
        print('Success: Data loaded into dataframe.')
    except Exception as e:
        print('Data load error: ',e)
    selected_data = raw_data
    print(selected_data.shape)
    
    print('Loading test data')
    try:
        test_data = pd.read_csv(unsw_nb15_test_file, index_col=0, header=0)
        print('Success: Data loaded into dataframe.')
    except Exception as e:
        print('Data load error: ',e)
    print(test_data.shape)


Loading raw data
Success: Data loaded into dataframe.
(82332, 44)
Loading test data
Success: Data loaded into dataframe.
(175341, 44)


In [59]:
    selected_data.head()

Unnamed: 0_level_0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,254,...,1,2,0,0,0,1,2,0,Normal,0
2,8e-06,udp,-,INT,2,0,1762,0,125000.0003,254,...,1,2,0,0,0,1,2,0,Normal,0
3,5e-06,udp,-,INT,2,0,1068,0,200000.0051,254,...,1,3,0,0,0,1,3,0,Normal,0
4,6e-06,udp,-,INT,2,0,900,0,166666.6608,254,...,1,3,0,0,0,2,3,0,Normal,0
5,1e-05,udp,-,INT,2,0,2126,0,100000.0025,254,...,1,3,0,0,0,2,3,0,Normal,0


In [60]:
# Now gather the list of feature names
print('Loading feature list')
try:
    feature_data = pd.read_csv(raw_data_filename, index_col=0, header=None, nrows=1)
    print('Success: Feature list loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)
#print(feature_data)
feature_list = feature_data.values[0]
feature_only_list = np.delete(feature_list, [(feature_list.shape[0]-2), (feature_list.shape[0]-1)])
print(feature_only_list.shape)
feature_only_list

Loading feature list
Success: Feature list loaded into dataframe.
(42,)


array(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss',
       'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb',
       'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean',
       'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl',
       'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd',
       'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports'], dtype=object)

In [61]:
RFE_feature_list = ['state', 'dttl', 'dload', 'swin', 'dwin', 'synack', 'ct_state_ttl', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'ct_srv_dst', 'proto', 'dbytes', 'dloss', 'ct_src_dport_ltm', 'spkts', 'sbytes', 'sttl', 'ct_srv_src', 'sjit', 'dmean', 'sloss', 'smean', 'trans_depth', 'dur']
Ultra_short_feature_list = ['ct_state_ttl', 'dbytes', 'dmean', 'dttl', 'dur', 'sbytes', 'state', 'sttl', 'synack']

In [62]:
print('Transforming data')
# Factorize columns: "proto", "service", "state", "attack_cat"
selected_data['proto'], protocols = pd.factorize(selected_data['proto'])
selected_data['service'], services = pd.factorize(selected_data['service'])
selected_data['state'], states    = pd.factorize(selected_data['state'])
#selected_data['attack_cat'], attacks = pd.factorize(selected_data['attack_cat'])
selected_data.head()

Transforming data


Unnamed: 0_level_0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.1e-05,0,0,0,2,0,496,0,90909.0902,254,...,1,2,0,0,0,1,2,0,Normal,0
2,8e-06,0,0,0,2,0,1762,0,125000.0003,254,...,1,2,0,0,0,1,2,0,Normal,0
3,5e-06,0,0,0,2,0,1068,0,200000.0051,254,...,1,3,0,0,0,1,3,0,Normal,0
4,6e-06,0,0,0,2,0,900,0,166666.6608,254,...,1,3,0,0,0,2,3,0,Normal,0
5,1e-05,0,0,0,2,0,2126,0,100000.0025,254,...,1,3,0,0,0,2,3,0,Normal,0


In [63]:
selected_data.describe()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,label
count,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,...,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0
mean,1.006756,8.811216,1.901739,0.862046,18.666472,17.545936,7993.908,13233.79,82410.89,180.967667,...,4.928898,3.663011,7.45636,0.008284,0.008381,0.129743,6.46836,9.164262,0.011126,0.5506
std,4.710444,25.348181,2.642982,1.087298,133.916353,115.574086,171642.3,151471.5,148620.4,101.513358,...,8.389545,5.915386,11.415191,0.091171,0.092485,0.638683,8.543927,11.121413,0.104891,0.497436
min,0.0,0.0,0.0,0.0,1.0,0.0,24.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
25%,8e-06,0.0,0.0,0.0,2.0,0.0,114.0,0.0,28.60611,62.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0
50%,0.014138,2.0,0.0,1.0,6.0,2.0,534.0,178.0,2650.177,254.0,...,1.0,1.0,3.0,0.0,0.0,0.0,3.0,5.0,0.0,1.0
75%,0.71936,2.0,6.0,1.0,12.0,10.0,1280.0,956.0,111111.1,254.0,...,4.0,3.0,6.0,0.0,0.0,0.0,7.0,11.0,0.0,1.0
max,59.999989,130.0,12.0,6.0,10646.0,11018.0,14355770.0,14657530.0,1000000.0,255.0,...,59.0,38.0,63.0,2.0,2.0,16.0,60.0,62.0,1.0,1.0


In [64]:
print('Transforming test data')
# Factorize columns: "proto", "service", "state", "attack_cat"
test_data['proto'], protocols = pd.factorize(test_data['proto'])
test_data['service'], services = pd.factorize(test_data['service'])
test_data['state'], states    = pd.factorize(test_data['state'])
#selected_data['attack_cat'], attacks = pd.factorize(selected_data['attack_cat'])
test_data.head()

Transforming test data


Unnamed: 0_level_0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.121478,0,0,0,6,4,258,172,74.08749,252,...,1,1,0,0,0,1,1,0,Normal,0
2,0.649902,0,0,0,14,38,734,42014,78.473372,62,...,1,2,0,0,0,1,6,0,Normal,0
3,1.623129,0,0,0,8,16,364,13186,14.170161,62,...,1,3,0,0,0,2,6,0,Normal,0
4,1.681642,0,1,0,12,12,628,770,13.677108,62,...,1,3,1,1,0,2,1,0,Normal,0
5,0.449454,0,0,0,10,6,534,268,33.373826,254,...,1,40,0,0,0,2,39,0,Normal,0


In [65]:
test_data.describe()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,label
count,175341.0,175341.0,175341.0,175341.0,175341.0,175341.0,175341.0,175341.0,175341.0,175341.0,...,175341.0,175341.0,175341.0,175341.0,175341.0,175341.0,175341.0,175341.0,175341.0,175341.0
mean,1.359389,7.553048,2.357863,0.667357,20.298664,18.969591,8844.844,14928.92,95406.19,179.546997,...,5.383538,4.206255,8.729881,0.014948,0.014948,0.133066,6.955789,9.100758,0.015752,0.680622
std,6.480249,21.450758,2.74943,0.720333,136.887597,110.258271,174765.6,143654.2,165401.0,102.940011,...,8.047104,5.783585,10.956186,0.126048,0.126048,0.701208,8.321493,10.756952,0.124516,0.466237
min,0.0,0.0,0.0,0.0,1.0,0.0,28.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
25%,8e-06,0.0,0.0,0.0,2.0,0.0,114.0,0.0,32.78614,62.0,...,1.0,1.0,1.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0
50%,0.001582,1.0,0.0,1.0,2.0,2.0,430.0,164.0,3225.807,254.0,...,1.0,1.0,3.0,0.0,0.0,0.0,3.0,4.0,0.0,1.0
75%,0.668069,1.0,6.0,1.0,12.0,10.0,1418.0,1102.0,125000.0,254.0,...,5.0,3.0,12.0,0.0,0.0,0.0,9.0,12.0,0.0,1.0
max,59.999989,132.0,12.0,8.0,9616.0,10974.0,12965230.0,14655550.0,1000000.0,255.0,...,51.0,46.0,65.0,4.0,4.0,30.0,60.0,62.0,1.0,1.0


In [66]:
# Select RFE based features and labels 
X_train= selected_data[RFE_feature_list]
#labels= selected_data.iloc[:,selected_data.shape[1]-1:]
y_train= selected_data['label']
print('features shape is:', X_train.shape, 'labels shape is:', y_train.shape)
y_train.head(10)

features shape is: (82332, 24) labels shape is: (82332,)


id
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
Name: label, dtype: int64

In [67]:
# Select RFE based tst data features and labels 
X_test= test_data[RFE_feature_list]
y_test= test_data['label']
print('features shape is:', X_test.shape, 'labels shape is:', y_test.shape)
X_test.head(10)
print('y_test shape is:', y_test.shape, 'labels shape is:', y_test.shape)
y_test.head(10)

features shape is: (175341, 24) labels shape is: (175341,)
y_test shape is: (175341,) labels shape is: (175341,)


id
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
Name: label, dtype: int64

In [68]:
# Training, choose model by commenting/uncommenting clf= Random Forest
print('Training model...')
clf= RandomForestClassifier(n_jobs=-1, random_state=3, n_estimators=102)
#, max_features=0.8, min_samples_leaf=3, n_estimators=500, min_samples_split=3, random_state=10, verbose=1)
# clf = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, presort=False)
# clf= svm.SVC(kernel='rbf')
#clf= neighbors.KNeighborsClassifier(n_neighbors=1, algorithm='ball_tree', metric='manhattan')

trained_model= clf.fit(X_train, y_train)

# Predicting
print('Predicting...')
y_pred = clf.predict(X_test)
#print('Predited labels: \t', y_pred)
#print('True labels: \t\t', y_test)

results = confusion_matrix(y_test, y_pred)
print('\nConfusion matrix:\n', results)

print('Computing performance metrics')
print('\nAccuracy result:', accuracy_score(y_test, y_pred, normalize=True))


print("\nClassification report:")
print(classification_report(y_test,y_pred))


Training model...
Predicting...

Confusion matrix:
 [[ 54574   1426]
 [ 15098 104243]]
Computing performance metrics

Accuracy result: 0.9057607747189762

Classification report:
             precision    recall  f1-score   support

          0       0.78      0.97      0.87     56000
          1       0.99      0.87      0.93    119341

avg / total       0.92      0.91      0.91    175341



In [69]:
# Predicting RFE features, 2 class
y_pred = clf.predict(X_test)


In [70]:
y_train= selected_data['attack_cat']
y_test= test_data['attack_cat']
print('features shape is:', y_train.shape, 'labels shape is:', y_test.shape)
y_train.head(10)


features shape is: (82332,) labels shape is: (175341,)


id
1     Normal
2     Normal
3     Normal
4     Normal
5     Normal
6     Normal
7     Normal
8     Normal
9     Normal
10    Normal
Name: attack_cat, dtype: object

In [71]:
# Training, choose model by commenting/uncommenting clf= Random Forest
print('Training model...')
clf= RandomForestClassifier(n_jobs=-1, random_state=3, n_estimators=102)
#, max_features=0.8, min_samples_leaf=3, n_estimators=500, min_samples_split=3, random_state=10, verbose=1)
# clf = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, presort=False)
# clf= svm.SVC(kernel='rbf')
#clf= neighbors.KNeighborsClassifier(n_neighbors=1, algorithm='ball_tree', metric='manhattan')

trained_model= clf.fit(X_train, y_train)

# Predicting
print('Predicting...')
y_pred = clf.predict(X_test)
#print('Predited labels: \t', y_pred)
#print('True labels: \t\t', y_test)

results = confusion_matrix(y_test, y_pred)
print('\nConfusion matrix:\n', results)

print('Computing performance metrics')
print('\nAccuracy result:', accuracy_score(y_test, y_pred, normalize=True))


print("\nClassification report:")
print(classification_report(y_test,y_pred))


Training model...
Predicting...

Confusion matrix:
 [[    0     0   981   340    10   129   539     1     0     0]
 [    0    89   963   468    39   116    53    10     8     0]
 [    0     0  7043  3543   216   912   455    48    47     0]
 [    0     0  8835 20807   494  1336  1444   398    75     4]
 [    0     0   979   580  2114   149 14308    13    41     0]
 [    0     0   225   235    31 39406    83     4    15     1]
 [    0     0    18   333   580     4 55012    43    10     0]
 [    0     2  1162  1314    57   155   261  7519    21     0]
 [    0     0    62   139    40    12   317    42   521     0]
 [    0     0     1    88     3     2    16     0     0    20]]
Computing performance metrics

Accuracy result: 0.7558471777850018

Classification report:
                precision    recall  f1-score   support

      Analysis       0.00      0.00      0.00      2000
      Backdoor       0.98      0.05      0.10      1746
           DoS       0.35      0.57      0.43     12264
 

  'precision', 'predicted', average, warn_for)


In [73]:
# Predicting, RFE feature, 10 class labels
y_pred = clf.predict(X_test)

In [74]:
# Re-run bsaeline methods on the test data
X_train= selected_data.iloc[:,:selected_data.shape[1]-2]
X_test= test_data.iloc[:,:test_data.shape[1]-2]
print('features shape is:', X_train.shape, 'test shape is:', X_test.shape)
X_test.head(10)

features shape is: (82332, 42) test shape is: (175341, 42)


Unnamed: 0_level_0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.121478,0,0,0,6,4,258,172,74.08749,252,...,1,1,1,1,0,0,0,1,1,0
2,0.649902,0,0,0,14,38,734,42014,78.473372,62,...,1,1,1,2,0,0,0,1,6,0
3,1.623129,0,0,0,8,16,364,13186,14.170161,62,...,2,1,1,3,0,0,0,2,6,0
4,1.681642,0,1,0,12,12,628,770,13.677108,62,...,2,1,1,3,1,1,0,2,1,0
5,0.449454,0,0,0,10,6,534,268,33.373826,254,...,2,2,1,40,0,0,0,2,39,0
6,0.380537,0,0,0,10,6,534,268,39.41798,254,...,2,2,1,40,0,0,0,2,39,0
7,0.637109,0,0,0,10,8,534,354,26.683033,254,...,1,1,1,40,0,0,0,1,39,0
8,0.521584,0,0,0,10,8,534,354,32.593026,254,...,3,3,1,40,0,0,0,3,39,0
9,0.542905,0,0,0,10,8,534,354,31.313031,254,...,3,3,1,40,0,0,0,3,39,0
10,0.258687,0,0,0,10,6,534,268,57.985135,254,...,3,3,1,40,0,0,0,3,39,0


In [75]:
y_train= selected_data['label']
y_test= test_data['label']

In [76]:
# Training, choose model by commenting/uncommenting clf= Random Forest
print('Training model...')
clf= RandomForestClassifier(n_jobs=-1, random_state=3, n_estimators=102)
#, max_features=0.8, min_samples_leaf=3, n_estimators=500, min_samples_split=3, random_state=10, verbose=1)
# clf = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, presort=False)
# clf= svm.SVC(kernel='rbf')
#clf= neighbors.KNeighborsClassifier(n_neighbors=1, algorithm='ball_tree', metric='manhattan')

trained_model= clf.fit(X_train, y_train)

# Predicting
print('Predicting...')
y_pred = clf.predict(X_test)
#print('Predited labels: \t', y_pred)
#print('True labels: \t\t', y_test)

results = confusion_matrix(y_test, y_pred)
print('\nConfusion matrix:\n', results)

print('Computing performance metrics')
print('\nAccuracy result:', accuracy_score(y_test, y_pred, normalize=True))


print("\nClassification report:")
print(classification_report(y_test,y_pred))


Training model...
Predicting...

Confusion matrix:
 [[ 54638   1362]
 [ 14794 104547]]
Computing performance metrics

Accuracy result: 0.9078595422633611

Classification report:
             precision    recall  f1-score   support

          0       0.79      0.98      0.87     56000
          1       0.99      0.88      0.93    119341

avg / total       0.92      0.91      0.91    175341



In [77]:
# Baseline, 2 class predict
y_pred = clf.predict(X_test)

In [78]:
y_train= selected_data['attack_cat']
y_test= test_data['attack_cat']
print('features shape is:', y_train.shape, 'labels shape is:', y_test.shape)
y_train.head(10)


features shape is: (82332,) labels shape is: (175341,)


id
1     Normal
2     Normal
3     Normal
4     Normal
5     Normal
6     Normal
7     Normal
8     Normal
9     Normal
10    Normal
Name: attack_cat, dtype: object

In [79]:
# Training, choose model by commenting/uncommenting clf= Random Forest
print('Training model...')
clf= RandomForestClassifier(n_jobs=-1, random_state=3, n_estimators=102)
#, max_features=0.8, min_samples_leaf=3, n_estimators=500, min_samples_split=3, random_state=10, verbose=1)
# clf = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, presort=False)
# clf= svm.SVC(kernel='rbf')
#clf= neighbors.KNeighborsClassifier(n_neighbors=1, algorithm='ball_tree', metric='manhattan')

trained_model= clf.fit(X_train, y_train)

# Predicting
print('Predicting...')
y_pred = clf.predict(X_test)
#print('Predited labels: \t', y_pred)
#print('True labels: \t\t', y_test)

results = confusion_matrix(y_test, y_pred)
print('\nConfusion matrix:\n', results)

print('Computing performance metrics')
print('\nAccuracy result:', accuracy_score(y_test, y_pred, normalize=True))


print("\nClassification report:")
print(classification_report(y_test,y_pred))


Training model...
Predicting...

Confusion matrix:
 [[    0     0   678   639     7    85   590     1     0     0]
 [    0    74   696   743    36    67   117     7     6     0]
 [    0     1  4881  5672   177   595   827    47    64     0]
 [    0     0  6313 23457   464   879  1863   349    65     3]
 [    0     0   696   856  2257    98 14218    21    38     0]
 [    0     0   175   377    29 39315    99     2     2     1]
 [    0     0    12   373   396     1 55169    39    10     0]
 [    0     2   790  1723    55   100   369  7401    51     0]
 [    0     0    32   149    37     8   385    28   494     0]
 [    0     0     0    94     1     2    15     0     0    18]]
Computing performance metrics

Accuracy result: 0.7588983751661049

Classification report:
                precision    recall  f1-score   support

      Analysis       0.00      0.00      0.00      2000
      Backdoor       0.96      0.04      0.08      1746
           DoS       0.34      0.40      0.37     12264
 

  'precision', 'predicted', average, warn_for)


In [80]:
# Baseline, 10 class predict
y_pred = clf.predict(X_test)

In [81]:
# Select Utra-short features and labels 
X_train= selected_data[Ultra_short_feature_list]
X_test= test_data[Ultra_short_feature_list]

In [82]:
y_train= selected_data['label']
y_test= test_data['label']
print('features shape is:', y_train.shape, 'labels shape is:', y_test.shape)
y_train.head(10)

features shape is: (82332,) labels shape is: (175341,)


id
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
Name: label, dtype: int64

In [83]:
# Training, choose model by commenting/uncommenting clf= Random Forest
print('Training model...')
clf= RandomForestClassifier(n_jobs=-1, random_state=3, n_estimators=102)

trained_model= clf.fit(X_train, y_train)

# Predicting
print('Predicting...')
y_pred = clf.predict(X_test)

results = confusion_matrix(y_test, y_pred)
print('\nConfusion matrix:\n', results)

print('Computing performance metrics')
print('\nAccuracy result:', accuracy_score(y_test, y_pred, normalize=True))


print("\nClassification report:")
print(classification_report(y_test,y_pred))


Training model...
Predicting...

Confusion matrix:
 [[ 52229   3771]
 [ 11063 108278]]
Computing performance metrics

Accuracy result: 0.915399136539657

Classification report:
             precision    recall  f1-score   support

          0       0.83      0.93      0.88     56000
          1       0.97      0.91      0.94    119341

avg / total       0.92      0.92      0.92    175341



In [84]:
# Ultra short 9, 2 class predict
y_pred = clf.predict(X_test)

In [85]:
y_train= selected_data['attack_cat']
y_test= test_data['attack_cat']
print('features shape is:', y_train.shape, 'labels shape is:', y_test.shape)
y_train.head(10)


features shape is: (82332,) labels shape is: (175341,)


id
1     Normal
2     Normal
3     Normal
4     Normal
5     Normal
6     Normal
7     Normal
8     Normal
9     Normal
10    Normal
Name: attack_cat, dtype: object

In [86]:
# Training, choose model by commenting/uncommenting clf= Random Forest
print('Training model...')
clf= RandomForestClassifier(n_jobs=-1, random_state=3, n_estimators=102)

trained_model= clf.fit(X_train, y_train)

# Predicting
print('Predicting...')
y_pred = clf.predict(X_test)

results = confusion_matrix(y_test, y_pred)
print('\nConfusion matrix:\n', results)

print('Computing performance metrics')
print('\nAccuracy result:', accuracy_score(y_test, y_pred, normalize=True))


print("\nClassification report:")
print(classification_report(y_test,y_pred))


Training model...
Predicting...

Confusion matrix:
 [[    0     0   462    51    10     7  1469     1     0     0]
 [    0   116   492   132    41    33   923     5     4     0]
 [    0     3  4002  1300   309   157  6447    37     9     0]
 [    0     2  4691 17318   834   266  9957   285    27    13]
 [    0     0   702   489  5862    63 10978    38    48     4]
 [    0     1   132   221    46 39328   266     3     2     1]
 [    0     2    93   384  1428    23 53939   116    15     0]
 [    0     2   635   782   102    25  1525  7415     5     0]
 [    0     0    36   123   120    10   698    29   117     0]
 [    0     0     2    61    13    11    20     4     0    19]]
Computing performance metrics

Accuracy result: 0.7306676704250574

Classification report:
                precision    recall  f1-score   support

      Analysis       0.00      0.00      0.00      2000
      Backdoor       0.92      0.07      0.12      1746
           DoS       0.36      0.33      0.34     12264
 

  'precision', 'predicted', average, warn_for)


In [88]:
# Ultra short 9, 10 class predict
y_pred = clf.predict(X_test)