In [1]:
# import libraries
#import boto3, re, sys, math, json, os, sagemaker, 
import urllib.request
import numpy as np                                
import pandas as pd  
import matplotlib
import matplotlib.pyplot as plt  
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime       
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn import neighbors
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
#from sagemaker.predictor import csv_serializer  
#from sagemaker import get_execution_role

# Global variables
Data_Download_Completed = True
Use_Small_DataSet = False
Small_Data_Already_Split = True
Dominate_Component_Number = 10
Feature_Weight_Step_Threshold = 0.01


  from numpy.core.umath_tests import inner1d


In [2]:
# Grab the necessary dataset files from the origin sources
data_dir="/home/will/Desktop/CSC8515/"
unsw_nb15_mainurl = 'https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/'
unsw_nb15_training_file = 'UNSW_NB15_training-set.csv'
unsw_nb15_feature_file = 'NUSW-NB15_features.csv' # misspelling in the origin data file name
unsw_nb15_test_file = 'UNSW_NB15_testing-set.csv'
download_training_url = unsw_nb15_mainurl + unsw_nb15_training_file
download_feature_url = unsw_nb15_mainurl + unsw_nb15_feature_file
download_test_url = unsw_nb15_mainurl + unsw_nb15_test_file

if(Data_Download_Completed !=True):
    try:
      urllib.request.urlretrieve (download_feature_url, unsw_nb15_feature_file)
      print('Success: ' + unsw_nb15_feature_file)
    except Exception as e:
      print('Data load error: ',e)

    try:
      urllib.request.urlretrieve (download_training_url, unsw_nb15_training_file)
      print('Success: ' + unsw_nb15_training_file)
    except Exception as e:
      print('Data load error: ',e)


In [3]:
# Load data
# Must declare data_dir as the directory of training and test files
raw_data_filename = data_dir + unsw_nb15_training_file
if(Use_Small_DataSet !=True):
    print('Loading raw data')
    try:
        raw_data = pd.read_csv(raw_data_filename, index_col=0, header=0)
        print('Success: Data loaded into dataframe.')
    except Exception as e:
        print('Data load error: ',e)
    selected_data = raw_data
    print(selected_data.shape)


Loading raw data
Success: Data loaded into dataframe.
(82332, 44)


In [4]:

# Split smaller dataset for experimentation
samll_filename = data_dir + 'UNSW_NB15_training-small.csv'
if((Use_Small_DataSet ==True) and (Small_Data_Already_Split !=True)):
    remaining_data, selected_data = np.split(raw_data.sample(frac=1, random_state=1729), [int(0.80 * len(raw_data))])
    print(remaining_data.shape, selected_data.shape)
    selected_data.head(10)
    # Write the samll data set to disk
    selected_data.to_csv(samll_filename)

In [5]:
# Read back the small data
if(Use_Small_DataSet ==True):
    try:
        small_data = pd.read_csv(samll_filename, index_col=0, header=0)
        print('Success: Data loaded into dataframe.')
    except Exception as e:
        print('Data load error: ',e)
    selected_data = small_data
    print(selected_data.shape)


In [6]:
    selected_data.head()

Unnamed: 0_level_0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,254,...,1,2,0,0,0,1,2,0,Normal,0
2,8e-06,udp,-,INT,2,0,1762,0,125000.0003,254,...,1,2,0,0,0,1,2,0,Normal,0
3,5e-06,udp,-,INT,2,0,1068,0,200000.0051,254,...,1,3,0,0,0,1,3,0,Normal,0
4,6e-06,udp,-,INT,2,0,900,0,166666.6608,254,...,1,3,0,0,0,2,3,0,Normal,0
5,1e-05,udp,-,INT,2,0,2126,0,100000.0025,254,...,1,3,0,0,0,2,3,0,Normal,0


In [7]:
# Now gather the list of feature names
print('Loading feature list')
try:
    feature_data = pd.read_csv(raw_data_filename, index_col=0, header=None, nrows=1)
    print('Success: Feature list loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)
#print(feature_data)
feature_list = feature_data.values[0]
feature_only_list = np.delete(feature_list, [(feature_list.shape[0]-2), (feature_list.shape[0]-1)])
print(feature_only_list.shape)
feature_only_list

Loading feature list
Success: Feature list loaded into dataframe.
(42,)


array(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss',
       'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb',
       'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean',
       'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl',
       'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd',
       'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports'], dtype=object)

In [8]:
# pd.crosstab(selected_data['attack_cat'], selected_data['service'], margins=True)

In [9]:
print('Transforming data')
# Factorize columns: "proto", "service", "state", "attack_cat"
selected_data['proto'], protocols = pd.factorize(selected_data['proto'])
selected_data['service'], services = pd.factorize(selected_data['service'])
selected_data['state'], states    = pd.factorize(selected_data['state'])
#selected_data['attack_cat'], attacks = pd.factorize(selected_data['attack_cat'])
selected_data.head()

Transforming data


Unnamed: 0_level_0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.1e-05,0,0,0,2,0,496,0,90909.0902,254,...,1,2,0,0,0,1,2,0,Normal,0
2,8e-06,0,0,0,2,0,1762,0,125000.0003,254,...,1,2,0,0,0,1,2,0,Normal,0
3,5e-06,0,0,0,2,0,1068,0,200000.0051,254,...,1,3,0,0,0,1,3,0,Normal,0
4,6e-06,0,0,0,2,0,900,0,166666.6608,254,...,1,3,0,0,0,2,3,0,Normal,0
5,1e-05,0,0,0,2,0,2126,0,100000.0025,254,...,1,3,0,0,0,2,3,0,Normal,0


In [10]:
#pd.crosstab(selected_data['attack_cat'], selected_data['service'], margins=True)

In [11]:
import matplotlib.pyplot as plt
%matplotlib inline
#selected_data.boxplot(column='service', by='attack_cat')

In [12]:
selected_data.describe()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,label
count,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,...,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0
mean,1.006756,8.811216,1.901739,0.862046,18.666472,17.545936,7993.908,13233.79,82410.89,180.967667,...,4.928898,3.663011,7.45636,0.008284,0.008381,0.129743,6.46836,9.164262,0.011126,0.5506
std,4.710444,25.348181,2.642982,1.087298,133.916353,115.574086,171642.3,151471.5,148620.4,101.513358,...,8.389545,5.915386,11.415191,0.091171,0.092485,0.638683,8.543927,11.121413,0.104891,0.497436
min,0.0,0.0,0.0,0.0,1.0,0.0,24.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
25%,8e-06,0.0,0.0,0.0,2.0,0.0,114.0,0.0,28.60611,62.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0
50%,0.014138,2.0,0.0,1.0,6.0,2.0,534.0,178.0,2650.177,254.0,...,1.0,1.0,3.0,0.0,0.0,0.0,3.0,5.0,0.0,1.0
75%,0.71936,2.0,6.0,1.0,12.0,10.0,1280.0,956.0,111111.1,254.0,...,4.0,3.0,6.0,0.0,0.0,0.0,7.0,11.0,0.0,1.0
max,59.999989,130.0,12.0,6.0,10646.0,11018.0,14355770.0,14657530.0,1000000.0,255.0,...,59.0,38.0,63.0,2.0,2.0,16.0,60.0,62.0,1.0,1.0


In [13]:
# separate features (columns 1..42) and labels (column 43, 44)
features= pd.DataFrame(selected_data.iloc[:,:selected_data.shape[1]-2])
#labels= selected_data.iloc[:,selected_data.shape[1]-1:]
attack_labels=pd.DataFrame(selected_data['attack_cat'])
print('features shape is:', features.shape, 'labels shape is:', attack_labels.shape)
attack_labels.head(10)
df_labels= pd.DataFrame(selected_data.iloc[:,selected_data.shape[1]-2:])


features shape is: (82332, 42) labels shape is: (82332, 1)


In [14]:
# Study the training data
labels_array= attack_labels.values # this becomes a 'horizontal' array
#print(labels)
UNSW_NB15_Label_names2 = np.unique(labels_array, return_counts=True)
#print(UNSW_NB15_Label_names2)
UNSW_NB15_Label_names = {}
for i in range(len(UNSW_NB15_Label_names2[0])):
    UNSW_NB15_Label_names[UNSW_NB15_Label_names2[0][i]] = UNSW_NB15_Label_names2[1][i]
# ordered_UNSW_NB15_Label_names = sorted(UNSW_NB15_Label_names.items(), key=lambda x: x[1], reverse=True)
ordered_UNSW_NB15_Label_names = sorted(UNSW_NB15_Label_names, key=UNSW_NB15_Label_names.__getitem__, reverse=True)
print(ordered_UNSW_NB15_Label_names)
#UNSW_NB15_Label_names = np.sort(UNSW_NB15_Label_names, axis=0, order=Count)
print('Attack Category', '\t\t\t Counts')
short_cat_name = np.copy(ordered_UNSW_NB15_Label_names)
for i, cat in enumerate(short_cat_name):
    if(cat == 'Reconnaissance'):
        short_cat_name[i] = 'Reconn'
for i, cat in enumerate(short_cat_name):
    if(len(cat) < 6):
        print('\t', cat, '\t\t\t\t', UNSW_NB15_Label_names[ordered_UNSW_NB15_Label_names[i]])
    else:
        print('\t', cat, '\t\t\t', UNSW_NB15_Label_names[ordered_UNSW_NB15_Label_names[i]])
print('Total # of Data Points: \t\t', attack_labels.shape[0])

    

['Normal', 'Generic', 'Exploits', 'Fuzzers', 'DoS', 'Reconnaissance', 'Analysis', 'Backdoor', 'Shellcode', 'Worms']
Attack Category 			 Counts
	 Normal 			 37000
	 Generic 			 18871
	 Exploits 			 11132
	 Fuzzers 			 6062
	 DoS 				 4089
	 Reconn 			 3496
	 Analysis 			 677
	 Backdoor 			 583
	 Shellcode 			 378
	 Worms 				 44
Total # of Data Points: 		 82332


In [15]:
# Univariate Selection requries non-negative data
#normalized_features = features.sub(features.mean(axis=0), axis=1)
normalized_features_univ = features
normalized_features_univ = normalized_features_univ.add(normalized_features_univ.min(axis=0), axis=1)
normalized_features_univ = normalized_features_univ.divide(normalized_features_univ.max(axis=0), axis=1)
normalized_features_univ.describe()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
count,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,...,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0
mean,0.01677927,0.067779,0.158478,0.143674,0.001847,0.001592,0.000559,0.000903,0.082411,0.709677,...,0.112415,0.098815,0.119564,0.132131,0.004142,0.00419,0.008109,0.122432,0.161337,0.011126
std,0.07850742,0.194986,0.220248,0.181216,0.012578,0.01049,0.011956,0.010334,0.14862,0.398092,...,0.140302,0.139826,0.151677,0.178362,0.045586,0.046243,0.039918,0.140064,0.17653,0.104891
min,0.0,0.0,0.0,0.0,0.000188,0.0,3e-06,0.0,0.0,0.0,...,0.033333,0.033333,0.051282,0.03125,0.0,0.0,0.0,0.032787,0.031746,0.0
25%,1.333334e-07,0.0,0.0,0.0,0.000282,0.0,1e-05,0.0,2.9e-05,0.243137,...,0.033333,0.033333,0.051282,0.03125,0.0,0.0,0.0,0.032787,0.047619,0.0
50%,0.0002356334,0.015385,0.0,0.166667,0.000657,0.000182,3.9e-05,1.2e-05,0.00265,0.996078,...,0.05,0.033333,0.051282,0.0625,0.0,0.0,0.0,0.065574,0.095238,0.0
75%,0.01198934,0.015385,0.5,0.166667,0.001221,0.000908,9.1e-05,6.5e-05,0.111111,0.996078,...,0.116667,0.083333,0.102564,0.109375,0.0,0.0,0.0,0.131148,0.190476,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
normalized_data_univ = normalized_features_univ.merge(df_labels, how='inner', left_on='id', right_on='id')
normalized_data_univ.head()

Unnamed: 0_level_0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.833334e-07,0.0,0.0,0.0,0.000282,0.0,3.6e-05,0.0,0.090909,0.996078,...,0.051282,0.046875,0.0,0.0,0.0,0.032787,0.047619,0.0,Normal,0
2,1.333334e-07,0.0,0.0,0.0,0.000282,0.0,0.000124,0.0,0.125,0.996078,...,0.051282,0.046875,0.0,0.0,0.0,0.032787,0.047619,0.0,Normal,0
3,8.333335e-08,0.0,0.0,0.0,0.000282,0.0,7.6e-05,0.0,0.2,0.996078,...,0.051282,0.0625,0.0,0.0,0.0,0.032787,0.063492,0.0,Normal,0
4,1e-07,0.0,0.0,0.0,0.000282,0.0,6.4e-05,0.0,0.166667,0.996078,...,0.051282,0.0625,0.0,0.0,0.0,0.04918,0.063492,0.0,Normal,0
5,1.666667e-07,0.0,0.0,0.0,0.000282,0.0,0.00015,0.0,0.1,0.996078,...,0.051282,0.0625,0.0,0.0,0.0,0.04918,0.063492,0.0,Normal,0


In [17]:
# normalized_data_univ.hist(column='service', by='attack_cat')

In [21]:
# This function will sort and return all features that have a big enough step threshold compared to the next smaller weight 
def get_important_features(weighted_comp, Comp_Num, feature_name_list):
    weighted_features = {}
    top_sorted_feature_weight = {}
    for i in range(len(feature_name_list)):
        weighted_features[feature_name_list[i]] = weighted_comp[i]
        
    sorted_features = sorted(weighted_features, key=weighted_features.__getitem__, reverse=True)
    
    for k in sorted_features[0:Comp_Num]:
        last_weight = 1
        if(weighted_features[k] > (Feature_Weight_Step_Threshold*last_weight)):
            # print("{} : {}".format(k, weighted_features[k]))
            top_sorted_feature_weight[k] = weighted_features[k]
            last_weight = weighted_features[k]
    return(top_sorted_feature_weight)


In [22]:
# This function takes a dataset and use univariate statistical tests to find the most dominant features
# Mutual Information test is used in statistics to test the independence of two events.
def UniV_feature_finder(features, labels, feature_name_list):
    # set up a univerate learner
    univ = SelectKBest(score_func=mutual_info_classif, k=Dominate_Component_Number)
    # actually run the fit algorithm
    fit = univ.fit(features, labels)
    # transform our data using the learned transform
    reduced_feature_data = fit.transform(features)
        
    return(get_important_features(fit.scores_, Dominate_Component_Number, feature_name_list))


In [23]:
# This will calculate the combined univariate selection results
print("Univariate Feature selction results:")
total_top_features = {}
for val in ordered_UNSW_NB15_Label_names:
    current_top_features = {}
    if (val != 'Normal'):
        binary_criteria = ['Normal', val]
        print('\nFor attack_cat of', val, 'the dominant features are:')
        binary_data = normalized_data_univ[normalized_data_univ.attack_cat.isin(binary_criteria)]
        # binary_data.head(10)

        # separate features (columns 1..42) and labels (column 43, 44)
        binary_features= binary_data.iloc[:,:binary_data.shape[1]-2]
        binary_attack_labels=binary_data['attack_cat']
        # print('features shape is:', binary_features.shape, 'labels shape is:', binary_attack_labels.shape)

        # print("PCA Feature selction results:")
        # print(PCA_feature_finder(binary_features, binary_attack_labels, feature_only_list))

        current_top_features = UniV_feature_finder(binary_features, binary_attack_labels, feature_only_list)
        total_top_features.update(current_top_features)
        # print("RFE Feature selction results:")
        print(current_top_features)
        
sorted_univ_features = sorted(total_top_features, key=total_top_features.__getitem__, reverse=True)
print("\n\nCumulative Univariate Selection Results are sorted bleow: ")
print(sorted_univ_features)

Univariate Feature selction results:

For attack_cat of Generic the dominant features are:
{'sbytes': 0.603892360942663, 'sload': 0.581698992421003, 'smean': 0.5577039811594244, 'ct_dst_sport_ltm': 0.5103841580337953, 'service': 0.4258819578048434, 'ct_src_dport_ltm': 0.4109983348589368, 'ct_state_ttl': 0.39516823520341804, 'rate': 0.39348015416325444, 'dur': 0.38929358825760185, 'state': 0.3870486453655506}

For attack_cat of Exploits the dominant features are:
{'sbytes': 0.3359513174326336, 'dbytes': 0.2415074348054187, 'smean': 0.23110143469631383, 'dmean': 0.1834216618638107, 'sload': 0.17297147655605505, 'synack': 0.16481269460553527, 'tcprtt': 0.16435911003614878, 'rate': 0.15116210401909336, 'sttl': 0.15026136866356854, 'ct_state_ttl': 0.13643717023647217}

For attack_cat of Fuzzers the dominant features are:
{'sbytes': 0.1690797710047709, 'smean': 0.13594940658861043, 'sload': 0.11814352224105762, 'dbytes': 0.1158434812470277, 'sttl': 0.11333386536994339, 'rate': 0.100591056699

In [24]:
# Normalize the dataset
# z_scaler = StandardScaler()
# normalized_features = z_scaler.fit_transform(features)
# normalized_features[0:3]

In [25]:
sorted_univ_features

['service',
 'ct_src_dport_ltm',
 'synack',
 'tcprtt',
 'sinpkt',
 'dinpkt',
 'ct_srv_dst',
 'dpkts',
 'proto',
 'ct_dst_sport_ltm',
 'dload',
 'ct_state_ttl',
 'state',
 'sbytes',
 'dttl',
 'sload',
 'smean',
 'dmean',
 'dur',
 'dbytes',
 'rate',
 'sttl']

In [26]:
# separate features and labels (column 43)
train_features= normalized_data_univ[sorted_univ_features]
#labels= selected_data.iloc[:,selected_data.shape[1]-1:]
attack_labels=normalized_data_univ['attack_cat']
print('features shape is:', train_features.shape, 'labels shape is:', attack_labels.shape)
attack_labels.head(10)

features shape is: (82332, 22) labels shape is: (82332,)


id
1     Normal
2     Normal
3     Normal
4     Normal
5     Normal
6     Normal
7     Normal
8     Normal
9     Normal
10    Normal
Name: attack_cat, dtype: object

In [27]:
# Separate data in train set and test set
df= pd.DataFrame(train_features)
# create training and testing vars
# Note: train_size + test_size < 1.0 means we are subsampling
# Use small numbers for slow classifiers, as KNN, Radius, SVC,...
X_train, X_test, y_train, y_test = train_test_split(df, attack_labels, train_size=0.8, test_size=0.2, random_state=1)
print('X_train, y_train:', X_train.shape, y_train.shape)
print('X_test, y_test:', X_test.shape, y_test.shape)

X_train, y_train: (65865, 22) (65865,)
X_test, y_test: (16467, 22) (16467,)


In [28]:
# Function to plot confusion matrix
# https://gist.github.com/zachguo/10296432
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    """pretty print for confusion matrixes"""
    columnwidth = max([len(x) for x in labels] + [1])  # 5 is value length
    empty_cell = " " * columnwidth
    empty_cell0 = " " * (columnwidth-1) + "-"
    
    # Begin CHANGES
    fst_empty_cell = (columnwidth-3)//2 * " " + "t/p" + (columnwidth-3)//2 * " "
    
    if len(fst_empty_cell) < len(empty_cell):
        fst_empty_cell = " " * (len(empty_cell) - len(fst_empty_cell)) + fst_empty_cell
    # Print header
    print("    " + fst_empty_cell, end=" ")
    # End CHANGES
    
    for label in labels:
        print("%{0}s".format(columnwidth) % label, end=" ")
        
    print()
    # Print rows
    for i, label1 in enumerate(labels):
        print("    %{0}s".format(columnwidth) % label1, end=" ")
        for j in range(len(labels)):
            cell = "%{0}.1f".format(columnwidth) % cm[i, j]
            if hide_zeroes:
                cell = cell if float(cm[i, j]) != 0 else empty_cell0
            if hide_diagonal:
                cell = cell if i != j else empty_cell
            if hide_threshold:
                cell = cell if cm[i, j] > hide_threshold else empty_cell
            print(cell, end=" ")
        print()
        

In [29]:
# Training, choose model by commenting/uncommenting clf=
print('Training model...')
clf= RandomForestClassifier(n_jobs=-1, random_state=3, n_estimators=102)
#, max_features=0.8, min_samples_leaf=3, n_estimators=500, min_samples_split=3, random_state=10, verbose=1)
# clf = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, presort=False)
# clf= svm.SVC(kernel='rbf')
#clf= neighbors.KNeighborsClassifier(n_neighbors=1, algorithm='ball_tree', metric='manhattan')

trained_model= clf.fit(X_train, y_train)

# Predicting
print('Predicting...')
y_pred = clf.predict(X_test)
#print('Predited labels: \t', y_pred)
#print('True labels: \t\t', y_test)

results = confusion_matrix(y_test, y_pred, ordered_UNSW_NB15_Label_names)
print('\nConfusion matrix:\n', results)

print('Computing performance metrics')
print('\nAccuracy result:', accuracy_score(y_test, y_pred, normalize=True))


print("\nClassification report:")
print(classification_report(y_test,y_pred,ordered_UNSW_NB15_Label_names))


Training model...
Predicting...

Confusion matrix:
 [[7149    2   29  145    1    2    0    0   10    0]
 [  13 3749   65    1   23    1    0    0    1    0]
 [  83    9 1572  131  384   39    1    1    6    0]
 [ 454    1  171  555   48    1    0    1    0    0]
 [  24   11  354   41  378    7    1    0    3    0]
 [   9    1   73    2   46  553    0    2    1    0]
 [   1    0   60   26   20    0   13    0    0    0]
 [   2    0   70   34    9    0    0    3    0    0]
 [  21    0    8    3    1    2    0    0   31    0]
 [   1    0    7    0    1    0    0    0    0    0]]
Computing performance metrics

Accuracy result: 0.8503674014696059

Classification report:
                precision    recall  f1-score   support

        Normal       0.92      0.97      0.95      7338
       Generic       0.99      0.97      0.98      3853
      Exploits       0.65      0.71      0.68      2226
       Fuzzers       0.59      0.45      0.51      1231
           DoS       0.41      0.46      0.44

  'precision', 'predicted', average, warn_for)


In [30]:
print_cm(results, short_cat_name, True)

       t/p       Normal   Generic  Exploits   Fuzzers       DoS    Reconn  Analysis  Backdoor Shellcode     Worms 
       Normal    7149.0       2.0      29.0     145.0       1.0       2.0         -         -      10.0         - 
      Generic      13.0    3749.0      65.0       1.0      23.0       1.0         -         -       1.0         - 
     Exploits      83.0       9.0    1572.0     131.0     384.0      39.0       1.0       1.0       6.0         - 
      Fuzzers     454.0       1.0     171.0     555.0      48.0       1.0         -       1.0         -         - 
          DoS      24.0      11.0     354.0      41.0     378.0       7.0       1.0         -       3.0         - 
       Reconn       9.0       1.0      73.0       2.0      46.0     553.0         -       2.0       1.0         - 
     Analysis       1.0         -      60.0      26.0      20.0         -      13.0         -         -         - 
     Backdoor       2.0         -      70.0      34.0       9.0         -       

In [32]:
sorted(sorted_univ_features)

['ct_dst_sport_ltm',
 'ct_src_dport_ltm',
 'ct_srv_dst',
 'ct_state_ttl',
 'dbytes',
 'dinpkt',
 'dload',
 'dmean',
 'dpkts',
 'dttl',
 'dur',
 'proto',
 'rate',
 'sbytes',
 'service',
 'sinpkt',
 'sload',
 'smean',
 'state',
 'sttl',
 'synack',
 'tcprtt']