In [1]:
# import libraries
#import boto3, re, sys, math, json, os, sagemaker, 
import urllib.request
import numpy as np                                
import pandas as pd  
import matplotlib
import matplotlib.pyplot as plt  
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime       
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn import neighbors
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.mixture import GaussianMixture
#from sagemaker.predictor import csv_serializer  
#from sagemaker import get_execution_role

# Global variables
# When not True, the download cell fetches the dataset CSV files before they are loaded.
Data_Download_Completed = True
# When True, the notebook works on the small CSV subset instead of the full training set.
Use_Small_DataSet = False
# When not True (and Use_Small_DataSet is True), the subset is re-sampled from the raw data and written to disk.
Small_Data_Already_Split = True
# Number of PCA components / features requested from each feature-selection method.
Dominate_Component_Number = 10
# Threshold factor the feature-ranking helpers compare a ranked feature's weight against before keeping it.
Feature_Weight_Step_Threshold = 0.01

  from numpy.core.umath_tests import inner1d


In [60]:
# Grab the necessary dataset files from the origin sources
# NOTE(review): data_dir is an absolute, machine-specific path; adjust it before running elsewhere.
data_dir="/home/will/Desktop/CSC8515/"
unsw_nb15_mainurl = 'https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/'
unsw_nb15_training_file = 'UNSW_NB15_training-set.csv'
unsw_nb15_feature_file = 'NUSW-NB15_features.csv' # misspelling in the origin data file name
unsw_nb15_test_file = 'UNSW_NB15_testing-set.csv'
download_training_url = unsw_nb15_mainurl + unsw_nb15_training_file
download_feature_url = unsw_nb15_mainurl + unsw_nb15_feature_file
download_test_url = unsw_nb15_mainurl + unsw_nb15_test_file

if(Data_Download_Completed !=True):
    # Save every file under data_dir: the loading cells read from data_dir + file name,
    # so writing to the current working directory would leave them unable to find the data.
    for download_url, local_name in ((download_feature_url, unsw_nb15_feature_file),
                                     (download_training_url, unsw_nb15_training_file)):
        try:
            urllib.request.urlretrieve(download_url, data_dir + local_name)
            print('Success: ' + local_name)
        except Exception as e:
            # Best effort: report the failure and carry on with the next file.
            print('Data load error: ',e)


In [61]:
# Load data
# Must declare data_dir as the directory of training and test files
raw_data_filename = data_dir + unsw_nb15_training_file
# The raw frame is needed for the full-data run and also when the small subset
# still has to be sampled from it (Small_Data_Already_Split is not True).
if (Use_Small_DataSet != True) or (Small_Data_Already_Split != True):
    print('Loading raw data')
    try:
        raw_data = pd.read_csv(raw_data_filename, index_col=0, header=0)
    except Exception as e:
        print('Data load error: ',e)
        # Re-raise: continuing would fail with a NameError on raw_data, or silently
        # reuse a stale frame left in the kernel by an earlier run.
        raise
    print('Success: Data loaded into dataframe.')
if(Use_Small_DataSet !=True):
    selected_data = raw_data
    print(selected_data.shape)


Loading raw data
Success: Data loaded into dataframe.
(82332, 44)


In [62]:

# Split smaller dataset for experimentation
samll_filename = data_dir + 'UNSW_NB15_training-small.csv'
if((Use_Small_DataSet ==True) and (Small_Data_Already_Split !=True)):
    # Shuffle reproducibly, then cut positionally with .iloc. np.split on a DataFrame
    # goes through DataFrame.swapaxes, which recent pandas versions deprecate.
    shuffled_data = raw_data.sample(frac=1, random_state=1729)
    split_at = int(0.80 * len(shuffled_data))
    remaining_data = shuffled_data.iloc[:split_at]
    selected_data = shuffled_data.iloc[split_at:]   # the last 20% becomes the small set
    print(remaining_data.shape, selected_data.shape)
    # A bare expression inside an if-block is never rendered, so display it explicitly.
    display(selected_data.head(10))
    # Write the small data set to disk
    selected_data.to_csv(samll_filename)

In [63]:
# Read back the small data
if(Use_Small_DataSet ==True):
    try:
        small_data = pd.read_csv(samll_filename, index_col=0, header=0)
    except Exception as e:
        print('Data load error: ',e)
        # Re-raise: without small_data the next statement would fail with a NameError
        # (or silently pick up a stale frame from a previous run).
        raise
    print('Success: Data loaded into dataframe.')
    selected_data = small_data
    print(selected_data.shape)


In [64]:
    # Preview the first rows of the working dataset (categorical columns are still text here).
    selected_data.head()

Unnamed: 0_level_0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,254,...,1,2,0,0,0,1,2,0,Normal,0
2,8e-06,udp,-,INT,2,0,1762,0,125000.0003,254,...,1,2,0,0,0,1,2,0,Normal,0
3,5e-06,udp,-,INT,2,0,1068,0,200000.0051,254,...,1,3,0,0,0,1,3,0,Normal,0
4,6e-06,udp,-,INT,2,0,900,0,166666.6608,254,...,1,3,0,0,0,2,3,0,Normal,0
5,1e-05,udp,-,INT,2,0,2126,0,100000.0025,254,...,1,3,0,0,0,2,3,0,Normal,0


In [65]:
# Now gather the list of feature names
print('Loading feature list')
try:
    # header=None with nrows=1 reads the CSV header line as a data row;
    # index_col=0 moves its first field (the id column name) into the index.
    feature_data = pd.read_csv(raw_data_filename, index_col=0, header=None, nrows=1)
except Exception as e:
    print('Data load error: ',e)
    # Re-raise: feature_data would be undefined (or stale) for the lines below.
    raise
print('Success: Feature list loaded into dataframe.')
feature_list = feature_data.values[0]
# Drop the last two column names (attack_cat, label) so only feature names remain.
feature_only_list = feature_list[:-2].copy()
print(feature_only_list.shape)
feature_only_list

Loading feature list
Success: Feature list loaded into dataframe.
(42,)


array(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss',
       'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb',
       'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean',
       'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl',
       'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd',
       'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports'], dtype=object)

In [66]:
# pd.crosstab(selected_data['attack_cat'], selected_data['service'], margins=True)

In [67]:
print('Transforming data')
# Encode the text columns "proto", "service" and "state" as integer codes.
# The second value returned by factorize (protocols / services / states) holds the
# original values, indexed by code, so the encoding can be reversed later.
proto_codes, protocols = pd.factorize(selected_data['proto'])
service_codes, services = pd.factorize(selected_data['service'])
state_codes, states = pd.factorize(selected_data['state'])
selected_data['proto'] = proto_codes
selected_data['service'] = service_codes
selected_data['state'] = state_codes
# "attack_cat" is left as text on purpose.
selected_data.head()

Transforming data


Unnamed: 0_level_0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.1e-05,0,0,0,2,0,496,0,90909.0902,254,...,1,2,0,0,0,1,2,0,Normal,0
2,8e-06,0,0,0,2,0,1762,0,125000.0003,254,...,1,2,0,0,0,1,2,0,Normal,0
3,5e-06,0,0,0,2,0,1068,0,200000.0051,254,...,1,3,0,0,0,1,3,0,Normal,0
4,6e-06,0,0,0,2,0,900,0,166666.6608,254,...,1,3,0,0,0,2,3,0,Normal,0
5,1e-05,0,0,0,2,0,2126,0,100000.0025,254,...,1,3,0,0,0,2,3,0,Normal,0


In [68]:
# Summary statistics of the numeric columns after the categorical columns were factorized.
selected_data.describe()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,label
count,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,...,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0
mean,1.006756,8.811216,1.901739,0.862046,18.666472,17.545936,7993.908,13233.79,82410.89,180.967667,...,4.928898,3.663011,7.45636,0.008284,0.008381,0.129743,6.46836,9.164262,0.011126,0.5506
std,4.710444,25.348181,2.642982,1.087298,133.916353,115.574086,171642.3,151471.5,148620.4,101.513358,...,8.389545,5.915386,11.415191,0.091171,0.092485,0.638683,8.543927,11.121413,0.104891,0.497436
min,0.0,0.0,0.0,0.0,1.0,0.0,24.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
25%,8e-06,0.0,0.0,0.0,2.0,0.0,114.0,0.0,28.60611,62.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0
50%,0.014138,2.0,0.0,1.0,6.0,2.0,534.0,178.0,2650.177,254.0,...,1.0,1.0,3.0,0.0,0.0,0.0,3.0,5.0,0.0,1.0
75%,0.71936,2.0,6.0,1.0,12.0,10.0,1280.0,956.0,111111.1,254.0,...,4.0,3.0,6.0,0.0,0.0,0.0,7.0,11.0,0.0,1.0
max,59.999989,130.0,12.0,6.0,10646.0,11018.0,14355770.0,14657530.0,1000000.0,255.0,...,59.0,38.0,63.0,2.0,2.0,16.0,60.0,62.0,1.0,1.0


In [69]:
# Separate the features (every column except the last two) from the label columns
# (the last two columns: 'attack_cat' and 'label').
features = selected_data.iloc[:, :-2]
#labels= selected_data.iloc[:,selected_data.shape[1]-1:]
attack_labels = selected_data[['attack_cat']]   # one-column DataFrame
df_labels = selected_data.iloc[:, -2:]
print('features shape is:', features.shape, 'labels shape is:', attack_labels.shape)
# Keep the preview as the last expression of the cell, otherwise it is never rendered.
attack_labels.head(10)


features shape is: (82332, 42) labels shape is: (82332, 1)


In [70]:
# Study the training data: count the rows per attack category
labels_array = attack_labels.values
# np.unique returns the tuple (distinct category names, matching counts)
UNSW_NB15_Label_names2 = np.unique(labels_array, return_counts=True)
category_names, category_counts = UNSW_NB15_Label_names2
# Map category name -> number of rows
UNSW_NB15_Label_names = dict(zip(category_names, category_counts))
# Category names ordered from most to least frequent
ordered_UNSW_NB15_Label_names = sorted(UNSW_NB15_Label_names, key=UNSW_NB15_Label_names.get, reverse=True)
print(ordered_UNSW_NB15_Label_names)
print('Attack Category', '\t\t\t Counts')
# Display names: same order as above, with the longest name abbreviated
short_cat_name = np.copy(ordered_UNSW_NB15_Label_names)
for position in range(len(short_cat_name)):
    if short_cat_name[position] == 'Reconnaissance':
        short_cat_name[position] = 'Reconn'
for shown_name, full_name in zip(short_cat_name, ordered_UNSW_NB15_Label_names):
    # Short names get one extra tab so the counts line up
    tab_gap = '\t\t\t\t' if len(shown_name) < 6 else '\t\t\t'
    print('\t', shown_name, tab_gap, UNSW_NB15_Label_names[full_name])
print('Total # of Data Points: \t\t', attack_labels.shape[0])

    

['Normal', 'Generic', 'Exploits', 'Fuzzers', 'DoS', 'Reconnaissance', 'Analysis', 'Backdoor', 'Shellcode', 'Worms']
Attack Category 			 Counts
	 Normal 			 37000
	 Generic 			 18871
	 Exploits 			 11132
	 Fuzzers 			 6062
	 DoS 				 4089
	 Reconn 			 3496
	 Analysis 			 677
	 Backdoor 			 583
	 Shellcode 			 378
	 Worms 				 44
Total # of Data Points: 		 82332


In [71]:
# Z-score every feature column: subtract the column mean, then divide by the
# column standard deviation (computed on the centred data).
centered_features = features.sub(features.mean(axis=0), axis=1)
normalized_features = centered_features.divide(centered_features.std(axis=0), axis=1)
normalized_features.describe()


Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
count,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,...,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0
mean,3.476857e-15,7.703188e-14,-2.705665e-13,6.04764e-14,1.072367e-14,1.193902e-14,-2.574517e-15,-2.99001e-15,4.654311e-14,1.036637e-13,...,-5.034774e-14,-1.512094e-13,3.029995e-13,5.44592e-14,-2.75541e-15,-2.002353e-14,3.314717e-15,4.094806e-14,-1.086325e-14,-7.248588e-14
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-0.2137285,-0.3476074,-0.7195431,-0.7928335,-0.1319217,-0.1518155,-0.04643325,-0.08736818,-0.554506,-1.782698,...,-0.5636564,-0.4683088,-0.4501838,-0.5655936,-0.09085693,-0.09061681,-0.2031416,-0.6400289,-0.7341029,-0.1060694
25%,-0.2137268,-0.3476074,-0.7195431,-0.7928335,-0.1244543,-0.1518155,-0.0459089,-0.08736818,-0.5543135,-1.171941,...,-0.5636564,-0.4683088,-0.4501838,-0.5655936,-0.09085693,-0.09061681,-0.2031416,-0.6400289,-0.6441863,-0.1060694
50%,-0.2107271,-0.2687063,-0.7195431,0.1268775,-0.09458496,-0.1345106,-0.04346195,-0.08619304,-0.5366742,0.7194357,...,-0.444865,-0.4683088,-0.4501838,-0.3903885,-0.09085693,-0.09061681,-0.2031416,-0.4059445,-0.3744364,-0.1060694
75%,-0.06101248,-0.2687063,1.55062,0.1268775,-0.04978087,-0.0652909,-0.03911571,-0.08105676,0.193111,0.7194357,...,0.03030098,-0.1107209,-0.1120824,-0.1275808,-0.09085693,-0.09061681,-0.2031416,0.06222434,0.1650634,-0.1060694
max,12.52392,4.780966,3.820783,4.725432,79.358,95.18097,83.59119,96.68024,6.174047,0.7292866,...,6.32625,6.445058,5.804691,4.865765,21.8459,21.53449,24.84843,6.265461,4.750811,9.427673


In [72]:
# Re-attach the label columns to the normalised features, matching rows on the 'id' index.
normalized_data = normalized_features.merge(df_labels, on='id', how='inner')
normalized_data.head()

Unnamed: 0_level_0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.213726,-0.347607,-0.719543,-0.792833,-0.124454,-0.151815,-0.043683,-0.087368,0.057181,0.719436,...,-0.450184,-0.477991,-0.090857,-0.090617,-0.203142,-0.640029,-0.644186,-0.106069,Normal,0
2,-0.213727,-0.347607,-0.719543,-0.792833,-0.124454,-0.151815,-0.036308,-0.087368,0.286563,0.719436,...,-0.450184,-0.477991,-0.090857,-0.090617,-0.203142,-0.640029,-0.644186,-0.106069,Normal,0
3,-0.213727,-0.347607,-0.719543,-0.792833,-0.124454,-0.151815,-0.040351,-0.087368,0.791205,0.719436,...,-0.450184,-0.390389,-0.090857,-0.090617,-0.203142,-0.640029,-0.55427,-0.106069,Normal,0
4,-0.213727,-0.347607,-0.719543,-0.792833,-0.124454,-0.151815,-0.04133,-0.087368,0.566919,0.719436,...,-0.450184,-0.390389,-0.090857,-0.090617,-0.203142,-0.522987,-0.55427,-0.106069,Normal,0
5,-0.213726,-0.347607,-0.719543,-0.792833,-0.124454,-0.151815,-0.034187,-0.087368,0.118349,0.719436,...,-0.450184,-0.390389,-0.090857,-0.090617,-0.203142,-0.522987,-0.55427,-0.106069,Normal,0


In [73]:
# This function ranks the original features by their composite weight
# (e.g. sum(PCA_component * component weight)) and keeps the top-ranked ones for as long as
# each weight stays above the step threshold relative to the previously kept weight
def get_important_features(weighted_comp, Comp_Num, feature_name_list, step_threshold=None):
    """Return the highest-weighted features as an ordered {name: weight} dict.

    Parameters
    ----------
    weighted_comp : sequence of numbers
        One weight per feature, in the same order as ``feature_name_list``.
    Comp_Num : int
        Maximum number of top-ranked features to consider.
    feature_name_list : sequence of str
        Feature names.
    step_threshold : float, optional
        Fraction of the previously kept weight that the next weight must exceed.
        Defaults to the notebook-level ``Feature_Weight_Step_Threshold``.

    Returns
    -------
    dict
        Kept features in descending weight order, mapped to their weights.
    """
    if step_threshold is None:
        step_threshold = Feature_Weight_Step_Threshold

    weighted_features = {
        feature_name_list[i]: weighted_comp[i] for i in range(len(feature_name_list))
    }
    sorted_features = sorted(weighted_features, key=weighted_features.get, reverse=True)

    top_sorted_feature_weight = {}
    # Reference weight for the first candidate. It must be initialised once, outside the
    # loop: resetting it on every iteration would discard the update made below and turn
    # the relative step test into a fixed absolute cut-off.
    last_weight = 1
    for name in sorted_features[:Comp_Num]:
        weight = weighted_features[name]
        if weight > step_threshold * last_weight:
            top_sorted_feature_weight[name] = weight
            last_weight = weight
    return top_sorted_feature_weight


In [74]:
# This function takes a dataset and uses PCA to find the most dominant features
def PCA_feature_finder(features, labels, feature_name_list):
    """Rank the original features by their variance-weighted PCA loadings.

    Parameters
    ----------
    features : array-like of shape (n_samples, n_features)
        Data the PCA is fitted on.
    labels : ignored
        Accepted so the signature matches the other feature finders; PCA is unsupervised.
    feature_name_list : sequence of str
        Names of the feature columns, in column order.

    Returns
    -------
    dict
        Result of ``get_important_features`` for the composite weights.
    """
    pca = PCA(n_components=Dominate_Component_Number)
    pca.fit(features)

    # Composite weight per original feature: sum over components of
    # (explained-variance ratio * loading). This must be a NumPy computation:
    # using `+=` on a Python list with an ndarray extends the list instead of adding
    # element-wise, which left every weight at its initial 0.
    weighted_comp = np.dot(pca.explained_variance_ratio_, pca.components_)
    # NOTE(review): loadings are signed, so opposite-signed components can cancel each
    # other; confirm whether absolute loadings were intended.

    return get_important_features(weighted_comp, Dominate_Component_Number, feature_name_list)

In [75]:
# This function takes a dataset and uses univariate statistical tests to find the most dominant features
# The mutual information score measures the dependency between each feature and the labels.
def UniV_feature_finder(features, lables, feature_name_list, random_state=None):
    """Rank features by their mutual-information score against the labels.

    Parameters
    ----------
    features : array-like of shape (n_samples, n_features)
        Feature matrix.
    lables : array-like of shape (n_samples,)
        Class labels (parameter name kept as-is for existing callers).
    feature_name_list : sequence of str
        Names of the feature columns, in column order.
    random_state : int or None, optional
        Seed forwarded to ``mutual_info_classif`` so repeated runs give the same scores.
        ``None`` (default) keeps the unseeded behaviour.

    Returns
    -------
    dict
        Result of ``get_important_features`` for the mutual-information scores.
    """
    if random_state is None:
        score_func = mutual_info_classif
    else:
        def score_func(X, y):
            return mutual_info_classif(X, y, random_state=random_state)

    # set up a univariate learner and fit it; only the per-feature scores are needed,
    # so the data set is not transformed.
    univ = SelectKBest(score_func=score_func, k=Dominate_Component_Number)
    fit = univ.fit(features, lables)

    return get_important_features(fit.scores_, Dominate_Component_Number, feature_name_list)


In [76]:
# This function turns the True/False support array produced by RFE into a dict of selected features
def get_rfe_features(true_false_array, Comp_Num, feature_name_list):
    """Return {feature name: 1} for the features flagged True in ``true_false_array``.

    Each feature gets weight 1 when its flag equals True and 0 otherwise. The names are
    ordered by weight (highest first), at most ``Comp_Num`` of them are considered, and a
    name is kept only when its weight exceeds ``Feature_Weight_Step_Threshold``.
    """
    flag_weight = {}
    for position in range(len(feature_name_list)):
        flag_weight[feature_name_list[position]] = 1 if true_false_array[position] == True else 0

    ranked_names = sorted(flag_weight, key=flag_weight.get, reverse=True)

    # The reference weight is always 1, so the cut-off is the threshold itself.
    cutoff = Feature_Weight_Step_Threshold * 1
    return {
        name: flag_weight[name]
        for name in ranked_names[:Comp_Num]
        if flag_weight[name] > cutoff
    }


In [77]:
# This function takes a dataset and uses Recursive Feature Elimination to find the most dominant features
def RFE_feature_finder(features, labels, feature_name_list):
    """Select features with RFE wrapped around a logistic-regression model.

    Parameters
    ----------
    features : array-like of shape (n_samples, n_features)
        Feature matrix.
    labels : array-like of shape (n_samples,)
        Class labels.
    feature_name_list : sequence of str
        Names of the feature columns, in column order.

    Returns
    -------
    dict
        Result of ``get_rfe_features`` for the fitted selector's support mask.
    """
    # set up a RFE learner that removes one feature per step
    model = LogisticRegression()
    rfe = RFE(estimator=model, n_features_to_select=Dominate_Component_Number, step=1)
    # Only the support mask is used, so the data set is not transformed after fitting.
    fit = rfe.fit(features, labels)

    return get_rfe_features(fit.support_, Dominate_Component_Number, feature_name_list)

In [78]:
# This will calculate the combined RFE selection results:
# for every attack category, run RFE on the binary problem "Normal vs. that category"
# and collect the union of the selected features.
print("RFE Feature selction results:")
total_top_features = {}
for val in ordered_UNSW_NB15_Label_names:
    if val == 'Normal':
        continue
    binary_criteria = ['Normal', val]
    print('\nFor attack_cat of', val, 'the dominant features are:')
    # Filter normalized_data itself: the frame previously indexed here
    # (normalized_data_RFE) is not defined anywhere in this notebook.
    binary_data = normalized_data[normalized_data.attack_cat.isin(binary_criteria)]

    # separate features (all but the last two columns) and labels ('attack_cat')
    binary_features = binary_data.iloc[:, :-2]
    binary_attack_labels = binary_data['attack_cat']

    current_top_features = RFE_feature_finder(binary_features, binary_attack_labels, feature_only_list)
    total_top_features.update(current_top_features)
    print(current_top_features)

# Every selected feature carries weight 1, so this keeps first-seen order.
sorted_top_features = sorted(total_top_features, key=total_top_features.__getitem__, reverse=True)
print("\n\nCumulative RFE Selection Results are sorted bleow: ")
print(sorted_top_features)

RFE Feature selction results:

For attack_cat of Generic the dominant features are:


  if __name__ == '__main__':


{'state': 1, 'dttl': 1, 'dload': 1, 'swin': 1, 'dwin': 1, 'synack': 1, 'ct_state_ttl': 1, 'ct_dst_sport_ltm': 1, 'ct_dst_src_ltm': 1, 'ct_srv_dst': 1}

For attack_cat of Exploits the dominant features are:


  if __name__ == '__main__':


{'proto': 1, 'dbytes': 1, 'dttl': 1, 'dload': 1, 'dloss': 1, 'swin': 1, 'synack': 1, 'ct_src_dport_ltm': 1, 'ct_dst_sport_ltm': 1, 'ct_srv_dst': 1}

For attack_cat of Fuzzers the dominant features are:


  if __name__ == '__main__':


{'spkts': 1, 'sbytes': 1, 'sttl': 1, 'dttl': 1, 'swin': 1, 'ct_srv_src': 1, 'ct_src_dport_ltm': 1, 'ct_dst_sport_ltm': 1, 'ct_dst_src_ltm': 1, 'ct_srv_dst': 1}

For attack_cat of DoS the dominant features are:


  if __name__ == '__main__':


{'proto': 1, 'dttl': 1, 'dload': 1, 'sjit': 1, 'swin': 1, 'synack': 1, 'dmean': 1, 'ct_state_ttl': 1, 'ct_dst_sport_ltm': 1, 'ct_dst_src_ltm': 1}

For attack_cat of Reconnaissance the dominant features are:


  if __name__ == '__main__':


{'sttl': 1, 'dttl': 1, 'sloss': 1, 'swin': 1, 'synack': 1, 'smean': 1, 'trans_depth': 1, 'ct_dst_sport_ltm': 1, 'ct_dst_src_ltm': 1, 'ct_srv_dst': 1}

For attack_cat of Analysis the dominant features are:


  if __name__ == '__main__':


{'proto': 1, 'state': 1, 'dttl': 1, 'swin': 1, 'dwin': 1, 'ct_srv_src': 1, 'ct_state_ttl': 1, 'ct_dst_sport_ltm': 1, 'ct_dst_src_ltm': 1, 'ct_srv_dst': 1}

For attack_cat of Backdoor the dominant features are:


  if __name__ == '__main__':


{'proto': 1, 'state': 1, 'sttl': 1, 'swin': 1, 'synack': 1, 'smean': 1, 'ct_srv_src': 1, 'ct_dst_sport_ltm': 1, 'ct_dst_src_ltm': 1, 'ct_srv_dst': 1}

For attack_cat of Shellcode the dominant features are:


  if __name__ == '__main__':


{'dur': 1, 'sttl': 1, 'dttl': 1, 'swin': 1, 'synack': 1, 'ct_srv_src': 1, 'ct_src_dport_ltm': 1, 'ct_dst_sport_ltm': 1, 'ct_dst_src_ltm': 1, 'ct_srv_dst': 1}

For attack_cat of Worms the dominant features are:


  if __name__ == '__main__':


{'proto': 1, 'sttl': 1, 'dload': 1, 'swin': 1, 'smean': 1, 'trans_depth': 1, 'ct_src_dport_ltm': 1, 'ct_dst_sport_ltm': 1, 'ct_dst_src_ltm': 1, 'ct_srv_dst': 1}


Cumulative Univariate Selection Results are sorted bleow: 
['state', 'dttl', 'dload', 'swin', 'dwin', 'synack', 'ct_state_ttl', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'ct_srv_dst', 'proto', 'dbytes', 'dloss', 'ct_src_dport_ltm', 'spkts', 'sbytes', 'sttl', 'ct_srv_src', 'sjit', 'dmean', 'sloss', 'smean', 'trans_depth', 'dur']


In [79]:
# Normalize the dataset
# z_scaler = StandardScaler()
# normalized_features = z_scaler.fit_transform(features)
# normalized_features[0:3]

In [80]:
# Display the combined list of features selected by RFE across all attack categories.
sorted_top_features

['state',
 'dttl',
 'dload',
 'swin',
 'dwin',
 'synack',
 'ct_state_ttl',
 'ct_dst_sport_ltm',
 'ct_dst_src_ltm',
 'ct_srv_dst',
 'proto',
 'dbytes',
 'dloss',
 'ct_src_dport_ltm',
 'spkts',
 'sbytes',
 'sttl',
 'ct_srv_src',
 'sjit',
 'dmean',
 'sloss',
 'smean',
 'trans_depth',
 'dur']

In [81]:
# Build the model inputs: the RFE-selected feature columns and the attack category labels
attack_labels = normalized_data['attack_cat']
train_features = normalized_data.loc[:, sorted_top_features]
print('features shape is:', train_features.shape, 'labels shape is:', attack_labels.shape)
attack_labels.head(10)

features shape is: (82332, 24) labels shape is: (82332,)


id
1     Normal
2     Normal
3     Normal
4     Normal
5     Normal
6     Normal
7     Normal
8     Normal
9     Normal
10    Normal
Name: attack_cat, dtype: object

In [82]:
# Split the selected features and labels into a training part and a held-out test part
df = pd.DataFrame(train_features)
# Note: train_size + test_size < 1.0 would subsample the data, which helps with
# slow classifiers such as KNN, Radius, SVC,...
split_parts = train_test_split(df, attack_labels, train_size=0.8, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts
for caption, X_part, y_part in (('X_train, y_train:', X_train, y_train),
                                ('X_test, y_test:', X_test, y_test)):
    print(caption, X_part.shape, y_part.shape)

X_train, y_train: (65865, 24) (65865,)
X_test, y_test: (16467, 24) (16467,)


In [83]:
# Text rendering of a confusion matrix
# Adapted from https://gist.github.com/zachguo/10296432
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    """Print a confusion matrix as an aligned text table.

    cm is indexed as cm[row, column]; labels supplies the row and column headings.
    hide_zeroes replaces zero cells with a right-aligned "-", hide_diagonal blanks the
    diagonal, and a truthy hide_threshold blanks every cell that is not above it.
    """
    width = max([len(name) for name in labels] + [1])
    blank = " " * width
    dash = " " * (width - 1) + "-"

    # Top-left corner cell: "t/p" (true / predicted) padded to the column width
    side_pad = (width - 3) // 2 * " "
    corner = side_pad + "t/p" + side_pad
    if len(corner) < len(blank):
        corner = " " * (len(blank) - len(corner)) + corner

    text_fmt = "%{0}s".format(width)
    number_fmt = "%{0}.1f".format(width)

    # Header line
    header_cells = ["    " + corner] + [text_fmt % name for name in labels]
    print(" ".join(header_cells), end=" \n")

    # One line per true label
    column_count = len(labels)
    for row, row_name in enumerate(labels):
        line_cells = ["    " + text_fmt % row_name]
        for col in range(column_count):
            value = cm[row, col]
            text = number_fmt % value
            if hide_zeroes and float(value) == 0:
                text = dash
            if hide_diagonal and row == col:
                text = blank
            if hide_threshold and not (value > hide_threshold):
                text = blank
            line_cells.append(text)
        print(" ".join(line_cells), end=" \n")
        

In [84]:
# Training, choose model by commenting/uncommenting clf=
print('Training model...')
clf= RandomForestClassifier(n_jobs=-1, random_state=3, n_estimators=102)
#, max_features=0.8, min_samples_leaf=3, n_estimators=500, min_samples_split=3, random_state=10, verbose=1)
# clf = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, presort=False)
# clf= svm.SVC(kernel='rbf')
#clf= neighbors.KNeighborsClassifier(n_neighbors=1, algorithm='ball_tree', metric='manhattan')

trained_model= clf.fit(X_train, y_train)

# Predicting
print('Predicting...')
y_pred = clf.predict(X_test)

# Pass the class order by keyword: `labels` is keyword-only in current scikit-learn
# releases, and the keyword form also works on older ones. Rows and columns follow
# ordered_UNSW_NB15_Label_names (most frequent category first).
results = confusion_matrix(y_test, y_pred, labels=ordered_UNSW_NB15_Label_names)
print('\nConfusion matrix:\n', results)

print('Computing performance metrics')
print('\nAccuracy result:', accuracy_score(y_test, y_pred, normalize=True))


print("\nClassification report:")
print(classification_report(y_test, y_pred, labels=ordered_UNSW_NB15_Label_names))


Training model...
Predicting...

Confusion matrix:
 [[7250    3   27   45    2    4    0    0    7    0]
 [  11 3768   49    3   21    0    0    0    1    0]
 [  59    8 1577  156  375   41    0    1    9    0]
 [ 151    0  155  876   45    1    0    1    2    0]
 [  20    4  363   61  357    7    0    0    7    0]
 [   8    1   68    5   49  554    0    1    1    0]
 [   0    0   49   38   20    0   13    0    0    0]
 [   2    1   59   47    6    0    0    2    1    0]
 [  10    1   10    5    5    1    0    0   34    0]
 [   0    0    7    1    1    0    0    0    0    0]]
Computing performance metrics

Accuracy result: 0.8763587781623854

Classification report:
                precision    recall  f1-score   support

        Normal       0.97      0.99      0.98      7338
       Generic       1.00      0.98      0.99      3853
      Exploits       0.67      0.71      0.69      2226
       Fuzzers       0.71      0.71      0.71      1231
           DoS       0.41      0.44      0.42

  'precision', 'predicted', average, warn_for)


In [85]:
# Pretty-print the confusion matrix with the shortened category names;
# the third argument (hide_zeroes=True) renders zero cells as "-".
print_cm(results, short_cat_name, True)

       t/p       Normal   Generic  Exploits   Fuzzers       DoS    Reconn  Analysis  Backdoor Shellcode     Worms 
       Normal    7250.0       3.0      27.0      45.0       2.0       4.0         -         -       7.0         - 
      Generic      11.0    3768.0      49.0       3.0      21.0         -         -         -       1.0         - 
     Exploits      59.0       8.0    1577.0     156.0     375.0      41.0         -       1.0       9.0         - 
      Fuzzers     151.0         -     155.0     876.0      45.0       1.0         -       1.0       2.0         - 
          DoS      20.0       4.0     363.0      61.0     357.0       7.0         -         -       7.0         - 
       Reconn       8.0       1.0      68.0       5.0      49.0     554.0         -       1.0       1.0         - 
     Analysis         -         -      49.0      38.0      20.0         -      13.0         -         -         - 
     Backdoor       2.0       1.0      59.0      47.0       6.0         -       

In [86]:
# Alphabetical view of the RFE-selected feature names (sorted() returns a new list and
# leaves sorted_top_features unchanged).
sorted(sorted_top_features)

['ct_dst_sport_ltm',
 'ct_dst_src_ltm',
 'ct_src_dport_ltm',
 'ct_srv_dst',
 'ct_srv_src',
 'ct_state_ttl',
 'dbytes',
 'dload',
 'dloss',
 'dmean',
 'dttl',
 'dur',
 'dwin',
 'proto',
 'sbytes',
 'sjit',
 'sloss',
 'smean',
 'spkts',
 'state',
 'sttl',
 'swin',
 'synack',
 'trans_depth']

In [87]:
# Test: this is the list of common features in all 3 selection methods
# NOTE(review): hand-maintained list; presumably the intersection of the PCA, univariate
# and RFE results -- confirm against those outputs and update if they change.
ultra_3_short_features = ['ct_state_ttl', 'dbytes', 'dmean', 'dttl', 'dur', 'sbytes', 'state', 'sttl', 'synack']

In [88]:
# Build the model inputs: the ultra_3_short feature columns and the attack category labels
attack_labels = normalized_data['attack_cat']
train_features = normalized_data.loc[:, ultra_3_short_features]
print('features shape is:', train_features.shape, 'labels shape is:', attack_labels.shape)
attack_labels.head(10)

features shape is: (82332, 9) labels shape is: (82332,)


id
1     Normal
2     Normal
3     Normal
4     Normal
5     Normal
6     Normal
7     Normal
8     Normal
9     Normal
10    Normal
Name: attack_cat, dtype: object

In [89]:
# Split the selected features and labels into a training part and a held-out test part
df = pd.DataFrame(train_features)
# Note: train_size + test_size < 1.0 would subsample the data, which helps with
# slow classifiers such as KNN, Radius, SVC,...
split_parts = train_test_split(df, attack_labels, train_size=0.8, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts
for caption, X_part, y_part in (('X_train, y_train:', X_train, y_train),
                                ('X_test, y_test:', X_test, y_test)):
    print(caption, X_part.shape, y_part.shape)

X_train, y_train: (65865, 9) (65865,)
X_test, y_test: (16467, 9) (16467,)


In [90]:
# Training, choose model by commenting/uncommenting clf=
print('Training model...')
clf= RandomForestClassifier(n_jobs=-1, random_state=3, n_estimators=102)
#, max_features=0.8, min_samples_leaf=3, n_estimators=500, min_samples_split=3, random_state=10, verbose=1)
# clf = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, presort=False)
# clf= svm.SVC(kernel='rbf')
#clf= neighbors.KNeighborsClassifier(n_neighbors=1, algorithm='ball_tree', metric='manhattan')

trained_model= clf.fit(X_train, y_train)

# Predicting
print('Predicting...')
y_pred = clf.predict(X_test)

# Pass the class order by keyword: `labels` is keyword-only in current scikit-learn
# releases, and the keyword form also works on older ones. Rows and columns follow
# ordered_UNSW_NB15_Label_names (most frequent category first).
results = confusion_matrix(y_test, y_pred, labels=ordered_UNSW_NB15_Label_names)
print('\nConfusion matrix:\n', results)

print('Computing performance metrics')
print('\nAccuracy result:', accuracy_score(y_test, y_pred, normalize=True))


print("\nClassification report:")
print(classification_report(y_test, y_pred, labels=ordered_UNSW_NB15_Label_names))


Training model...
Predicting...

Confusion matrix:
 [[7029    7   51  226    4    7    0    0   13    1]
 [  18 3744   56   11   21    2    0    0    1    0]
 [ 125   11 1710   69  271   37    0    2    1    0]
 [ 495    7  194  475   46    4    0    1    9    0]
 [  39   13  393   30  328   12    0    1    3    0]
 [  23    0   85    4   32  542    0    1    0    0]
 [  13    0   86    9   12    0    0    0    0    0]
 [   1    0   89   10   11    0    0    7    0    0]
 [  31    1   10   13    0    2    0    0    9    0]
 [   4    0    3    1    1    0    0    0    0    0]]
Computing performance metrics

Accuracy result: 0.8407117264832695

Classification report:
                precision    recall  f1-score   support

        Normal       0.90      0.96      0.93      7338
       Generic       0.99      0.97      0.98      3853
      Exploits       0.64      0.77      0.70      2226
       Fuzzers       0.56      0.39      0.46      1231
           DoS       0.45      0.40      0.42

  'precision', 'predicted', average, warn_for)
