In [416]:
# import libraries
#import boto3, re, sys, math, json, os, sagemaker, 
import urllib.request
import numpy as np                                
import pandas as pd  
import matplotlib
import matplotlib.pyplot as plt  
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime       
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn import neighbors
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
#from sagemaker.predictor import csv_serializer  
#from sagemaker import get_execution_role

# Global variables
Data_Download_Completed = True
Use_Small_DataSet = False
Small_Data_Already_Split = True
Dominate_Component_Number = 10
Feature_Weight_Step_Threshold = 0.01


In [417]:
# Grab the necessary dataset files from the origin sources
data_dir="/home/will/Desktop/CSC8515/"
unsw_nb15_mainurl = 'https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/'
unsw_nb15_training_file = 'UNSW_NB15_training-set.csv'
unsw_nb15_feature_file = 'NUSW-NB15_features.csv' # misspelling in the origin data file name
unsw_nb15_test_file = 'UNSW_NB15_testing-set.csv'
download_training_url = unsw_nb15_mainurl + unsw_nb15_training_file
download_feature_url = unsw_nb15_mainurl + unsw_nb15_feature_file
download_test_url = unsw_nb15_mainurl + unsw_nb15_test_file

if(Data_Download_Completed !=True):
    try:
      urllib.request.urlretrieve (download_feature_url, unsw_nb15_feature_file)
      print('Success: ' + unsw_nb15_feature_file)
    except Exception as e:
      print('Data load error: ',e)

    try:
      urllib.request.urlretrieve (download_training_url, unsw_nb15_training_file)
      print('Success: ' + unsw_nb15_training_file)
    except Exception as e:
      print('Data load error: ',e)


In [418]:
# Load data
# Must declare data_dir as the directory of training and test files
raw_data_filename = data_dir + unsw_nb15_training_file
if(Use_Small_DataSet !=True):
    print('Loading raw data')
    try:
        raw_data = pd.read_csv(raw_data_filename, index_col=0, header=0)
        print('Success: Data loaded into dataframe.')
    except Exception as e:
        print('Data load error: ',e)
    selected_data = raw_data
    print(selected_data.shape)


Loading raw data
Success: Data loaded into dataframe.
(82332, 44)


In [419]:

# Split smaller dataset for experimentation
samll_filename = data_dir + 'UNSW_NB15_training-small.csv'
if((Use_Small_DataSet ==True) and (Small_Data_Already_Split !=True)):
    remaining_data, selected_data = np.split(raw_data.sample(frac=1, random_state=1729), [int(0.80 * len(raw_data))])
    print(remaining_data.shape, selected_data.shape)
    selected_data.head(10)
    # Write the samll data set to disk
    selected_data.to_csv(samll_filename)

In [420]:
# Read back the small data
if(Use_Small_DataSet ==True):
    try:
        small_data = pd.read_csv(samll_filename, index_col=0, header=0)
        print('Success: Data loaded into dataframe.')
    except Exception as e:
        print('Data load error: ',e)
    selected_data = small_data
    print(selected_data.shape)


In [421]:
    selected_data.head()

Unnamed: 0_level_0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,254,...,1,2,0,0,0,1,2,0,Normal,0
2,8e-06,udp,-,INT,2,0,1762,0,125000.0003,254,...,1,2,0,0,0,1,2,0,Normal,0
3,5e-06,udp,-,INT,2,0,1068,0,200000.0051,254,...,1,3,0,0,0,1,3,0,Normal,0
4,6e-06,udp,-,INT,2,0,900,0,166666.6608,254,...,1,3,0,0,0,2,3,0,Normal,0
5,1e-05,udp,-,INT,2,0,2126,0,100000.0025,254,...,1,3,0,0,0,2,3,0,Normal,0


In [422]:
# Now gather the list of feature names
print('Loading feature list')
try:
    feature_data = pd.read_csv(raw_data_filename, index_col=0, header=None, nrows=1)
    print('Success: Feature list loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)
#print(feature_data)
feature_list = feature_data.values[0]
feature_only_list = np.delete(feature_list, [(feature_list.shape[0]-2), (feature_list.shape[0]-1)])
print(feature_only_list.shape)
feature_only_list

Loading feature list
Success: Feature list loaded into dataframe.
(42,)


array(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss',
       'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb',
       'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean',
       'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl',
       'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd',
       'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports'], dtype=object)

In [427]:
# pd.crosstab(selected_data['attack_cat'], selected_data['service'], margins=True)

In [428]:
print('Transforming data')
# Factorize columns: "proto", "service", "state", "attack_cat"
selected_data['proto'], protocols = pd.factorize(selected_data['proto'])
selected_data['service'], services = pd.factorize(selected_data['service'])
selected_data['state'], states    = pd.factorize(selected_data['state'])
#selected_data['attack_cat'], attacks = pd.factorize(selected_data['attack_cat'])
selected_data.head()

Transforming data


Unnamed: 0_level_0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.1e-05,0,0,0,2,0,496,0,90909.0902,254,...,1,2,0,0,0,1,2,0,Normal,0
2,8e-06,0,0,0,2,0,1762,0,125000.0003,254,...,1,2,0,0,0,1,2,0,Normal,0
3,5e-06,0,0,0,2,0,1068,0,200000.0051,254,...,1,3,0,0,0,1,3,0,Normal,0
4,6e-06,0,0,0,2,0,900,0,166666.6608,254,...,1,3,0,0,0,2,3,0,Normal,0
5,1e-05,0,0,0,2,0,2126,0,100000.0025,254,...,1,3,0,0,0,2,3,0,Normal,0


In [429]:
#pd.crosstab(selected_data['attack_cat'], selected_data['service'], margins=True)

In [430]:
import matplotlib.pyplot as plt
%matplotlib inline
#selected_data.boxplot(column='service', by='attack_cat')

In [431]:
selected_data.describe()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,label
count,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,...,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0
mean,1.006756,8.811216,1.901739,0.862046,18.666472,17.545936,7993.908,13233.79,82410.89,180.967667,...,4.928898,3.663011,7.45636,0.008284,0.008381,0.129743,6.46836,9.164262,0.011126,0.5506
std,4.710444,25.348181,2.642982,1.087298,133.916353,115.574086,171642.3,151471.5,148620.4,101.513358,...,8.389545,5.915386,11.415191,0.091171,0.092485,0.638683,8.543927,11.121413,0.104891,0.497436
min,0.0,0.0,0.0,0.0,1.0,0.0,24.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
25%,8e-06,0.0,0.0,0.0,2.0,0.0,114.0,0.0,28.60611,62.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0
50%,0.014138,2.0,0.0,1.0,6.0,2.0,534.0,178.0,2650.177,254.0,...,1.0,1.0,3.0,0.0,0.0,0.0,3.0,5.0,0.0,1.0
75%,0.71936,2.0,6.0,1.0,12.0,10.0,1280.0,956.0,111111.1,254.0,...,4.0,3.0,6.0,0.0,0.0,0.0,7.0,11.0,0.0,1.0
max,59.999989,130.0,12.0,6.0,10646.0,11018.0,14355770.0,14657530.0,1000000.0,255.0,...,59.0,38.0,63.0,2.0,2.0,16.0,60.0,62.0,1.0,1.0


In [434]:
# separate features (columns 1..42) and labels (column 43, 44)
features= pd.DataFrame(selected_data.iloc[:,:selected_data.shape[1]-2])
#labels= selected_data.iloc[:,selected_data.shape[1]-1:]
attack_labels=pd.DataFrame(selected_data['attack_cat'])
print('features shape is:', features.shape, 'labels shape is:', attack_labels.shape)
attack_labels.head(10)
df_labels= pd.DataFrame(selected_data.iloc[:,selected_data.shape[1]-2:])


features shape is: (82332, 42) labels shape is: (82332, 1)


In [444]:
# Study the training data
labels_array= attack_labels.values # this becomes a 'horizontal' array
#print(labels)
UNSW_NB15_Label_names2 = np.unique(labels_array, return_counts=True)
#print(UNSW_NB15_Label_names2)
UNSW_NB15_Label_names = {}
for i in range(len(UNSW_NB15_Label_names2[0])):
    UNSW_NB15_Label_names[UNSW_NB15_Label_names2[0][i]] = UNSW_NB15_Label_names2[1][i]
# ordered_UNSW_NB15_Label_names = sorted(UNSW_NB15_Label_names.items(), key=lambda x: x[1], reverse=True)
ordered_UNSW_NB15_Label_names = sorted(UNSW_NB15_Label_names, key=UNSW_NB15_Label_names.__getitem__, reverse=True)
print(ordered_UNSW_NB15_Label_names)
#UNSW_NB15_Label_names = np.sort(UNSW_NB15_Label_names, axis=0, order=Count)
print('Attack Category', '\t\t\t Counts')
short_cat_name = np.copy(ordered_UNSW_NB15_Label_names)
for i, cat in enumerate(short_cat_name):
    if(cat == 'Reconnaissance'):
        short_cat_name[i] = 'Reconn'
for i, cat in enumerate(short_cat_name):
    if(len(cat) < 6):
        print('\t', cat, '\t\t\t\t', UNSW_NB15_Label_names[ordered_UNSW_NB15_Label_names[i]])
    else:
        print('\t', cat, '\t\t\t', UNSW_NB15_Label_names[ordered_UNSW_NB15_Label_names[i]])
print('Total # of Data Points: \t\t', attack_labels.shape[0])

    

['Normal', 'Generic', 'Exploits', 'Fuzzers', 'DoS', 'Reconnaissance', 'Analysis', 'Backdoor', 'Shellcode', 'Worms']
Attack Category 			 Counts
	 Normal 			 37000
	 Generic 			 18871
	 Exploits 			 11132
	 Fuzzers 			 6062
	 DoS 				 4089
	 Reconn 			 3496
	 Analysis 			 677
	 Backdoor 			 583
	 Shellcode 			 378
	 Worms 				 44
Total # of Data Points: 		 82332


In [436]:
# Univariate Selection requries non-negative data
#normalized_features = features.sub(features.mean(axis=0), axis=1)
normalized_features_univ = features
normalized_features_univ = normalized_features_univ.add(normalized_features_univ.min(axis=0), axis=1)
normalized_features_univ = normalized_features_univ.divide(normalized_features_univ.max(axis=0), axis=1)
normalized_features_univ.describe()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
count,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,...,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0
mean,0.01677927,0.067779,0.158478,0.143674,0.001847,0.001592,0.000559,0.000903,0.082411,0.709677,...,0.112415,0.098815,0.119564,0.132131,0.004142,0.00419,0.008109,0.122432,0.161337,0.011126
std,0.07850742,0.194986,0.220248,0.181216,0.012578,0.01049,0.011956,0.010334,0.14862,0.398092,...,0.140302,0.139826,0.151677,0.178362,0.045586,0.046243,0.039918,0.140064,0.17653,0.104891
min,0.0,0.0,0.0,0.0,0.000188,0.0,3e-06,0.0,0.0,0.0,...,0.033333,0.033333,0.051282,0.03125,0.0,0.0,0.0,0.032787,0.031746,0.0
25%,1.333334e-07,0.0,0.0,0.0,0.000282,0.0,1e-05,0.0,2.9e-05,0.243137,...,0.033333,0.033333,0.051282,0.03125,0.0,0.0,0.0,0.032787,0.047619,0.0
50%,0.0002356334,0.015385,0.0,0.166667,0.000657,0.000182,3.9e-05,1.2e-05,0.00265,0.996078,...,0.05,0.033333,0.051282,0.0625,0.0,0.0,0.0,0.065574,0.095238,0.0
75%,0.01198934,0.015385,0.5,0.166667,0.001221,0.000908,9.1e-05,6.5e-05,0.111111,0.996078,...,0.116667,0.083333,0.102564,0.109375,0.0,0.0,0.0,0.131148,0.190476,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [437]:
normalized_data_univ = normalized_features_univ.merge(df_labels, how='inner', left_on='id', right_on='id')
normalized_data_univ.head()

Unnamed: 0_level_0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.833334e-07,0.0,0.0,0.0,0.000282,0.0,3.6e-05,0.0,0.090909,0.996078,...,0.051282,0.046875,0.0,0.0,0.0,0.032787,0.047619,0.0,Normal,0
2,1.333334e-07,0.0,0.0,0.0,0.000282,0.0,0.000124,0.0,0.125,0.996078,...,0.051282,0.046875,0.0,0.0,0.0,0.032787,0.047619,0.0,Normal,0
3,8.333335e-08,0.0,0.0,0.0,0.000282,0.0,7.6e-05,0.0,0.2,0.996078,...,0.051282,0.0625,0.0,0.0,0.0,0.032787,0.063492,0.0,Normal,0
4,1e-07,0.0,0.0,0.0,0.000282,0.0,6.4e-05,0.0,0.166667,0.996078,...,0.051282,0.0625,0.0,0.0,0.0,0.04918,0.063492,0.0,Normal,0
5,1.666667e-07,0.0,0.0,0.0,0.000282,0.0,0.00015,0.0,0.1,0.996078,...,0.051282,0.0625,0.0,0.0,0.0,0.04918,0.063492,0.0,Normal,0


In [439]:
# normalized_data_univ.hist(column='service', by='attack_cat')

In [440]:
# This function will add up composite weight for all original features: sum(PCA_component * component weight)
# Then sort and return all features that have a big enough step threshold compared to the next smaller weight 
def get_important_features(weighted_comp, Comp_Num, feature_name_list):
    weighted_features = {}
    top_sorted_feature_weight = {}
    for i in range(len(feature_name_list)):
        weighted_features[feature_name_list[i]] = weighted_comp[i]
        
    sorted_features = sorted(weighted_features, key=weighted_features.__getitem__, reverse=True)
    
    for k in sorted_features[0:Comp_Num]:
        last_weight = 1
        if(weighted_features[k] > (Feature_Weight_Step_Threshold*last_weight)):
            # print("{} : {}".format(k, weighted_features[k]))
            top_sorted_feature_weight[k] = weighted_features[k]
            last_weight = weighted_features[k]
    return(top_sorted_feature_weight)


In [441]:
# This function takes a dataset and use PCA to find the most dominant features
def PCA_feature_finder(features, labels, feature_name_list):
    # set up a PCA learner
    pca = PCA(n_components = Dominate_Component_Number)
    # actually run the fit algorithm
    fit = pca.fit(features)
    # transform our data using the learned transform
    reduced_feature_data = fit.transform(features)
    
    weighted_comp = []
    for i in range(len(feature_name_list)):
        weighted_comp.append(0)
    for i, weight in enumerate(pca.explained_variance_ratio_):
        weighted_comp += pca.components_[i] * weight
        
    return(get_important_features(weighted_comp, Dominate_Component_Number, feature_name_list))

In [442]:
# This function takes a dataset and use univariate statistical tests to find the most dominant features
# Chi Square Test is used in statistics to test the independence of two events.
def UniV_feature_finder(features, lables, feature_name_list):
    # set up a univerate learner
    univ = SelectKBest(score_func=chi2, k=Dominate_Component_Number)
    # actually run the fit algorithm
    fit = univ.fit(features, lables)
    # transform our data using the learned transform
    reduced_feature_data = fit.transform(features)
    
        
    return(get_important_features(fit.scores_, Dominate_Component_Number, feature_name_list))


In [449]:
# This will calculate the combined univariate selection results
print("Univariate Feature selction results:")
univ_features = {}
for val in ordered_UNSW_NB15_Label_names:
    if (val != 'Normal'):
        binary_criteria = ['Normal', val]
        print('\nFor attack_cat of', val, 'the dominant features are:')
        binary_data = normalized_data_univ[normalized_data_univ.attack_cat.isin(binary_criteria)]
        # binary_data.head(10)

        # separate features (columns 1..42) and labels (column 43, 44)
        binary_features= binary_data.iloc[:,:binary_data.shape[1]-2]
        binary_attack_labels=binary_data['attack_cat']
        # print('features shape is:', binary_features.shape, 'labels shape is:', binary_attack_labels.shape)

        # print("PCA Feature selction results:")
        # print(PCA_feature_finder(binary_features, binary_attack_labels, feature_only_list))


        univ_features.update(UniV_feature_finder(binary_features, binary_attack_labels, feature_only_list))
        # print("Univariate Feature selction results:")
        print(UniV_feature_finder(binary_features, binary_attack_labels, feature_only_list))
        
sorted_univ_features = sorted(univ_features, key=univ_features.__getitem__, reverse=True)
print("\n\nCumulative Univariate Selection Results are sorted bleow: ")
print(sorted_univ_features)

Univariate Feature selction results:

For attack_cat of Generic the dominant features are:
{'swin': 12938.096142578797, 'dwin': 12090.01627589768, 'service': 10091.293712046816, 'dttl': 6847.0370974391135, 'ct_dst_sport_ltm': 6141.410097600104, 'stcpb': 6021.5869011116465, 'dtcpb': 5982.515206166755, 'ct_dst_src_ltm': 5292.06587324085, 'ct_src_dport_ltm': 5235.625993564707, 'ct_srv_dst': 4867.104349692721}

For attack_cat of Exploits the dominant features are:
{'proto': 4224.09009496089, 'dttl': 1280.6684514157303, 'dload': 644.8548009907722, 'state': 528.4159173062278, 'sttl': 340.37443685599095, 'is_sm_ips_ports': 275.59221621621623, 'sinpkt': 264.2657396487954, 'ct_srv_dst': 154.84281873737962, 'ct_state_ttl': 146.62430670940444, 'ct_srv_src': 143.3547845257794}

For attack_cat of Fuzzers the dominant features are:
{'proto': 2600.177210642371, 'sttl': 2408.295172486197, 'state': 449.9898586596669, 'dmean': 422.82864908760473, 'dload': 397.80161425726664, 'dttl': 377.57332678251197, 

In [450]:
# Normalize the dataset
# z_scaler = StandardScaler()
# normalized_features = z_scaler.fit_transform(features)
# normalized_features[0:3]

In [451]:
#normalized_features = features.sub(features.mean(axis=0), axis=1)
normalized_features_PCA = features
normalized_features_PCA = normalized_features_PCA.sub(normalized_features_PCA.mean(axis=0), axis=1)
normalized_features_PCA = normalized_features_PCA.divide(normalized_features_PCA.std(axis=0), axis=1)
normalized_features_PCA.describe()


Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
count,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,...,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0
mean,3.476857e-15,7.703188e-14,-2.705665e-13,6.04764e-14,1.072367e-14,1.193902e-14,-2.574517e-15,-2.99001e-15,4.654311e-14,1.036637e-13,...,-5.034774e-14,-1.512094e-13,3.029995e-13,5.44592e-14,-2.75541e-15,-2.002353e-14,3.314717e-15,4.094806e-14,-1.086325e-14,-7.248588e-14
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-0.2137285,-0.3476074,-0.7195431,-0.7928335,-0.1319217,-0.1518155,-0.04643325,-0.08736818,-0.554506,-1.782698,...,-0.5636564,-0.4683088,-0.4501838,-0.5655936,-0.09085693,-0.09061681,-0.2031416,-0.6400289,-0.7341029,-0.1060694
25%,-0.2137268,-0.3476074,-0.7195431,-0.7928335,-0.1244543,-0.1518155,-0.0459089,-0.08736818,-0.5543135,-1.171941,...,-0.5636564,-0.4683088,-0.4501838,-0.5655936,-0.09085693,-0.09061681,-0.2031416,-0.6400289,-0.6441863,-0.1060694
50%,-0.2107271,-0.2687063,-0.7195431,0.1268775,-0.09458496,-0.1345106,-0.04346195,-0.08619304,-0.5366742,0.7194357,...,-0.444865,-0.4683088,-0.4501838,-0.3903885,-0.09085693,-0.09061681,-0.2031416,-0.4059445,-0.3744364,-0.1060694
75%,-0.06101248,-0.2687063,1.55062,0.1268775,-0.04978087,-0.0652909,-0.03911571,-0.08105676,0.193111,0.7194357,...,0.03030098,-0.1107209,-0.1120824,-0.1275808,-0.09085693,-0.09061681,-0.2031416,0.06222434,0.1650634,-0.1060694
max,12.52392,4.780966,3.820783,4.725432,79.358,95.18097,83.59119,96.68024,6.174047,0.7292866,...,6.32625,6.445058,5.804691,4.865765,21.8459,21.53449,24.84843,6.265461,4.750811,9.427673


In [457]:
normalized_data_PCA = normalized_features_PCA.merge(df_labels, how='inner', left_on='id', right_on='id')
normalized_data_PCA.head()

Unnamed: 0_level_0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.213726,-0.347607,-0.719543,-0.792833,-0.124454,-0.151815,-0.043683,-0.087368,0.057181,0.719436,...,-0.450184,-0.477991,-0.090857,-0.090617,-0.203142,-0.640029,-0.644186,-0.106069,Normal,0
2,-0.213727,-0.347607,-0.719543,-0.792833,-0.124454,-0.151815,-0.036308,-0.087368,0.286563,0.719436,...,-0.450184,-0.477991,-0.090857,-0.090617,-0.203142,-0.640029,-0.644186,-0.106069,Normal,0
3,-0.213727,-0.347607,-0.719543,-0.792833,-0.124454,-0.151815,-0.040351,-0.087368,0.791205,0.719436,...,-0.450184,-0.390389,-0.090857,-0.090617,-0.203142,-0.640029,-0.55427,-0.106069,Normal,0
4,-0.213727,-0.347607,-0.719543,-0.792833,-0.124454,-0.151815,-0.04133,-0.087368,0.566919,0.719436,...,-0.450184,-0.390389,-0.090857,-0.090617,-0.203142,-0.522987,-0.55427,-0.106069,Normal,0
5,-0.213726,-0.347607,-0.719543,-0.792833,-0.124454,-0.151815,-0.034187,-0.087368,0.118349,0.719436,...,-0.450184,-0.390389,-0.090857,-0.090617,-0.203142,-0.522987,-0.55427,-0.106069,Normal,0


In [458]:
# This will calculate the combined univariate selection results
print("PCA Feature selction results:")
pca_features = {}
for val in ordered_UNSW_NB15_Label_names:
    if (val != 'Normal'):
        binary_criteria = ['Normal', val]
        print('\nFor attack_cat of', val, 'the dominant features are:')
        binary_data = normalized_data_PCA[normalized_data_PCA.attack_cat.isin(binary_criteria)]
        binary_data.head(10)

        # separate features (columns 1..42) and labels (column 43, 44)
        binary_features= binary_data.iloc[:,:binary_data.shape[1]-2]
        binary_attack_labels=binary_data['attack_cat']
        # print('features shape is:', binary_features.shape, 'labels shape is:', binary_attack_labels.shape)


        pca_features.update(PCA_feature_finder(binary_features, binary_attack_labels, feature_only_list))
        # print("PCA Feature selction results:")
        print(PCA_feature_finder(binary_features, binary_attack_labels, feature_only_list))

        #print("Univariate Feature selction results:")
        #print(UniV_feature_finder(binary_features, binary_attack_labels, feature_only_list))

        
sorted_pca_features = sorted(pca_features, key=pca_features.__getitem__, reverse=True)
print("\n\nCumulative Univariate Selection Results are sorted bleow: ")
print(sorted_pca_features)

PCA Feature selction results:

For attack_cat of Generic the dominant features are:
{'dwin': 0.11234833763131065, 'swin': 0.11002991784667461, 'stcpb': 0.09397486129450121, 'dtcpb': 0.09336462058759169, 'dttl': 0.08448533832147831, 'tcprtt': 0.0755812209342999, 'ackdat': 0.07181708565415147, 'synack': 0.06792054454489431, 'dmean': 0.05916734101703472, 'djit': 0.05320277399891214}

For attack_cat of Exploits the dominant features are:
{'spkts': 0.08883834215091664, 'ct_flw_http_mthd': 0.08882904600848597, 'sloss': 0.08699400639220765, 'sbytes': 0.08645968513700175, 'trans_depth': 0.08211560290847131, 'djit': 0.046961037388597304, 'dwin': 0.04438818957056326, 'swin': 0.03828769696172392, 'dtcpb': 0.037567783209654246, 'is_ftp_login': 0.037261538331276225}

For attack_cat of Fuzzers the dominant features are:
{'sjit': 0.11828569804000662, 'dinpkt': 0.1118058925369726, 'djit': 0.08880422615155649, 'tcprtt': 0.08509168805921553, 'synack': 0.08161253456808629, 'ackdat': 0.07423652604669327, 

In [459]:
sorted_pca_features

['dpkts',
 'dbytes',
 'dloss',
 'stcpb',
 'sbytes',
 'tcprtt',
 'dttl',
 'synack',
 'trans_depth',
 'ct_state_ttl',
 'ackdat',
 'dinpkt',
 'ct_ftp_cmd',
 'is_ftp_login',
 'sjit',
 'sinpkt',
 'djit',
 'ct_flw_http_mthd',
 'response_body_len',
 'dmean',
 'is_sm_ips_ports',
 'dwin',
 'sttl',
 'state',
 'spkts',
 'dur',
 'swin',
 'dtcpb',
 'sloss']

In [460]:
sorted_univ_features

['proto',
 'ct_dst_sport_ltm',
 'ct_src_dport_ltm',
 'sinpkt',
 'stcpb',
 'dtcpb',
 'is_sm_ips_ports',
 'rate',
 'dmean',
 'swin',
 'service',
 'sttl',
 'dttl',
 'ct_state_ttl',
 'ct_flw_http_mthd',
 'dload',
 'ct_srv_src',
 'state',
 'response_body_len',
 'ct_srv_dst',
 'dwin',
 'ct_dst_src_ltm']

In [461]:
# separate features (columns 1..42) and labels (column 43, 44)
features= normalized_data_PCA[sorted_pca_features]
#labels= selected_data.iloc[:,selected_data.shape[1]-1:]
attack_labels=normalized_data_PCA['attack_cat']
print('features shape is:', features.shape, 'labels shape is:', attack_labels.shape)
attack_labels.head(10)

features shape is: (82332, 29) labels shape is: (82332,)


id
1     Normal
2     Normal
3     Normal
4     Normal
5     Normal
6     Normal
7     Normal
8     Normal
9     Normal
10    Normal
Name: attack_cat, dtype: object

In [462]:
# Separate data in train set and test set
df= pd.DataFrame(features)
# create training and testing vars
# Note: train_size + test_size < 1.0 means we are subsampling
# Use small numbers for slow classifiers, as KNN, Radius, SVC,...
X_train, X_test, y_train, y_test = train_test_split(df, labels, train_size=0.8, test_size=0.2, random_state=1)
print('X_train, y_train:', X_train.shape, y_train.shape)
print('X_test, y_test:', X_test.shape, y_test.shape)

X_train, y_train: (65865, 29) (65865,)
X_test, y_test: (16467, 29) (16467,)


In [463]:
# Training, choose model by commenting/uncommenting clf=
print('Training model...')
clf= RandomForestClassifier(n_jobs=-1, random_state=3, n_estimators=102)
#, max_features=0.8, min_samples_leaf=3, n_estimators=500, min_samples_split=3, random_state=10, verbose=1)
# clf = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, presort=False)
# clf= svm.SVC(kernel='rbf')
#clf= neighbors.KNeighborsClassifier(n_neighbors=1, algorithm='ball_tree', metric='manhattan')

trained_model= clf.fit(X_train, y_train)

# Predicting
print('Predicting...')
y_pred = clf.predict(X_test)
#print('Predited labels: \t', y_pred)
#print('True labels: \t\t', y_test)

results = confusion_matrix(y_test, y_pred, ordered_UNSW_NB15_Label_names)
print('\nConfusion matrix:\n', results)

print('Computing performance metrics')
print('\nAccuracy result:', accuracy_score(y_test, y_pred, normalize=True))


print("\nClassification report:")
print(classification_report(y_test,y_pred,ordered_UNSW_NB15_Label_names))


Training model...
Predicting...

Confusion matrix:
 [[7150    6   28  134    5    1    0    0   13    1]
 [  21 3743   57    8   22    1    0    0    1    0]
 [ 125   12 1570   88  405   23    2    1    0    0]
 [ 515    6  150  462   85    3    0    1    9    0]
 [  37    5  260   39  467    8    0    0    3    0]
 [  27    0   68    9   52  530    0    1    0    0]
 [   0    0   53   21   40    0    6    0    0    0]
 [   1    1   69   26   19    0    0    2    0    0]
 [  31    1    9   14    2    2    0    0    7    0]
 [   4    0    4    0    1    0    0    0    0    0]]
Computing performance metrics

Accuracy result: 0.8463593854375417

Classification report:
                precision    recall  f1-score   support

        Normal       0.90      0.97      0.94      7338
       Generic       0.99      0.97      0.98      3853
      Exploits       0.69      0.71      0.70      2226
       Fuzzers       0.58      0.38      0.45      1231
           DoS       0.43      0.57      0.49

In [464]:
# Function to plot confusion matrix
# https://gist.github.com/zachguo/10296432
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    """pretty print for confusion matrixes"""
    columnwidth = max([len(x) for x in labels] + [1])  # 5 is value length
    empty_cell = " " * columnwidth
    empty_cell0 = " " * (columnwidth-1) + "-"
    
    # Begin CHANGES
    fst_empty_cell = (columnwidth-3)//2 * " " + "t/p" + (columnwidth-3)//2 * " "
    
    if len(fst_empty_cell) < len(empty_cell):
        fst_empty_cell = " " * (len(empty_cell) - len(fst_empty_cell)) + fst_empty_cell
    # Print header
    print("    " + fst_empty_cell, end=" ")
    # End CHANGES
    
    for label in labels:
        print("%{0}s".format(columnwidth) % label, end=" ")
        
    print()
    # Print rows
    for i, label1 in enumerate(labels):
        print("    %{0}s".format(columnwidth) % label1, end=" ")
        for j in range(len(labels)):
            cell = "%{0}.1f".format(columnwidth) % cm[i, j]
            if hide_zeroes:
                cell = cell if float(cm[i, j]) != 0 else empty_cell0
            if hide_diagonal:
                cell = cell if i != j else empty_cell
            if hide_threshold:
                cell = cell if cm[i, j] > hide_threshold else empty_cell
            print(cell, end=" ")
        print()
        

In [465]:
print_cm(results, short_cat_name, True)

       t/p       Normal   Generic  Exploits   Fuzzers       DoS    Reconn  Analysis  Backdoor Shellcode     Worms 
       Normal    7150.0       6.0      28.0     134.0       5.0       1.0         -         -      13.0       1.0 
      Generic      21.0    3743.0      57.0       8.0      22.0       1.0         -         -       1.0         - 
     Exploits     125.0      12.0    1570.0      88.0     405.0      23.0       2.0       1.0         -         - 
      Fuzzers     515.0       6.0     150.0     462.0      85.0       3.0         -       1.0       9.0         - 
          DoS      37.0       5.0     260.0      39.0     467.0       8.0         -         -       3.0         - 
       Reconn      27.0         -      68.0       9.0      52.0     530.0         -       1.0         -         - 
     Analysis         -         -      53.0      21.0      40.0         -       6.0         -         -         - 
     Backdoor       1.0       1.0      69.0      26.0      19.0         -       

In [466]:
# separate features (columns 1..42) and labels (column 43, 44)
features= normalized_data_univ[sorted_univ_features]
#labels= selected_data.iloc[:,selected_data.shape[1]-1:]
attack_labels=normalized_data_PCA['attack_cat']
print('features shape is:', features.shape, 'labels shape is:', attack_labels.shape)
attack_labels.head(10)

features shape is: (82332, 22) labels shape is: (82332,)


id
1     Normal
2     Normal
3     Normal
4     Normal
5     Normal
6     Normal
7     Normal
8     Normal
9     Normal
10    Normal
Name: attack_cat, dtype: object

In [467]:
# Separate data in train set and test set
df= pd.DataFrame(features)
# create training and testing vars
# Note: train_size + test_size < 1.0 means we are subsampling
# Use small numbers for slow classifiers, as KNN, Radius, SVC,...
X_train, X_test, y_train, y_test = train_test_split(df, labels, train_size=0.8, test_size=0.2, random_state=1)
print('X_train, y_train:', X_train.shape, y_train.shape)
print('X_test, y_test:', X_test.shape, y_test.shape)

X_train, y_train: (65865, 22) (65865,)
X_test, y_test: (16467, 22) (16467,)


In [468]:
# Training, choose model by commenting/uncommenting clf=
print('Training model...')
clf= RandomForestClassifier(n_jobs=-1, random_state=3, n_estimators=102)
#, max_features=0.8, min_samples_leaf=3, n_estimators=500, min_samples_split=3, random_state=10, verbose=1)
# clf = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, presort=False)
# clf= svm.SVC(kernel='rbf')
#clf= neighbors.KNeighborsClassifier(n_neighbors=1, algorithm='ball_tree', metric='manhattan')

trained_model= clf.fit(X_train, y_train)

# Predicting
print('Predicting...')
y_pred = clf.predict(X_test)
#print('Predited labels: \t', y_pred)
#print('True labels: \t\t', y_test)

results = confusion_matrix(y_test, y_pred, ordered_UNSW_NB15_Label_names)
print('\nConfusion matrix:\n', results)

print('Computing performance metrics')
print('\nAccuracy result:', accuracy_score(y_test, y_pred, normalize=True))


print("\nClassification report:")
print(classification_report(y_test,y_pred,ordered_UNSW_NB15_Label_names))


Training model...
Predicting...

Confusion matrix:
 [[7128    1   38   46    2  122    0    0    1    0]
 [  14 3715   82    2   17   22    1    0    0    0]
 [  76    3 1555  159  377   53    0    1    2    0]
 [ 136    1  167  845   53   27    0    1    1    0]
 [  24    6  378   46  351   14    0    0    0    0]
 [  59    1  111   39   48  426    0    1    2    0]
 [   1    0   55   32   21    0   11    0    0    0]
 [   4    0   65   40    8    0    0    1    0    0]
 [  10    1   19    5    0   30    0    0    1    0]
 [   0    0    4    1    1    3    0    0    0    0]]
Computing performance metrics

Accuracy result: 0.8521892269387259

Classification report:
                precision    recall  f1-score   support

        Normal       0.96      0.97      0.96      7338
       Generic       1.00      0.96      0.98      3853
      Exploits       0.63      0.70      0.66      2226
       Fuzzers       0.70      0.69      0.69      1231
           DoS       0.40      0.43      0.41

  'precision', 'predicted', average, warn_for)


In [469]:
print_cm(results, short_cat_name, True)

       t/p       Normal   Generic  Exploits   Fuzzers       DoS    Reconn  Analysis  Backdoor Shellcode     Worms 
       Normal    7128.0       1.0      38.0      46.0       2.0     122.0         -         -       1.0         - 
      Generic      14.0    3715.0      82.0       2.0      17.0      22.0       1.0         -         -         - 
     Exploits      76.0       3.0    1555.0     159.0     377.0      53.0         -       1.0       2.0         - 
      Fuzzers     136.0       1.0     167.0     845.0      53.0      27.0         -       1.0       1.0         - 
          DoS      24.0       6.0     378.0      46.0     351.0      14.0         -         -         -         - 
       Reconn      59.0       1.0     111.0      39.0      48.0     426.0         -       1.0       2.0         - 
     Analysis       1.0         -      55.0      32.0      21.0         -      11.0         -         -         - 
     Backdoor       4.0         -      65.0      40.0       8.0         -       