In [1]:
# import libraries
#import boto3, re, sys, math, json, os, sagemaker, 
import urllib.request
import numpy as np                                
import pandas as pd  
import matplotlib
import matplotlib.pyplot as plt  
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime       
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn import neighbors
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.mixture import GaussianMixture
#from sagemaker.predictor import csv_serializer  
#from sagemaker import get_execution_role

# Global variables
Data_Download_Completed = True
Use_Small_DataSet = False
Small_Data_Already_Split = True
Dominate_Component_Number = 10
Feature_Weight_Step_Threshold = 0.01

  from numpy.core.umath_tests import inner1d


In [2]:
# Grab the necessary dataset files from the origin sources
data_dir="/home/will/Desktop/CSC8515/"
unsw_nb15_mainurl = 'https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/'
unsw_nb15_training_file = 'UNSW_NB15_training-set.csv'
unsw_nb15_feature_file = 'NUSW-NB15_features.csv' # misspelling in the origin data file name
unsw_nb15_test_file = 'UNSW_NB15_testing-set.csv'
download_training_url = unsw_nb15_mainurl + unsw_nb15_training_file
download_feature_url = unsw_nb15_mainurl + unsw_nb15_feature_file
download_test_url = unsw_nb15_mainurl + unsw_nb15_test_file

if(Data_Download_Completed !=True):
    try:
      urllib.request.urlretrieve (download_feature_url, unsw_nb15_feature_file)
      print('Success: ' + unsw_nb15_feature_file)
    except Exception as e:
      print('Data load error: ',e)

    try:
      urllib.request.urlretrieve (download_training_url, unsw_nb15_training_file)
      print('Success: ' + unsw_nb15_training_file)
    except Exception as e:
      print('Data load error: ',e)


In [3]:
# Load data
# Must declare data_dir as the directory of training and test files
raw_data_filename = data_dir + unsw_nb15_training_file
if(Use_Small_DataSet !=True):
    print('Loading raw data')
    try:
        raw_data = pd.read_csv(raw_data_filename, index_col=0, header=0)
        print('Success: Data loaded into dataframe.')
    except Exception as e:
        print('Data load error: ',e)
    selected_data = raw_data
    print(selected_data.shape)


Loading raw data
Success: Data loaded into dataframe.
(82332, 44)


In [4]:

# Split smaller dataset for experimentation
samll_filename = data_dir + 'UNSW_NB15_training-small.csv'
if((Use_Small_DataSet ==True) and (Small_Data_Already_Split !=True)):
    remaining_data, selected_data = np.split(raw_data.sample(frac=1, random_state=1729), [int(0.80 * len(raw_data))])
    print(remaining_data.shape, selected_data.shape)
    selected_data.head(10)
    # Write the samll data set to disk
    selected_data.to_csv(samll_filename)

In [5]:
# Read back the small data
if(Use_Small_DataSet ==True):
    try:
        small_data = pd.read_csv(samll_filename, index_col=0, header=0)
        print('Success: Data loaded into dataframe.')
    except Exception as e:
        print('Data load error: ',e)
    selected_data = small_data
    print(selected_data.shape)


In [6]:
    selected_data.head()

Unnamed: 0_level_0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,254,...,1,2,0,0,0,1,2,0,Normal,0
2,8e-06,udp,-,INT,2,0,1762,0,125000.0003,254,...,1,2,0,0,0,1,2,0,Normal,0
3,5e-06,udp,-,INT,2,0,1068,0,200000.0051,254,...,1,3,0,0,0,1,3,0,Normal,0
4,6e-06,udp,-,INT,2,0,900,0,166666.6608,254,...,1,3,0,0,0,2,3,0,Normal,0
5,1e-05,udp,-,INT,2,0,2126,0,100000.0025,254,...,1,3,0,0,0,2,3,0,Normal,0


In [7]:
# Now gather the list of feature names
print('Loading feature list')
try:
    feature_data = pd.read_csv(raw_data_filename, index_col=0, header=None, nrows=1)
    print('Success: Feature list loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)
#print(feature_data)
feature_list = feature_data.values[0]
feature_only_list = np.delete(feature_list, [(feature_list.shape[0]-2), (feature_list.shape[0]-1)])
print(feature_only_list.shape)
feature_only_list

Loading feature list
Success: Feature list loaded into dataframe.
(42,)


array(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss',
       'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb',
       'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean',
       'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl',
       'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd',
       'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports'], dtype=object)

In [8]:
RFE_feature_list = ['state', 'dttl', 'dload', 'swin', 'dwin', 'synack', 'ct_state_ttl', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'ct_srv_dst', 'proto', 'dbytes', 'dloss', 'ct_src_dport_ltm', 'spkts', 'sbytes', 'sttl', 'ct_srv_src', 'sjit', 'dmean', 'sloss', 'smean', 'trans_depth', 'dur']
Ultra_short_feature_list = ['ct_state_ttl', 'dbytes', 'dmean', 'dttl', 'dur', 'sbytes', 'state', 'sttl', 'synack']

In [9]:
print('Transforming data')
# Factorize columns: "proto", "service", "state", "attack_cat"
selected_data['proto'], protocols = pd.factorize(selected_data['proto'])
selected_data['service'], services = pd.factorize(selected_data['service'])
selected_data['state'], states    = pd.factorize(selected_data['state'])
#selected_data['attack_cat'], attacks = pd.factorize(selected_data['attack_cat'])
selected_data.head()

Transforming data


Unnamed: 0_level_0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.1e-05,0,0,0,2,0,496,0,90909.0902,254,...,1,2,0,0,0,1,2,0,Normal,0
2,8e-06,0,0,0,2,0,1762,0,125000.0003,254,...,1,2,0,0,0,1,2,0,Normal,0
3,5e-06,0,0,0,2,0,1068,0,200000.0051,254,...,1,3,0,0,0,1,3,0,Normal,0
4,6e-06,0,0,0,2,0,900,0,166666.6608,254,...,1,3,0,0,0,2,3,0,Normal,0
5,1e-05,0,0,0,2,0,2126,0,100000.0025,254,...,1,3,0,0,0,2,3,0,Normal,0


In [10]:
selected_data.describe()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,label
count,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,...,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0
mean,1.006756,8.811216,1.901739,0.862046,18.666472,17.545936,7993.908,13233.79,82410.89,180.967667,...,4.928898,3.663011,7.45636,0.008284,0.008381,0.129743,6.46836,9.164262,0.011126,0.5506
std,4.710444,25.348181,2.642982,1.087298,133.916353,115.574086,171642.3,151471.5,148620.4,101.513358,...,8.389545,5.915386,11.415191,0.091171,0.092485,0.638683,8.543927,11.121413,0.104891,0.497436
min,0.0,0.0,0.0,0.0,1.0,0.0,24.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
25%,8e-06,0.0,0.0,0.0,2.0,0.0,114.0,0.0,28.60611,62.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0
50%,0.014138,2.0,0.0,1.0,6.0,2.0,534.0,178.0,2650.177,254.0,...,1.0,1.0,3.0,0.0,0.0,0.0,3.0,5.0,0.0,1.0
75%,0.71936,2.0,6.0,1.0,12.0,10.0,1280.0,956.0,111111.1,254.0,...,4.0,3.0,6.0,0.0,0.0,0.0,7.0,11.0,0.0,1.0
max,59.999989,130.0,12.0,6.0,10646.0,11018.0,14355770.0,14657530.0,1000000.0,255.0,...,59.0,38.0,63.0,2.0,2.0,16.0,60.0,62.0,1.0,1.0


In [13]:
# normalized_data = normalized_features.merge(df_labels, how='inner', left_on='id', right_on='id')
# normalized_data.head()

In [14]:
# Normalize the dataset
# z_scaler = StandardScaler()
# normalized_features = z_scaler.fit_transform(features)
# normalized_features[0:3]

In [15]:
# Select ultra_3_short based features and labels (column 43)
train_features= selected_data[Ultra_short_feature_list]
#labels= selected_data.iloc[:,selected_data.shape[1]-1:]
attack_labels=selected_data['label']
print('features shape is:', train_features.shape, 'labels shape is:', attack_labels.shape)
attack_labels.head(10)

features shape is: (82332, 9) labels shape is: (82332,)


id
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
Name: label, dtype: int64

In [16]:

#normalized_features = features.sub(features.mean(axis=0), axis=1)
normalized_features = train_features
normalized_features = normalized_features.sub(normalized_features.mean(axis=0), axis=1)
normalized_features = normalized_features.divide(normalized_features.std(axis=0), axis=1)
normalized_features.describe()


Unnamed: 0,ct_state_ttl,dbytes,dmean,dttl,dur,sbytes,state,sttl,synack
count,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0
mean,-2.263282e-14,-2.99001e-15,1.419867e-14,-2.205689e-13,3.476857e-15,-2.574517e-15,6.04764e-14,1.036637e-13,3.853426e-14
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.283066,-0.08736818,-0.4753677,-0.8203898,-0.2137285,-0.04643325,-0.7928335,-1.782698,-0.4129072
25%,-0.3460245,-0.08736818,-0.4753677,-0.8203898,-0.2137268,-0.0459089,-0.7928335,-1.171941,-0.4129072
50%,-0.3460245,-0.08619304,-0.2954824,-0.5718206,-0.2107271,-0.04346195,0.1268775,0.7194357,-0.4066831
75%,0.5910175,-0.08105676,-0.1196854,1.339591,-0.06101248,-0.03911571,0.1268775,0.7194357,0.3294059
max,4.339185,96.68024,5.657087,1.348162,12.52392,83.59119,4.725432,0.7292866,45.12877


In [17]:
# Separate data in train set and test set
df= pd.DataFrame(normalized_features)
# create training and testing vars
# Note: train_size + test_size < 1.0 means we are subsampling
# Use small numbers for slow classifiers, as KNN, Radius, SVC,...
X_train, X_test, y_train, y_test = train_test_split(df, attack_labels, train_size=0.8, test_size=0.2, random_state=1)
print('X_train, y_train:', X_train.shape, y_train.shape)
print('X_test, y_test:', X_test.shape, y_test.shape)

X_train, y_train: (65865, 9) (65865,)
X_test, y_test: (16467, 9) (16467,)


In [18]:
# Training, choose model by commenting/uncommenting clf= Random Forest
print('Training model...')
clf= RandomForestClassifier(n_jobs=-1, random_state=3, n_estimators=102)
#, max_features=0.8, min_samples_leaf=3, n_estimators=500, min_samples_split=3, random_state=10, verbose=1)
# clf = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, presort=False)
# clf= svm.SVC(kernel='rbf')
#clf= neighbors.KNeighborsClassifier(n_neighbors=1, algorithm='ball_tree', metric='manhattan')

trained_model= clf.fit(X_train, y_train)

# Predicting
print('Predicting...')
y_pred = clf.predict(X_test)
#print('Predited labels: \t', y_pred)
#print('True labels: \t\t', y_test)

results = confusion_matrix(y_test, y_pred)
print('\nConfusion matrix:\n', results)

print('Computing performance metrics')
print('\nAccuracy result:', accuracy_score(y_test, y_pred, normalize=True))


print("\nClassification report:")
print(classification_report(y_test,y_pred))


Training model...
Predicting...

Confusion matrix:
 [[6978  360]
 [ 661 8468]]
Computing performance metrics

Accuracy result: 0.9379972065342806

Classification report:
             precision    recall  f1-score   support

          0       0.91      0.95      0.93      7338
          1       0.96      0.93      0.94      9129

avg / total       0.94      0.94      0.94     16467



In [19]:
# Training, choose model by commenting/uncommenting clf= KNN = 1
print('Training model...')
clf= neighbors.KNeighborsClassifier(n_neighbors=1, algorithm='ball_tree', metric='manhattan')

trained_model= clf.fit(X_train, y_train)

# Predicting
print('Predicting...')
y_pred = clf.predict(X_test)
#print('Predited labels: \t', y_pred)
#print('True labels: \t\t', y_test)

results = confusion_matrix(y_test, y_pred)
print('\nConfusion matrix:\n', results)

print('Computing performance metrics')
print('\nAccuracy result:', accuracy_score(y_test, y_pred, normalize=True))


print("\nClassification report:")
print(classification_report(y_test,y_pred))

Training model...
Predicting...

Confusion matrix:
 [[6538  800]
 [ 894 8235]]
Computing performance metrics

Accuracy result: 0.897127588510354

Classification report:
             precision    recall  f1-score   support

          0       0.88      0.89      0.89      7338
          1       0.91      0.90      0.91      9129

avg / total       0.90      0.90      0.90     16467



In [20]:
# Training, choose model by commenting/uncommenting clf= KNN = 3
print('Training model...')
clf= neighbors.KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree', metric='manhattan')

trained_model= clf.fit(X_train, y_train)

# Predicting
print('Predicting...')
y_pred = clf.predict(X_test)
print('Predited labels: \t', y_pred)
#print('True labels: \t\t', y_test)

results = confusion_matrix(y_test, y_pred)
print('\nConfusion matrix:\n', results)

print('Computing performance metrics')
print('\nAccuracy result:', accuracy_score(y_test, y_pred, normalize=True))


print("\nClassification report:")
print(classification_report(y_test,y_pred))

Training model...
Predicting...
Predited labels: 	 [0 1 0 ... 1 0 1]

Confusion matrix:
 [[6637  701]
 [ 977 8152]]
Computing performance metrics

Accuracy result: 0.8980992287605514

Classification report:
             precision    recall  f1-score   support

          0       0.87      0.90      0.89      7338
          1       0.92      0.89      0.91      9129

avg / total       0.90      0.90      0.90     16467



In [21]:
# Training, choose model by commenting/uncommenting clf= KNN = 5
print('Training model...')
clf= neighbors.KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree', metric='manhattan')

trained_model= clf.fit(X_train, y_train)

# Predicting
print('Predicting...')
y_pred = clf.predict(X_test)
#print('Predited labels: \t', y_pred)
#print('True labels: \t\t', y_test)

results = confusion_matrix(y_test, y_pred)
print('\nConfusion matrix:\n', results)

print('Computing performance metrics')
print('\nAccuracy result:', accuracy_score(y_test, y_pred, normalize=True))


print("\nClassification report:")
print(classification_report(y_test,y_pred))

Training model...
Predicting...

Confusion matrix:
 [[6717  621]
 [1017 8112]]
Computing performance metrics

Accuracy result: 0.9005283293860448

Classification report:
             precision    recall  f1-score   support

          0       0.87      0.92      0.89      7338
          1       0.93      0.89      0.91      9129

avg / total       0.90      0.90      0.90     16467



In [25]:
# Training, choose model by commenting/uncommenting clf= GMM = 2
print('Training model...')
clf= GaussianMixture(n_components = 2, covariance_type = 'full')


trained_model= clf.fit(X_train, y_train)

# Predicting
print('Predicting...')
y_pred = clf.predict(X_test)
#print('Predited labels: \t', y_pred)
#print('True labels: \t\t', y_test)

print('Predited labels: \t', y_pred)
for i, c in enumerate(y_pred):
    if c != 0:
        y_pred[i]= 1
print('Predited labels: \t', y_pred)        
#print('True labels: \t\t', y_test)

results = confusion_matrix(y_test, y_pred)
print('\nConfusion matrix:\n', results)

print('Computing performance metrics')
print('\nAccuracy result:', accuracy_score(y_test, y_pred, normalize=True))


print("\nClassification report:")
print(classification_report(y_test,y_pred))

Training model...
Predicting...
Predited labels: 	 [1 1 0 ... 1 0 1]
Predited labels: 	 [1 1 0 ... 1 0 1]

Confusion matrix:
 [[6150 1188]
 [3131 5998]]
Computing performance metrics

Accuracy result: 0.737717859962349

Classification report:
             precision    recall  f1-score   support

          0       0.66      0.84      0.74      7338
          1       0.83      0.66      0.74      9129

avg / total       0.76      0.74      0.74     16467



In [26]:
# Training, choose model by commenting/uncommenting clf= GMM = 10
print('Training model...')
clf= GaussianMixture(n_components = 10, covariance_type = 'full')


trained_model= clf.fit(X_train, y_train)

# Predicting
print('Predicting...')
y_pred = clf.predict(X_test)
#print('Predited labels: \t', y_pred)
#print('True labels: \t\t', y_test)

print('Predited labels: \t', y_pred)
for i, c in enumerate(y_pred):
    if c != 0:
        y_pred[i]= 1
print('Predited labels: \t', y_pred)        
#print('True labels: \t\t', y_test)

results = confusion_matrix(y_test, y_pred)
print('\nConfusion matrix:\n', results)

print('Computing performance metrics')
print('\nAccuracy result:', accuracy_score(y_test, y_pred, normalize=True))


print("\nClassification report:")
print(classification_report(y_test,y_pred))

Training model...
Predicting...
Predited labels: 	 [2 2 0 ... 2 6 2]
Predited labels: 	 [1 1 0 ... 1 1 1]

Confusion matrix:
 [[2323 5015]
 [2251 6878]]
Computing performance metrics

Accuracy result: 0.5587538713791219

Classification report:
             precision    recall  f1-score   support

          0       0.51      0.32      0.39      7338
          1       0.58      0.75      0.65      9129

avg / total       0.55      0.56      0.54     16467



In [27]:
# Training, choose model by commenting/uncommenting clf= K-Means = 2
from sklearn import cluster
print('Training model...')
clf= cluster.KMeans(n_clusters = 2)

trained_model= clf.fit(X_train, y_train)

# Predicting
print('Predicting...')
y_pred = clf.predict(X_test)
#print('Predited labels: \t', y_pred)
#print('True labels: \t\t', y_test)

print('Predited labels: \t', y_pred)
for i, c in enumerate(y_pred):
    if c != 0:
        y_pred[i]= 1
print('Predited labels: \t', y_pred)        
#print('True labels: \t\t', y_test)

results = confusion_matrix(y_test, y_pred)
print('\nConfusion matrix:\n', results)

print('Computing performance metrics')
print('\nAccuracy result:', accuracy_score(y_test, y_pred, normalize=True))


print("\nClassification report:")
print(classification_report(y_test,y_pred))

Training model...
Predicting...
Predited labels: 	 [0 0 1 ... 0 1 0]
Predited labels: 	 [0 0 1 ... 0 1 0]

Confusion matrix:
 [[1189 6149]
 [6004 3125]]
Computing performance metrics

Accuracy result: 0.26197850245946436

Classification report:
             precision    recall  f1-score   support

          0       0.17      0.16      0.16      7338
          1       0.34      0.34      0.34      9129

avg / total       0.26      0.26      0.26     16467



In [29]:
# Training, choose model by commenting/uncommenting clf= K-Means = 10
from sklearn import cluster
print('Training model...')
clf= cluster.KMeans(n_clusters = 10)

trained_model= clf.fit(X_train, y_train)

# Predicting
print('Predicting...')
y_pred = clf.predict(X_test)
#print('Predited labels: \t', y_pred)
#print('True labels: \t\t', y_test)

print('Predited labels: \t', y_pred)
for i, c in enumerate(y_pred):
    if c != 0:
        y_pred[i]= 1
print('Predited labels: \t', y_pred)        
#print('True labels: \t\t', y_test)

results = confusion_matrix(y_test, y_pred)
print('\nConfusion matrix:\n', results)

print('Computing performance metrics')
print('\nAccuracy result:', accuracy_score(y_test, y_pred, normalize=True))


print("\nClassification report:")
print(classification_report(y_test,y_pred))

Training model...
Predicting...
Predited labels: 	 [0 0 2 ... 0 3 0]
Predited labels: 	 [0 0 1 ... 0 1 0]

Confusion matrix:
 [[1001 6337]
 [5936 3193]]
Computing performance metrics

Accuracy result: 0.25469120058298417

Classification report:
             precision    recall  f1-score   support

          0       0.14      0.14      0.14      7338
          1       0.34      0.35      0.34      9129

avg / total       0.25      0.25      0.25     16467



In [70]:
train_features.head()

Unnamed: 0_level_0,ct_state_ttl,dbytes,dmean,dttl,dur,sbytes,state,sttl,synack
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,2,0,0,0,1.1e-05,496,0,254,0.0
2,2,0,0,0,8e-06,1762,0,254,0.0
3,2,0,0,0,5e-06,1068,0,254,0.0
4,2,0,0,0,6e-06,900,0,254,0.0
5,2,0,0,0,1e-05,2126,0,254,0.0


In [33]:
from sklearn.ensemble import IsolationForest

# Training, choose model by commenting/uncommenting clf= Random Isolation Forest

print('Training model...')
clf = IsolationForest(random_state=42)

trained_model= clf.fit(X_train, y_train)

# Predicting
print('Predicting...')
y_pred = clf.predict(X_test)
#print('Predited labels: \t', y_pred)
#print('True labels: \t\t', y_test)

print('Predited labels: \t', y_pred)
for i, c in enumerate(y_pred):
    if c > 0:
        y_pred[i]= 1
    else:
        y_pred[i]= 0
print('Predited labels: \t', y_pred)        
#print('True labels: \t\t', y_test)

results = confusion_matrix(y_test, y_pred)
print('\nConfusion matrix:\n', results)

print('Computing performance metrics')
print('\nAccuracy result:', accuracy_score(y_test, y_pred, normalize=True))


print("\nClassification report:")
print(classification_report(y_test,y_pred))

Training model...


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


Predicting...
Predited labels: 	 [1 1 1 ... 1 1 1]
Predited labels: 	 [1 1 1 ... 1 1 1]

Confusion matrix:
 [[1141 6197]
 [ 546 8583]]
Computing performance metrics

Accuracy result: 0.5905143620574482

Classification report:
             precision    recall  f1-score   support

          0       0.68      0.16      0.25      7338
          1       0.58      0.94      0.72      9129

avg / total       0.62      0.59      0.51     16467



In [31]:
#  clf= Class MLPClassifier implements a multi-layer perceptron (MLP) 
# algorithm that trains using Backpropagation.
from sklearn.neural_network import MLPClassifier

print('Training model...')
clf = MLPClassifier(solver='adam', random_state=42)

trained_model= clf.fit(X_train, y_train)

# Predicting
print('Predicting...')
y_pred = clf.predict(X_test)
print('Predited labels: \t', y_pred)
#print('True labels: \t\t', y_test)

results = confusion_matrix(y_test, y_pred)
print('\nConfusion matrix:\n', results)

print('Computing performance metrics')
print('\nAccuracy result:', accuracy_score(y_test, y_pred, normalize=True))


print("\nClassification report:")
print(classification_report(y_test,y_pred))


Training model...
Predicting...
Predited labels: 	 [1 1 0 ... 1 0 1]

Confusion matrix:
 [[6287 1051]
 [ 938 8191]]
Computing performance metrics

Accuracy result: 0.8792129713973401

Classification report:
             precision    recall  f1-score   support

          0       0.87      0.86      0.86      7338
          1       0.89      0.90      0.89      9129

avg / total       0.88      0.88      0.88     16467



In [32]:
# Training, choose model by commenting/uncommenting clf= SVC - rbf kernel
print('Training model...')
# clf= RandomForestClassifier(n_jobs=-1, random_state=3, n_estimators=102)
#, max_features=0.8, min_samples_leaf=3, n_estimators=500, min_samples_split=3, random_state=10, verbose=1)
# clf = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, presort=False)
clf= svm.SVC(kernel='rbf')
#clf= neighbors.KNeighborsClassifier(n_neighbors=1, algorithm='ball_tree', metric='manhattan')

trained_model= clf.fit(X_train, y_train)

# Predicting
print('Predicting...')
y_pred = clf.predict(X_test)
#print('Predited labels: \t', y_pred)
#print('True labels: \t\t', y_test)

results = confusion_matrix(y_test, y_pred)
print('\nConfusion matrix:\n', results)

print('Computing performance metrics')
print('\nAccuracy result:', accuracy_score(y_test, y_pred, normalize=True))


print("\nClassification report:")
print(classification_report(y_test,y_pred))


Training model...
Predicting...

Confusion matrix:
 [[6366  972]
 [1260 7869]]
Computing performance metrics

Accuracy result: 0.8644561850974677

Classification report:
             precision    recall  f1-score   support

          0       0.83      0.87      0.85      7338
          1       0.89      0.86      0.88      9129

avg / total       0.87      0.86      0.86     16467

