In [1]:
import math, time, random, datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('seaborn')
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
#import missingno
import pickle

from sklearn import metrics

### The dataset is being loaded here

In [2]:
dataSet_location = "UNSW_NB15_training-set.csv"
test_dataSet = "UNSW_NB15_testing-set.csv"

data_train = pd.read_csv(dataSet_location)
data_test  = pd.read_csv(test_dataSet)

In [3]:
def EDA_data(data):
    
    #taking the categorical data types of the columns
    cols_cat = data.select_dtypes('object').columns
    
    #then the numeric data types for each column data
    cols_numeric = data._get_numeric_data().columns 
    
    #cleaning the data 
    #data['service'].unique() 
    
    #remove the - and None values of the data set to 0
    data['service']= np.where(data['service'] == '-', 'None', data['service'])
    
    #print(data_train['service'].unique())
    #list of the data columns
    cols = data.columns
    
    #iterating over the columns list
    for col in cols:
        data[col] = np.where(data[col] == '-', 'None', data[col])
    
    #now removing the axis and the rest of the features
    data = data.drop(['id'], axis=1)
    
    data.drop(['attack_cat'], axis=1, inplace=True)
    
    #one hot encoding of the data
    cols_cat = cols_cat.drop(['attack_cat'])
    
    #using dummy columns for the data
    data_encoded = pd.get_dummies(data,columns=cols_cat)
    
    #now we will normalize the data so that the model is run smoothly
    cols_numeric = list(cols_numeric)
    
    cols_numeric.remove('label')
    cols_numeric.remove('id')
    
    #changing the data type to float, since they are one hot encoded it will be 0.0 or 1.0
    data_encoded[cols_numeric] = data_encoded[cols_numeric].astype('float') 
    
    data_encoded[cols_numeric] = (data_encoded[cols_numeric] - np.min(data_encoded[cols_numeric])) / np.std(data_encoded[cols_numeric])
    
    return data_encoded
    
    
    
        
    

In [4]:
def common_parameters_model(train, test):
    
    #find the set for test columns
    col_test = test.columns.to_list()
    set_test = set(col_test)
    
    #find the set for the train columns
    col_train = data_clean.columns.to_list()
    set_train = set(data_clean)
    
    #common parameters
    common_set = set_test.intersection(set_train)
    
    return common_set
    
    

#### The cleaning of the data is being done here

In [5]:
data_clean = EDA_data(data_train)
test_clean = EDA_data(data_test)

In [6]:
common_cols = common_parameters_model(data_clean, test_clean)

In [7]:
#now both the data sets need to have the same columns
data_clean = data_clean[common_cols]
test_clean = test_clean[common_cols]


In [8]:
from sklearn import model_selection
from sklearn import metrics
X = data_clean.drop('label', axis=1)
Y = data_clean['label']

In [9]:
#global X

In [10]:
def fit_algo(algo, x, y, cv,flag=0):
    #Fit the model
    model = algo.fit(x, y)
    
    #Check its score
    acc = round(model.score(x, y) *100, 2)
    y_pred = model_selection.cross_val_predict(algo, x, y, cv=cv, n_jobs = -1)
    
    acc_cv = round(metrics.accuracy_score(Y,y_pred)*100, 2)

    return y_pred, acc, acc_cv, model

In [11]:
def predict_algo(model, X, Y, cv):
    
    y_pred = model.predict(X)
    
    acc = metrics.accuracy_score(Y ,y_pred)
    
    #acc = round(model.score(Y,y_pred)*100,2)
    
    y_pred_cv = model_selection.cross_val_predict(model, X, Y, cv=cv, n_jobs = -1)
    
    acc_cv = round(metrics.accuracy_score(Y,y_pred_cv) * 100 ,2)
    
    return acc, acc_cv

### Decision Tree was used for classification

In [12]:
from sklearn.tree import DecisionTreeClassifier
start_time = time.time()
pred_now, acc_dt, acc_cv_dt, dt = fit_algo(DecisionTreeClassifier(random_state = 1)
                                        , X, Y, 10)

dt_time = (time.time() - start_time)

print("Accuracy: %s" % acc_dt)
print("Accuracy of CV: %s" % acc_cv_dt)
print("Execution time: %s" % dt_time)

Accuracy: 99.99
Accuracy of CV: 94.14
Execution time: 11.25972318649292


##### DT predictions

In [13]:
X_test = test_clean.drop('label', axis=1)
Y_test = test_clean['label']

acc, acc_cv_dt = predict_algo(dt, X_test, Y_test, 10)

dt_time = (time.time() - start_time)

print("Accuracy: %s" % acc)
print("Accuracy of CV: %s" % acc_cv_dt)
print("Execution time: %s" % dt_time)

Accuracy: 0.7450738846020041
Accuracy of CV: 92.34
Execution time: 25.914119482040405


In [14]:
from sklearn.linear_model import LogisticRegression
start_time = time.time()
pred_now, acc_lr, acc_cv_lr, lr = fit_algo(LogisticRegression(C=0.1)
                                        , X, Y, 10)

lr_time = (time.time() - start_time)

print("Accuracy: %s" % acc_lr)
print("Accuracy of CV: %s" % acc_cv_lr)
print("Execution time: %s" % lr_time)

Accuracy: 91.15
Accuracy of CV: 89.33
Execution time: 21.75216317176819


In [15]:
acc, acc_cv_lr = predict_algo(lr, X_test, Y_test, 10)

lr_time = (time.time() - start_time)

print("Accuracy: %s" % acc)
print("Accuracy of CV: %s" % acc_cv_lr)
print("Execution time: %s" % lr_time)

Accuracy: 0.8780718713820498
Accuracy of CV: 92.86
Execution time: 116.65996479988098


In [16]:
from sklearn.ensemble import RandomForestClassifier
start_time = time.time()
pred_now, acc_rf2, acc_cv_rf2, rf2 = fit_algo(RandomForestClassifier(n_estimators = 100, criterion='entropy')
                                        , X, Y, 10)

rf2_time = (time.time() - start_time)

print("Accuracy: %s" % acc_rf2)
print("Accuracy of CV: %s" % acc_cv_rf2)
print("Execution time: %s" % rf2_time)

Accuracy: 99.99
Accuracy of CV: 96.0
Execution time: 78.40587115287781


In [17]:
acc, acc_cv_lr = predict_algo(rf2, X_test, Y_test, 10)

rf_time = (time.time() - start_time)

print("Accuracy: %s" % acc)
print("Accuracy of CV: %s" % acc_cv_lr)
print("Execution time: %s" % lr_time)

Accuracy: 0.8978789900821827
Accuracy of CV: 92.85
Execution time: 116.65996479988098


In [20]:
from sklearn.neural_network import MLPClassifier

start_time = time.time()
pred_now, acc_nn, acc_cv_nn, nn = fit_algo(MLPClassifier(hidden_layer_sizes = (40,), activation='relu', solver='adam')
                                        , X, Y, 5)

nn_time = (time.time() - start_time)

print("Accuracy: %s" % acc_nn)
print("Accuracy of CV: %s" % acc_cv_nn)
print("Execution time: %s" % nn_time)

Accuracy: 96.89
Accuracy of CV: 92.98
Execution time: 473.3488404750824


In [21]:
acc, acc_cv_nn = predict_algo(nn, X_test, Y_test, 10)

nn_time = (time.time() - start_time)

print("Accuracy: %s" % acc)
print("Accuracy of CV: %s" % acc_cv_nn)
print("Execution time: %s" % nn_time)

Accuracy: 0.8971489839797879
Accuracy of CV: 92.32
Execution time: 1358.215576171875


In [22]:
from sklearn.naive_bayes import GaussianNB
start_time = time.time()

pred_now, acc_gnb, acc_cv_gnb, gnb= fit_algo(GaussianNB()
                                        ,X,Y,5)

gnb_time = (time.time() - start_time)

print("Accuracy: %s" % acc_gnb)
print("Accuracy of CV: %s" % acc_cv_gnb)
print("Execution time: %s" % gnb_time)

Accuracy: 65.28
Accuracy of CV: 65.54
Execution time: 11.431057214736938


In [23]:
acc, acc_cv_gnb = predict_algo(gnb, X_test, Y_test, 10)

gnb_time = (time.time() - start_time)

print("Accuracy: %s" % acc)
print("Accuracy of CV: %s" % acc_cv_gnb)
print("Execution time: %s" % gnb_time)

Accuracy: 0.6264421897901803
Accuracy of CV: 48.1
Execution time: 76.52244758605957


In [24]:
from sklearn.ensemble import GradientBoostingClassifier
start_time = time.time()

pred_now, acc_gbt, acc_cv_gbt, gbt= fit_algo(GradientBoostingClassifier()
                                        , X, Y, 10)

gbt_time = (time.time() - start_time)

print("Accuracy: %s" % acc_gbt)
print("Accuracy of CV: %s" % acc_cv_gbt)
print("Execution time: %s" % gbt_time)

Accuracy: 96.04
Accuracy of CV: 94.82
Execution time: 232.19419288635254


In [25]:
acc, acc_cv_gbt = predict_algo(gbt, X_test, Y_test, 10)

gbt_time = (time.time() - start_time)

print("Accuracy: %s" % acc)
print("Accuracy of CV: %s" % acc_cv_gbt)
print("Execution time: %s" % gbt_time)

Accuracy: 0.8868547573014868
Accuracy of CV: 92.41
Execution time: 687.0232815742493
