In [1]:
# Section that will be used to define all the required package imports
import pandas as pd
import numpy as np
import datetime as dt
import os
from dateutil.relativedelta import relativedelta
import scipy.stats as stats
import math

import pandas_profiling
from matplotlib.backends.backend_pdf import PdfPages

# For plotting bar and charts
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import f_regression
import statsmodels.formula.api as sm
import seaborn as sns
import itertools

from patsy import dmatrices


In [20]:
np.set_printoptions(formatter={'all':lambda x: str(x)})

In [2]:
nid_train = pd.read_csv("NSL_Dataset\\Train.csv",header = 0)
nid_train.columns = ["duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins","logged_in","num_compromised","root_shell","su_attempted","num_root","num_file_creations","num_shells","num_access_files","num_outbound_cmds","is_host_login","is_guest_login","count","srv_count","serror_rate", "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate", "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count","dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate","dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate","attack","last_flag"]
nid_train.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,last_flag
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [4]:
nid_test = pd.read_csv("NSL_Dataset\\Test.csv",header = 0,low_memory = False)
nid_test.columns = ["duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins","logged_in","num_compromised","root_shell","su_attempted","num_root","num_file_creations","num_shells","num_access_files","num_outbound_cmds","is_host_login","is_guest_login","count","srv_count","serror_rate", "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate", "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count","dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate","dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate","attack","last_flag"]

# Binomial Classification: Whether a connection is an attack or not?
### Using Logistic Regression to acheive the results

In [3]:
# Function to classify the attack variable in binary categorical Y variable
def binary_classification(x):
    if x == "normal":
        return 0
    else:
        return 1

In [4]:
nid_train["Attack_Binomial"] = nid_train.apply(lambda x: binary_classification(x["attack"]),axis =1)

In [7]:
nid_train.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,last_flag,Attack_Binomial
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20,0
1,0,udp,other,SF,146,0,0,0,0,0,...,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15,0
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19,1
3,0,tcp,http,SF,232,8153,0,0,0,0,...,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21,0
4,0,tcp,http,SF,199,420,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21,0


In [8]:
nid_train.drop(['attack'],axis = 1,inplace=True)
nid_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 43 columns):
duration                       125973 non-null int64
protocol_type                  125973 non-null object
service                        125973 non-null object
flag                           125973 non-null object
src_bytes                      125973 non-null int64
dst_bytes                      125973 non-null int64
land                           125973 non-null int64
wrong_fragment                 125973 non-null int64
urgent                         125973 non-null int64
hot                            125973 non-null int64
num_failed_logins              125973 non-null int64
logged_in                      125973 non-null int64
num_compromised                125973 non-null int64
root_shell                     125973 non-null int64
su_attempted                   125973 non-null int64
num_root                       125973 non-null int64
num_file_creations             125973 

In [10]:
output = pandas_profiling.ProfileReport(nid_train)
output.to_file(output_file='NID_Profiling.html')

## Profiling output : Detailed Analysis

1. Ignoring warning for duplicate rows, since its a valid scenario and we do not have a unique identifier in our rows.
2. Most of the columns marked as Skewed are valid, since 0 entries imply that that particular network feature wasn't utuilised completely.
3. High cardinality column Service will be processed via IV and WoE to determine the most relevant categories. 
4. For the columns marked as rejected, we will determine the final columns after checking on the correlation matrix
5. **num_outbound_cmds** column is dropped henceforth since it has only 1 value (zero)

> We will also be avoiding Outlier capping, since the data skewness is high in columns

In [9]:
# copying the dataframes in case need to start over :D
nid_train_1 = nid_train.copy()
nid_test_1 = nid_test.copy()

In [10]:
# Only numeric list
num_list = ['duration','src_bytes','dst_bytes','wrong_fragment','urgent','hot','num_failed_logins','num_compromised','num_root','num_file_creations','num_shells','num_access_files','count','srv_count','serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate','srv_diff_host_rate','dst_host_count','dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate','dst_host_srv_rerror_rate']
nid_train_s = nid_train.reindex(columns=num_list)

In [19]:
# Further checking for correlation between the Continuous variables
corr = nid_train_s.corr()
corr.to_csv("Correlation_matrix.csv")

### Utilizing Profile report to drop highlighy correlated columns

#### Report details for "Rejected" variables

<img src="NID_rejected_var.png" alt="Alt text that describes the graphic" title="Screen-grab of Rejected variables" />

#### Following are the list of columns we are rejecting:
1. **dst_host_srv_rerror_rate** since we have <font color = blue>rerror_rate</font>, which is giving us a similar measure for all the connections to the same IP address
2. **dst_host_srv_serror_rate** since we have <font color = blue>serror_rate</font>, which is giving us a similar measure for all the connections to the same IP address
3. **num_compromised** since <font color = blue>num_root</font>, gives a more rounded measure of the connections attempted as the root on the connection
4. **srv_rerror_rate** since <font color = blue>rerror_rate</font> gives similar measure for same host IP address
5. **srv_serror_rate** since <font color = blue>serror_rate</font> gives similar measure for same host IP address
<br><br>
Other columns dropped as per Correlation matrix:<br>
a. same_srv_rate<br>
b. dst_host_same_srv_rate<br>
c. dst_host_serror_rate<br>
d. dst_host_rerror_rate



In [11]:
num_list = ['duration','src_bytes','dst_bytes','wrong_fragment','urgent','hot','num_failed_logins','num_root','num_file_creations','num_shells','num_access_files','count','srv_count','serror_rate','rerror_rate','diff_srv_rate','srv_diff_host_rate','dst_host_count','dst_host_srv_count','dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate']
cat_list = ['last_flag','protocol_type','service','flag','land','logged_in','root_shell','su_attempted','is_host_login','is_guest_login','Attack_Binomial']

print(len(num_list))
print(len(cat_list))

22
11


In [6]:
# List with negligible collinearity values + categorical Y column (Attack_Binomial)
num_list = ['Attack_Binomial','duration','src_bytes','dst_bytes','wrong_fragment','urgent','hot','num_failed_logins','num_root','num_file_creations','num_shells','num_access_files','count','srv_count','serror_rate','rerror_rate','diff_srv_rate','srv_diff_host_rate','dst_host_count','dst_host_srv_count','dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate']
nid_train_num = nid_train.reindex(columns=num_list)
nid_train_num.head()

Unnamed: 0,Attack_Binomial,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_root,num_file_creations,...,srv_count,serror_rate,rerror_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate
0,0,0,491,0,0,0,0,0,0,0,...,2,0.0,0.0,0.0,0.0,150,25,0.03,0.17,0.0
1,0,0,146,0,0,0,0,0,0,0,...,1,0.0,0.0,0.15,0.0,255,1,0.6,0.88,0.0
2,1,0,0,0,0,0,0,0,0,0,...,6,1.0,0.0,0.07,0.0,255,26,0.05,0.0,0.0
3,0,0,232,8153,0,0,0,0,0,0,...,5,0.2,0.0,0.0,0.0,30,255,0.0,0.03,0.04
4,0,0,199,420,0,0,0,0,0,0,...,32,0.0,0.0,0.0,0.09,255,255,0.0,0.0,0.0


### ANOVA test - To further narrow down the continuous varibale list
<br>
We have 22 continuous variables as of now. To further narrow our scope, we will perform ANOVA.<br>
We will keep the variables with higher F-Values.

In [7]:
p_value = []
f_value = []
for i in nid_train_num.columns.difference(['Attack_Binomial']):
    attack0 = nid_train_num[i].loc[nid_train_num.Attack_Binomial==0]
    attack1 = nid_train_num[i].loc[nid_train_num.Attack_Binomial==1]
    anova = stats.f_oneway(attack0,attack1)
    p_value.append(anova.pvalue)
    f_value.append(anova.statistic)
f_value = pd.Series(f_value,name="F_Value")
p_value = pd.Series(p_value, name = "p_value")
column_names = pd.Series(nid_train_num.columns.difference(['Attack_Binomial']),name="Var Names")
anova_test = pd.concat([column_names,f_value,p_value],axis = 1)
anova_test = anova_test.sort_values(by=['F_Value'])

In [15]:
attack0.count()

67343

In [15]:
anova_test

Unnamed: 0,Var Names,F_Value,p_value
18,srv_count,0.074827,0.7844353
20,urgent,0.978494,0.3225729
11,num_failed_logins,1.776289,0.1826081
2,dst_bytes,2.13576,0.1439016
17,src_bytes,4.416958,0.0355854
14,num_shells,11.302973,0.000774061
13,num_root,16.524468,4.805543e-05
9,hot,21.56687,3.420481e-06
12,num_file_creations,57.020613,4.341562e-14
10,num_access_files,169.902604,8.233575000000001e-39


### Columns dropped after ANOVA test
<font color =teal>Columns dropped where p-value was greater than 0.05 </font>
1. srv_count
2. urgent
3. num_failed_logins
4. dst_bytes

<font color =teal>Columns dropped where f-value less than 50  </font>
1. src_bytes
2. num_shells
3. num_root
4. hot


### Similar reduction Check on Categorical Columns
<br>
Dropping the column "service" due to <b> High cardinality</b>

In [16]:
# list of all the categorical variables in NID Train dataset
cat_list = ['protocol_type', 'flag','Attack_Binomial','land', 'logged_in', 'root_shell', 'su_attempted', 'is_host_login', 'is_guest_login']
nid_train_cat = nid_train.reindex(columns=cat_list)
nid_train_cat.to_csv("Categ_data.csv")
nid_train_cat.head()

Unnamed: 0,protocol_type,flag,Attack_Binomial,land,logged_in,root_shell,su_attempted,is_host_login,is_guest_login
0,tcp,SF,0,0,0,0,0,0,0
1,udp,SF,0,0,0,0,0,0,0
2,tcp,S0,1,0,0,0,0,0,0
3,tcp,SF,0,0,1,0,0,0,0
4,tcp,SF,0,0,1,0,0,0,0


In [17]:
#Dummy code the protocol type and flag columns
protocol_type_dummy= pd.get_dummies(nid_train_cat['protocol_type'], prefix='protocol', drop_first=True)
flag_dummy = pd.get_dummies(nid_train_cat['flag'], prefix='flag', drop_first=True)
nid_train_cat = pd.concat([nid_train_cat, protocol_type_dummy,flag_dummy], axis=1)
nid_train_cat.drop( 'protocol_type', axis = 1, inplace = True )
nid_train_cat.drop( 'flag', axis = 1, inplace = True )
nid_train_cat.head()

Unnamed: 0,Attack_Binomial,land,logged_in,root_shell,su_attempted,is_host_login,is_guest_login,protocol_tcp,protocol_udp,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
3,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0


### Conducting Chi Square Contingency test to reduce the number of categorical variables

In [18]:
p_value = []
chi2_score = []
for i in nid_train_cat.columns.difference(['Attack_Binomial']):
    xtab = pd.crosstab(nid_train_cat.Attack_Binomial, nid_train_cat[i], margins = True)
    x2test = stats.chi2_contingency(observed= xtab)
    p_value.append(x2test[1])
    chi2_score.append(x2test[0])
chi2_score = pd.Series(chi2_score,name="chi2_score")
p_value = pd.Series(p_value, name = "p_value")
cat_var_names = pd.Series(nid_train_cat.columns.difference(['Attack_Binomial']),name="Var Names")
Chi2_test = pd.concat([cat_var_names,chi2_score,p_value],axis = 1)
Chi2_test = Chi2_test.sort_values(by=['p_value'])

In [19]:
Chi2_test

Unnamed: 0,Var Names,chi2_score,p_value
0,flag_REJ,4309.05363,0.0
15,protocol_udp,5942.029422,0.0
13,logged_in,60005.430809,0.0
4,flag_S0,53257.319016,0.0
8,flag_SF,72052.592498,0.0
3,flag_RSTR,2231.667416,0.0
1,flag_RSTO,988.706851,9.998856000000001e-213
14,protocol_tcp,355.546994,1.1123830000000002e-75
5,flag_S1,303.867102,1.587084e-64
9,flag_SH,303.390846,2.010671e-64


### Columns dropped after Chi Square Contingency test
<font color =teal>Columns dropped where p-value was greater than 0.05 </font>
1. land
2. is_host_login

<font color =teal>Columns dropped where chi square score is less than 50  </font>
1. flag_S3

In [20]:
#final list of columns to work with
final_list_wo_dummy = ['Attack_Binomial','duration','wrong_fragment','num_file_creations','num_access_files','count','serror_rate','rerror_rate','diff_srv_rate','srv_diff_host_rate','dst_host_count','dst_host_srv_count','dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','logged_in', 'root_shell', 'su_attempted','is_guest_login']
protocol_type_dummy= pd.get_dummies(nid_train['protocol_type'], prefix='protocol', drop_first=True)
flag_dummy = pd.get_dummies(nid_train['flag'], prefix='flag', drop_first=True)
nid_train_fin = nid_train.reindex(columns=final_list_wo_dummy)
nid_train_fin = pd.concat([nid_train_fin, protocol_type_dummy,flag_dummy], axis=1)

nid_train_fin.drop( 'flag_S3', axis = 1, inplace = True )



In [21]:
nid_train_fin.head()

Unnamed: 0,Attack_Binomial,duration,wrong_fragment,num_file_creations,num_access_files,count,serror_rate,rerror_rate,diff_srv_rate,srv_diff_host_rate,...,protocol_udp,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_SF,flag_SH
0,0,0,0,0,0,2,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,13,0.0,0.0,0.15,0.0,...,1,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0,123,1.0,0.0,0.07,0.0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,5,0.2,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,30,0.0,0.0,0.0,0.09,...,0,0,0,0,0,0,0,0,1,0


In [23]:
#Performing F Regression Logit or univariate Logistic regression for all variables
somersd_df = pd.DataFrame()
for num_variable in nid_train_fin.columns.difference(['Attack_Binomial']):
    logreg = sm.logit(formula = str('Attack_Binomial ~ ')+str(num_variable), data=nid_train_fin)
    result = logreg.fit()
    #summ = result.summary2()
    y_score = pd.DataFrame(result.predict())
    y_score.columns = ['Score']
    somers_d = 2*metrics.roc_auc_score(nid_train_fin['Attack_Binomial'],y_score) - 1
    auroc = metrics.roc_auc_score(nid_train_fin['Attack_Binomial'],y_score)
    temp = pd.DataFrame([num_variable,auroc]).T
    temp.columns = ['Variable Name', 'AU_ROC_Curve']
    somersd_df = pd.concat([somersd_df, temp], axis=0)

somersd_df.sort_values(["AU_ROC_Curve"],ascending=False)
somersd_df

Optimization terminated successfully.
         Current function value: 0.474707
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.666611
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.615406
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.656414
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.686483
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.380619
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.688793
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.689516
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.673099
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.686488
  



Optimization terminated successfully.
         Current function value: 0.433831
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.689106
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.690350
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.366053
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.689201
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.689943
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.410201
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.689272
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.690326
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.689334
  



Unnamed: 0,Variable Name,AU_ROC_Curve
0,count,0.820805
0,diff_srv_rate,0.844736
0,dst_host_count,0.706789
0,dst_host_diff_srv_rate,0.826678
0,dst_host_same_src_port_rate,0.370419
0,dst_host_srv_count,0.892478
0,dst_host_srv_diff_host_rate,0.318157
0,duration,0.459189
0,flag_REJ,0.552835
0,flag_RSTO,0.509827


In [24]:
somersd_df.sort_values(["AU_ROC_Curve"],ascending=False,inplace=True)
somersd_df

Unnamed: 0,Variable Name,AU_ROC_Curve
0,dst_host_srv_count,0.892478
0,flag_SF,0.872156
0,diff_srv_rate,0.844736
0,logged_in,0.838309
0,dst_host_diff_srv_rate,0.826678
0,count,0.820805
0,serror_rate,0.806205
0,flag_S0,0.791564
0,dst_host_count,0.706789
0,srv_diff_host_rate,0.636594


In [37]:
#Variables chosen from Somer'sD test
var_list_SD = ['dst_host_srv_count','flag_SF','diff_srv_rate','logged_in','dst_host_diff_srv_rate','count','serror_rate','flag_S0','dst_host_count','srv_diff_host_rate']

### VIF Assessment to choose a second set of variable for model 2

In [26]:
vif_df = pd.concat([nid_train_fin[nid_train_fin.columns.difference(['Attack_Binomial'])],nid_train_fin['Attack_Binomial']], axis=1)
features = "+".join(nid_train_fin.columns.difference(['Attack_Binomial']))
vif_df.head()

Unnamed: 0,count,diff_srv_rate,dst_host_count,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_count,dst_host_srv_diff_host_rate,duration,flag_REJ,flag_RSTO,...,num_file_creations,protocol_tcp,protocol_udp,rerror_rate,root_shell,serror_rate,srv_diff_host_rate,su_attempted,wrong_fragment,Attack_Binomial
0,2,0.0,150,0.03,0.17,25,0.0,0,0,0,...,0,1,0,0.0,0,0.0,0.0,0,0,0
1,13,0.15,255,0.6,0.88,1,0.0,0,0,0,...,0,0,1,0.0,0,0.0,0.0,0,0,0
2,123,0.07,255,0.05,0.0,26,0.0,0,0,0,...,0,1,0,0.0,0,1.0,0.0,0,0,1
3,5,0.0,30,0.0,0.03,255,0.04,0,0,0,...,0,1,0,0.0,0,0.2,0.0,0,0,0
4,30,0.0,255,0.0,0.0,255,0.0,0,0,0,...,0,1,0,0.0,0,0.0,0.09,0,0,0


In [27]:
features

'count+diff_srv_rate+dst_host_count+dst_host_diff_srv_rate+dst_host_same_src_port_rate+dst_host_srv_count+dst_host_srv_diff_host_rate+duration+flag_REJ+flag_RSTO+flag_RSTOS0+flag_RSTR+flag_S0+flag_S1+flag_S2+flag_SF+flag_SH+is_guest_login+logged_in+num_access_files+num_file_creations+protocol_tcp+protocol_udp+rerror_rate+root_shell+serror_rate+srv_diff_host_rate+su_attempted+wrong_fragment'

In [28]:
#Calculating VIF(Variance Inflation Factor) for each variable
a,b = dmatrices(formula_like='Attack_Binomial ~ '+ features, data = vif_df, return_type='dataframe')

vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(b.values, i) for i in range(b.shape[1])]
vif["features"] = b.columns

display(vif)

Unnamed: 0,VIF Factor,features
0,1430.408157,Intercept
1,2.064932,count
2,1.574347,diff_srv_rate
3,1.981222,dst_host_count
4,2.217628,dst_host_diff_srv_rate
5,2.351576,dst_host_same_src_port_rate
6,2.8562,dst_host_srv_count
7,1.840381,dst_host_srv_diff_host_rate
8,1.517832,duration
9,142.099259,flag_REJ


> Dropping variables with VIF < 4

In [29]:
#Variables chosen from VIF test
var_list_vif = ['dst_host_srv_count','flag_S2','dst_host_same_src_port_rate','dst_host_diff_srv_rate','flag_RSTOS0','count','su_attempted','dst_host_count','dst_host_srv_diff_host_rate','root_shell','diff_srv_rate','duration','num_access_files','srv_diff_host_rate','wrong_fragment','is_guest_login','num_file_creations']

In [42]:
protocol_type_dummy= pd.get_dummies(nid_train['protocol_type'], prefix='protocol', drop_first=True)
flag_dummy = pd.get_dummies(nid_train['flag'], prefix='flag', drop_first=True)
nid_train_df = nid_train.copy()
nid_train_df = pd.concat([nid_train_df, protocol_type_dummy,flag_dummy], axis=1)

nid_train_df.drop( 'protocol_type', axis = 1, inplace = True )
nid_train_df.drop( 'flag', axis = 1, inplace = True )


#Train and Test split
train, test = train_test_split(nid_train_df, test_size=0.2, random_state=42)

## Creating Logistic Regression Model No. 1
### Based on Somer's D

In [38]:
var_list_SD = "+".join(var_list_SD)
var_list_SD

'dst_host_srv_count+flag_SF+diff_srv_rate+logged_in+dst_host_diff_srv_rate+count+serror_rate+flag_S0+dst_host_count+srv_diff_host_rate'

In [43]:
sd_logreg = sm.logit(formula='Attack_Binomial ~ '+var_list_SD, data=train)
sd_result = sd_logreg.fit()

Optimization terminated successfully.
         Current function value: 0.197118
         Iterations 9


In [44]:
 print(sd_result.summary2())

                            Results: Logit
Model:                Logit              Pseudo R-squared:   0.715     
Dependent Variable:   Attack_Binomial    AIC:                39752.2971
Date:                 2020-04-04 22:20   BIC:                39857.0245
No. Observations:     100778             Log-Likelihood:     -19865.   
Df Model:             10                 LL-Null:            -69606.   
Df Residuals:         100767             LLR p-value:        0.0000    
Converged:            1.0000             Scale:              1.0000    
No. Iterations:       9.0000                                           
-----------------------------------------------------------------------
                        Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
-----------------------------------------------------------------------
Intercept               1.3450   0.0419  32.1211 0.0000  1.2629  1.4271
dst_host_srv_count     -0.0144   0.0002 -83.5898 0.0000 -0.0148 -0.0141
flag_SF              

- Dropping the column *dst_host_count* since p-value > 0.05
- Build the equation
> `LE = -1.3450-0.0144*dst_host_srv_count-1.3839*flag_SF-0.6464*diff_srv_rate-1.6465*logged_in+0.2648*dst_host_diff_srv_rate+0.0108*count+0.6932*serror_rate+1.9078*flag_S0+1.9576*srv_diff_host_rate`

### Diagnosing the Model 1

In [45]:
#AUC for both training and testing data
sd_train_auc = metrics.roc_auc_score(train['Attack_Binomial'], sd_result.predict(train))
sd_test_auc = metrics.roc_auc_score(test['Attack_Binomial'], sd_result.predict(test))

print("The AUC for the model built on the Train Data is : ", sd_train_auc)
print("The AUC for the model built on the Test Data is : ", sd_test_auc)

The AUC for the model built on the Train Data is :  0.9733220518295712
The AUC for the model built on the Test Data is :  0.9727761798294294


> As AUROC for both test and train data is very similar, we can say that the model seems to be holding good

In [48]:
sd_train_predict = pd.DataFrame(sd_result.predict(train))

sd_train_predict.columns = ['prob']
train_actual = train['Attack_Binomial']
sd_train_predict = pd.concat([train_actual, sd_train_predict], axis=1)
sd_train_predict.columns = ['actual','prob']
sd_train_predict.head(10)

Unnamed: 0,actual,prob
95141,0,0.007646
37486,1,0.995383
34926,1,0.983642
34589,0,0.005513
11420,0,0.307089
46955,1,0.997895
32661,1,0.715199
21066,1,0.517419
22128,1,0.97764
21455,1,0.288117


In [55]:
#Preparing test dataset same way
sd_test_predicted = pd.DataFrame(sd_result.predict(test))
sd_test_predicted.columns = ['prob']
test_actual = test['Attack_Binomial']
# making a DataFrame with actual and prob columns
sd_test_predict = pd.concat([test_actual, sd_test_predicted], axis=1)
sd_test_predict.columns = ['actual','prob']
sd_test_predict.head()

Unnamed: 0,actual,prob
378,0,0.263675
32038,1,0.974984
86399,1,0.998699
74412,1,0.887978
52951,1,0.815277


In [50]:
# Now We need to find the most ideal cut-off to carry out our predictions
roc_like_df = pd.DataFrame()

for cut_off in np.linspace(0.1,0.9,num=100):
    sd_train_predict['cut_off'] = cut_off
    sd_train_predict['predicted'] = sd_train_predict['prob'].apply(lambda x: 0.0 if x < cut_off else 1.0)
    sd_train_predict['tp'] = sd_train_predict.apply(lambda x: 1.0 if x['actual']==1.0 and x['predicted']==1 else 0.0, axis=1)
    sd_train_predict['fp'] = sd_train_predict.apply(lambda x: 1.0 if x['actual']==0.0 and x['predicted']==1 else 0.0, axis=1)
    sd_train_predict['tn'] = sd_train_predict.apply(lambda x: 1.0 if x['actual']==0.0 and x['predicted']==0 else 0.0, axis=1)
    sd_train_predict['fn'] = sd_train_predict.apply(lambda x: 1.0 if x['actual']==1.0 and x['predicted']==0 else 0.0, axis=1)
    sensitivity = sd_train_predict['tp'].sum() / (sd_train_predict['tp'].sum() + sd_train_predict['fn'].sum())
    specificity = sd_train_predict['tn'].sum() / (sd_train_predict['tn'].sum() + sd_train_predict['fp'].sum())
    sen_spec = sensitivity+specificity
    accuracy = (sd_train_predict['tp'].sum()  + sd_train_predict['tn'].sum() ) / (sd_train_predict['tp'].sum() + sd_train_predict['fn'].sum() + sd_train_predict['tn'].sum() + sd_train_predict['fp'].sum())
    roc_like_table = pd.DataFrame([cut_off, sensitivity, specificity, sen_spec, accuracy]).T
    roc_like_table.columns = ['cutoff', 'sensitivity', 'specificity', 'Sensitivity+Specificity', 'accuracy']
    roc_like_df = pd.concat([roc_like_df, roc_like_table], axis=0)

In [53]:
#Cut-off based on highest sensitivity + specificity
roc_like_df[roc_like_df['Sensitivity+Specificity']==roc_like_df['Sensitivity+Specificity'].max()]


Unnamed: 0,cutoff,sensitivity,specificity,Sensitivity+Specificity,accuracy
0,0.536364,0.881789,0.968305,1.850095,0.92808


In [52]:
#Cut-off based on highest accuracy   - some teams use this as methodology to decide the cut-off
roc_like_df[roc_like_df['accuracy']==roc_like_df['accuracy'].max()]

Unnamed: 0,cutoff,sensitivity,specificity,Sensitivity+Specificity,accuracy
0,0.536364,0.881789,0.968305,1.850095,0.92808


In [57]:
# We are going with cutoff = 0.536364
sd_train_predict['predicted'] = sd_train_predict['prob'].apply(lambda x: 1 if x > 0.536364 else 0)
sd_test_predict['predicted'] = sd_test_predict['prob'].apply(lambda x: 1 if x > 0.536364 else 0)

### Final parameters for Model 1

In [58]:
print("The overall accuracy score for the Train Data is : ", metrics.accuracy_score(sd_train_predict.actual, sd_train_predict.predicted))
print("The overall accuracy score for the Test Data  is : ", metrics.accuracy_score(sd_test_predict.actual, sd_test_predict.predicted))

The overall accuracy score for the Train Data is :  0.9280795411696997
The overall accuracy score for the Test Data  is :  0.925977376463584


In [59]:
print(metrics.classification_report(sd_train_predict.actual, sd_train_predict.predicted))

              precision    recall  f1-score   support

           0       0.90      0.97      0.94     53921
           1       0.96      0.88      0.92     46857

   micro avg       0.93      0.93      0.93    100778
   macro avg       0.93      0.93      0.93    100778
weighted avg       0.93      0.93      0.93    100778



In [60]:
print(metrics.classification_report(sd_test_predict.actual, sd_test_predict.predicted))

              precision    recall  f1-score   support

           0       0.90      0.97      0.93     13422
           1       0.96      0.88      0.92     11773

   micro avg       0.93      0.93      0.93     25195
   macro avg       0.93      0.92      0.93     25195
weighted avg       0.93      0.93      0.93     25195



## Creating Logistic Regression Model No. 2
### Based on VIF

In [61]:
var_list_vif = ['dst_host_srv_count','flag_S2','dst_host_same_src_port_rate','dst_host_diff_srv_rate','flag_RSTOS0','count','su_attempted','dst_host_count','dst_host_srv_diff_host_rate','root_shell','diff_srv_rate','duration','num_access_files','srv_diff_host_rate','wrong_fragment','is_guest_login','num_file_creations']
var_list_vif = "+".join(var_list_vif)
var_list_vif

'dst_host_srv_count+flag_S2+dst_host_same_src_port_rate+dst_host_diff_srv_rate+flag_RSTOS0+count+su_attempted+dst_host_count+dst_host_srv_diff_host_rate+root_shell+diff_srv_rate+duration+num_access_files+srv_diff_host_rate+wrong_fragment+is_guest_login+num_file_creations'

In [62]:
vif_logreg = sm.logit(formula='Attack_Binomial ~ '+var_list_vif, data=train)
vif_result = vif_logreg.fit()

         Current function value: 0.227171
         Iterations: 35




In [63]:
 print(vif_result.summary2())

                                     Results: Logit
Model:                       Logit                   Pseudo R-squared:        0.671     
Dependent Variable:          Attack_Binomial         AIC:                     45823.6103
Date:                        2020-04-04 23:17        BIC:                     45994.9824
No. Observations:            100778                  Log-Likelihood:          -22894.   
Df Model:                    17                      LL-Null:                 -69606.   
Df Residuals:                100760                  LLR p-value:             0.0000    
Converged:                   0.0000                  Scale:                   1.0000    
No. Iterations:              35.0000                                                    
----------------------------------------------------------------------------------------
                             Coef.   Std.Err.      z     P>|z|     [0.025       0.975]  
----------------------------------------------------------

- Dropping this model since the AIC is quite large compared to previous model
- Also there are quite a few variables with p-value > 0.05, hence the variable selection doesn't inspire confidence either..... :p

## Applying Model 1 to the Test data (unknown data)

In [64]:
protocol_type_dummy= pd.get_dummies(nid_test['protocol_type'], prefix='protocol', drop_first=True)
flag_dummy = pd.get_dummies(nid_test['flag'], prefix='flag', drop_first=True)
nid_test_df = nid_test.copy()
nid_test_df = pd.concat([nid_test_df, protocol_type_dummy,flag_dummy], axis=1)

nid_test_df.drop( 'protocol_type', axis = 1, inplace = True )
nid_test_df.drop( 'flag', axis = 1, inplace = True )

In [65]:
nid_test_df.shape

(22543, 53)

In [67]:
nid_test_df['probability'] = sd_result.predict(nid_test_df)

In [69]:
nid_test_df['Attack_Binomial'] = nid_test_df['probability'].apply(lambda x: 1 if x > 0.536364 else 0)

In [70]:
nid_test_df.Attack_Binomial.value_counts()

0    14368
1     8175
Name: Attack_Binomial, dtype: int64

In [71]:
nid_test_df.to_csv("Test_with_Attack_data.csv")

#### We are classifying 14368 entries as "normal" connection and 8175 entries as "attack" to the system