In [1]:
# Section that will be used to define all the required package imports
import pandas as pd
import numpy as np
import datetime as dt
import os
from dateutil.relativedelta import relativedelta
import scipy.stats as stats
import math

import pandas_profiling
from matplotlib.backends.backend_pdf import PdfPages

# For plotting bar and charts
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.feature_selection import RFE, RFECV, SelectKBest, chi2, SelectPercentile, f_classif, mutual_info_classif, f_regression, VarianceThreshold, SelectFromModel, mutual_info_classif, mutual_info_regression, SelectFpr, SelectFdr, SelectFwe
import statsmodels.formula.api as sm
import seaborn as sns
from sklearn.preprocessing import StandardScaler, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


In [2]:
np.set_printoptions(formatter={'all':lambda x: str(x)})

In [3]:
nid_train = pd.read_csv("NSL_Dataset\\Train.csv",header = 0)
nid_train.columns = ["duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins","logged_in","num_compromised","root_shell","su_attempted","num_root","num_file_creations","num_shells","num_access_files","num_outbound_cmds","is_host_login","is_guest_login","count","srv_count","serror_rate", "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate", "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count","dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate","dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate","attack","last_flag"]
nid_train.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,last_flag
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [4]:
nid_test = pd.read_csv("NSL_Dataset\\Test.csv",header = 0,low_memory = False)
nid_test.columns = ["duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins","logged_in","num_compromised","root_shell","su_attempted","num_root","num_file_creations","num_shells","num_access_files","num_outbound_cmds","is_host_login","is_guest_login","count","srv_count","serror_rate", "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate", "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count","dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate","dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate","attack","last_flag"]

# Multiclass Classification: What is the kind of Attack?
### Using KNN Classifier to identify attack type(DoS,Probe,R2L or U2R)
> a. DoS = 1<br>
> b. Probe = 2<br>
> c. R2L = 3<br>
> d. U2R = 4

In [5]:
# Function to classify the attack variable in multiclass categorical Y variable
def multi_classification(x):
    if x in (["back","land","neptune","pod","smurf","teardrop","apache2","udpstorm","processtable","worm"]):
        return 1
    elif x in (["satan","ipsweep","nmap","portsweep","mscan","saint"]):
        return 2
    elif x in (["guess_passwd","ftp_write","imap","phf","multihop","warezmaster","warezclient","spy","xlock","xsnoop","snmpguess","snmpgetattack","httptunnel","sendmail","named"]):
        return 3
    elif x in (["buffer_overflow","loadmodule","rootkit","perl","sqlattack","xterm","ps"]):
        return 4
    else:
        return 0
    

In [6]:
nid_train["Attack_Multiclass"] = nid_train.apply(lambda x: multi_classification(x["attack"]),axis =1)

In [7]:
nid_train.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,last_flag,Attack_Multiclass
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20,0
1,0,udp,other,SF,146,0,0,0,0,0,...,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15,0
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19,1
3,0,tcp,http,SF,232,8153,0,0,0,0,...,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21,0
4,0,tcp,http,SF,199,420,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21,0


In [8]:
nid_train.drop(['attack'],axis = 1,inplace=True)
nid_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 43 columns):
duration                       125973 non-null int64
protocol_type                  125973 non-null object
service                        125973 non-null object
flag                           125973 non-null object
src_bytes                      125973 non-null int64
dst_bytes                      125973 non-null int64
land                           125973 non-null int64
wrong_fragment                 125973 non-null int64
urgent                         125973 non-null int64
hot                            125973 non-null int64
num_failed_logins              125973 non-null int64
logged_in                      125973 non-null int64
num_compromised                125973 non-null int64
root_shell                     125973 non-null int64
su_attempted                   125973 non-null int64
num_root                       125973 non-null int64
num_file_creations             125973 

In [9]:
nid_test["Attack_Multiclass"] = nid_test.apply(lambda x: multi_classification(x["attack"]),axis =1)
nid_test.drop(['attack'],axis = 1,inplace=True)
nid_test.shape

(22543, 43)

In [10]:
nid_train.shape

(125973, 43)

In [11]:
output = pandas_profiling.ProfileReport(nid_train)
output.to_file(output_file='NID_Profiling_Multinomial.html')

## Profiling output : Detailed Analysis

1. Ignoring warning for duplicate rows, since its a valid scenario as we do not have a unique identifier in our rows.
2. Most of the columns marked as Skewed are valid, since 0 entries imply that that particular network feature wasn't utuilised completely.
3. Following columns are going to be dropped:<br>
    a. **service** : due to High cardinality<br>
    b. **num_outbound_cmds** : since it has only 0 as value<br>

> We will also be avoiding Outlier capping, since the data skewness is high in columns

In [12]:
# copying the dataframes (in case need to start over)
nid_train_1 = nid_train.copy()
nid_test_1 = nid_test.copy()

In [13]:
#Dummy code the protocol type and flag columns
protocol_type_dummy= pd.get_dummies(nid_train['protocol_type'], prefix='protocol', drop_first=True)
flag_dummy = pd.get_dummies(nid_train['flag'], prefix='flag', drop_first=True)
nid_train = pd.concat([nid_train, protocol_type_dummy,flag_dummy], axis=1)
nid_train.drop( 'protocol_type', axis = 1, inplace = True )
nid_train.drop( 'flag', axis = 1, inplace = True )

#dropping the rejected columns as well
nid_train.drop( 'service', axis = 1, inplace = True )
nid_train.drop( 'num_outbound_cmds', axis = 1, inplace = True )

nid_train.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,491,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [14]:
nid_train.shape

(125973, 51)

In [15]:
# Separting X variables and Y variable
X = nid_train[nid_train.columns.difference(['Attack_Multiclass'])]
Y = nid_train[['Attack_Multiclass']]

# Splitting training dataset into training and testing
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.3, random_state=42)
print(train_x.shape)
print(train_y.shape)

(88181, 50)
(88181, 1)


In [16]:
#Feature Scaling
sc = StandardScaler()
train_x = sc.fit_transform(train_x)
test_x = sc.transform(test_x)

In [17]:
# Fitting KNN Classifier
model = KNeighborsClassifier(n_neighbors=5, weights='distance')
model = model.fit(train_x, train_y)

In [18]:
# Using GridSearchCV to determine best pair of parameters
parameters = [{'n_neighbors': [3,4,5,6], 'weights': ['uniform','distance'],'metric' : ['minkowski'], 'p' :[2]}]
grid_search = GridSearchCV(estimator = model,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 5,
                           n_jobs = -1)
grid_search = grid_search.fit(train_x, train_y)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 99.85 %
Best Parameters: {'metric': 'minkowski', 'n_neighbors': 3, 'p': 2, 'weights': 'distance'}


> **Fitting model again based on best parameters**

In [19]:
# Fitting KNN Classifier
model = KNeighborsClassifier(n_neighbors=3, weights='distance',metric ='minkowski', p=2)
model = model.fit(train_x, train_y)

In [20]:
train_predict = model.predict(train_x)
train_predict_probability = model.predict_proba(train_x)

In [21]:
lbl = np.array([0,1,2,3,4])
tgt = ['normal','dos','probe','r2l','u2r']
print("-------------- Classification Report ----------------------")
print(metrics.classification_report(train_y,train_predict,labels = lbl,target_names = tgt))

-------------- Classification Report ----------------------
              precision    recall  f1-score   support

      normal       1.00      1.00      1.00     47260
         dos       1.00      1.00      1.00     32102
       probe       1.00      1.00      1.00      8116
         r2l       1.00      1.00      1.00       668
         u2r       1.00      1.00      1.00        35

    accuracy                           1.00     88181
   macro avg       1.00      1.00      1.00     88181
weighted avg       1.00      1.00      1.00     88181



In [22]:
test_predict = model.predict(test_x)
test_predict_probability = model.predict_proba(test_x)

In [23]:
lbl = np.array([0,1,2,3,4])
tgt = ['normal','dos','probe','r2l','u2r']
print("-------------- Classification Report ----------------------")
print(metrics.classification_report(test_y,test_predict,labels = lbl,target_names = tgt))

-------------- Classification Report ----------------------
              precision    recall  f1-score   support

      normal       1.00      1.00      1.00     20083
         dos       1.00      1.00      1.00     13825
       probe       0.99      1.00      1.00      3540
         r2l       0.98      0.96      0.97       327
         u2r       0.75      0.53      0.62        17

    accuracy                           1.00     37792
   macro avg       0.95      0.90      0.92     37792
weighted avg       1.00      1.00      1.00     37792



### Fitting the model  to the actual Test.csv data

#### Starting with processing Test data to fit the model

In [24]:
#1. Dummyfing relevant columns
protocol_type_dummy= pd.get_dummies(nid_test['protocol_type'], prefix='protocol', drop_first=True)
flag_dummy = pd.get_dummies(nid_test['flag'], prefix='flag', drop_first=True)
nid_test_std = pd.concat([nid_test, protocol_type_dummy,flag_dummy], axis=1)

#2. Getting rid of unnecessary columns
nid_test_std.drop( 'protocol_type', axis = 1, inplace = True )
nid_test_std.drop( 'flag', axis = 1, inplace = True )
nid_test_std.drop( 'service', axis = 1, inplace = True )
nid_test_std.drop( 'num_outbound_cmds', axis = 1, inplace = True )

#4. Splitting test data into X and Y vars (since we have Y var in the dataset)
t_test_x = nid_test_std[nid_test_std.columns.difference(['Attack_Multiclass'])]
t_test_y = nid_test_std[['Attack_Multiclass']]

#5. Standardizing the dataset
t_test_x = sc.transform(t_test_x)


In [25]:
# Getting predicted data
t_test_predict = model.predict(t_test_x)
t_test_predict_probability = model.predict_proba(t_test_x)

In [26]:
#Getting test fit metrics
lbl = np.array([0,1,2,3,4])
tgt = ['normal','dos','probe','r2l','u2r']
print("-------------- Classification Report ----------------------")
print(metrics.classification_report(t_test_y,t_test_predict,labels = lbl,target_names = tgt))

-------------- Classification Report ----------------------
              precision    recall  f1-score   support

      normal       0.73      0.96      0.83     10004
         dos       0.91      0.84      0.87      7166
       probe       0.71      0.72      0.71      2421
         r2l       0.96      0.07      0.14      2885
         u2r       0.44      0.31      0.37        67

    accuracy                           0.78     22543
   macro avg       0.75      0.58      0.58     22543
weighted avg       0.81      0.78      0.74     22543



In [27]:
nid_test_final = pd.concat([nid_test, pd.DataFrame(data = t_test_predict,columns = ['Predicted_Multiclass'])], axis=1)
nid_test_final.shape

(22543, 44)

In [28]:
nid_test_final.to_csv("Test_data_output.csv")

### With 78% accuracy, we can try a different algorithm to get better accuracy.

## Support Vector Machine for multiclass Classifiction

In [29]:
nid_train = nid_train_1.copy()
protocol_type_dummy= pd.get_dummies(nid_train['protocol_type'], prefix='protocol', drop_first=True)
flag_dummy = pd.get_dummies(nid_train['flag'], prefix='flag', drop_first=True)
nid_train = pd.concat([nid_train, protocol_type_dummy,flag_dummy], axis=1)

In [31]:
nid_train.drop(['protocol_type','flag','service'],axis = 1,inplace=True)
nid_train.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,491,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [33]:
nid_train_cont = nid_train[['duration','src_bytes','dst_bytes','wrong_fragment','urgent','hot','num_failed_logins','num_compromised','num_root','num_file_creations','num_shells','num_access_files','count','srv_count','serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate','srv_diff_host_rate','dst_host_count','dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate','dst_host_srv_rerror_rate']]
nid_train_cont.head()

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,num_root,num_file_creations,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,491,0,0,0,0,0,0,0,0,...,150,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0
1,0,146,0,0,0,0,0,0,0,0,...,255,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,0,0,0,...,255,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0
3,0,232,8153,0,0,0,0,0,0,0,...,30,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01
4,0,199,420,0,0,0,0,0,0,0,...,255,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
nid_train_categ = nid_train[nid_train.columns.difference(['duration','src_bytes','dst_bytes','wrong_fragment','urgent','hot','num_failed_logins','num_compromised','num_root','num_file_creations','num_shells','num_access_files','count','srv_count','serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate','srv_diff_host_rate','dst_host_count','dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate','dst_host_srv_rerror_rate'])]
nid_train_categ.drop(['num_outbound_cmds'],axis = 1,inplace=True)
nid_train_categ.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 20 columns):
Attack_Multiclass    125973 non-null int64
flag_REJ             125973 non-null uint8
flag_RSTO            125973 non-null uint8
flag_RSTOS0          125973 non-null uint8
flag_RSTR            125973 non-null uint8
flag_S0              125973 non-null uint8
flag_S1              125973 non-null uint8
flag_S2              125973 non-null uint8
flag_S3              125973 non-null uint8
flag_SF              125973 non-null uint8
flag_SH              125973 non-null uint8
is_guest_login       125973 non-null int64
is_host_login        125973 non-null int64
land                 125973 non-null int64
last_flag            125973 non-null int64
logged_in            125973 non-null int64
protocol_tcp         125973 non-null uint8
protocol_udp         125973 non-null uint8
root_shell           125973 non-null int64
su_attempted         125973 non-null int64
dtypes: int64(8), uint8(12)
m

In [34]:
corr = nid_train_cont.corr()
corr.to_csv("Correlation_matrix_multiclass.csv")

In [37]:
# Based on Correlation matrix, dropping following columns:
# 'num_compromised','srv_serror_rate','srv_rerror_rate','same_srv_rate','dst_host_srv_count','dst_host_same_srv_rate','dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate','dst_host_srv_rerror_rate'
nid_train_cont.drop(['num_compromised','srv_serror_rate','srv_rerror_rate','same_srv_rate','dst_host_srv_count','dst_host_same_srv_rate','dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate','dst_host_srv_rerror_rate'],axis = 1,inplace = True)
nid_train_cont.head()

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_root,num_file_creations,num_shells,...,count,srv_count,serror_rate,rerror_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate
0,0,491,0,0,0,0,0,0,0,0,...,2,2,0.0,0.0,0.0,0.0,150,0.03,0.17,0.0
1,0,146,0,0,0,0,0,0,0,0,...,13,1,0.0,0.0,0.15,0.0,255,0.6,0.88,0.0
2,0,0,0,0,0,0,0,0,0,0,...,123,6,1.0,0.0,0.07,0.0,255,0.05,0.0,0.0
3,0,232,8153,0,0,0,0,0,0,0,...,5,5,0.2,0.0,0.0,0.0,30,0.0,0.03,0.04
4,0,199,420,0,0,0,0,0,0,0,...,30,32,0.0,0.0,0.0,0.09,255,0.0,0.0,0.0


In [40]:
print(nid_train_cont.shape)
print(nid_train_categ.shape)

(125973, 21)
(125973, 20)


In [41]:
nid_train_trim = pd.concat([nid_train_cont,nid_train_categ],axis=1)

In [42]:
nid_train_trim.shape

(125973, 41)

### To gather important columns
#### 1. RFE

In [45]:
X = nid_train_trim[nid_train_trim.columns.difference(['Attack_Multiclass'])]
y = nid_train_trim['Attack_Multiclass']
classifier = RandomForestClassifier()
rfe = RFE(classifier, 20)
rfe = rfe.fit(X, y )

In [46]:
imp_rfe = list(X.columns[rfe.support_])
imp_rfe

['count',
 'diff_srv_rate',
 'dst_bytes',
 'dst_host_count',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'flag_RSTR',
 'flag_S0',
 'flag_SF',
 'hot',
 'last_flag',
 'logged_in',
 'protocol_tcp',
 'protocol_udp',
 'rerror_rate',
 'serror_rate',
 'src_bytes',
 'srv_count',
 'wrong_fragment']

#### 2. Select KBest

In [47]:
SKB = SelectKBest(f_classif, k=20).fit(X, y )

In [48]:
SKB.get_support()
imp_vars_SKB = list(X.columns[SKB.get_support()])
imp_vars_SKB

['count',
 'diff_srv_rate',
 'dst_host_count',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'flag_REJ',
 'flag_RSTR',
 'flag_S0',
 'flag_SF',
 'hot',
 'is_guest_login',
 'last_flag',
 'logged_in',
 'protocol_tcp',
 'protocol_udp',
 'rerror_rate',
 'root_shell',
 'serror_rate',
 'srv_diff_host_rate']

In [49]:
Final_list = list(set(imp_vars_SKB + imp_rfe ))
Final_list

['protocol_udp',
 'dst_host_srv_diff_host_rate',
 'serror_rate',
 'src_bytes',
 'dst_host_count',
 'srv_diff_host_rate',
 'dst_host_diff_srv_rate',
 'srv_count',
 'flag_S0',
 'flag_RSTR',
 'dst_bytes',
 'logged_in',
 'dst_host_same_src_port_rate',
 'rerror_rate',
 'last_flag',
 'root_shell',
 'protocol_tcp',
 'is_guest_login',
 'hot',
 'flag_REJ',
 'wrong_fragment',
 'flag_SF',
 'count',
 'diff_srv_rate']

## Fitting `Model 1` using RFE variables 

In [50]:
nid_rfe = nid_train[imp_rfe]
nid_rfe['Attack_Multiclass']=nid_train.Attack_Multiclass
nid_rfe.shape

(125973, 21)

In [51]:
target_rfe = nid_rfe[['Attack_Multiclass']]
features_rfe = nid_rfe[nid_rfe.columns.difference(['Attack_Multiclass'])]
train_y_rfe, test_y_rfe, train_X_rfe, test_X_rfe = train_test_split(target_rfe,features_rfe, test_size=0.3, random_state=42)

In [52]:
sc = StandardScaler()
train_X_rfe = sc.fit_transform(train_X_rfe)
test_X_rfe = sc.transform(test_X_rfe)

In [53]:
classifier = SVC(kernel = 'rbf', random_state = 42)
classifier.fit(train_X_rfe, train_y_rfe)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [54]:
train_predict_rfe = classifier.predict(train_X_rfe)
tgt = ['normal','dos','probe','r2l','u2r']
print("------------Parameters: C=1.0,kernel = 'rbf'------------")
print(classification_report(train_y_rfe, train_predict_rfe,target_names=tgt))

------------Parameters: C=1.0,kernel = 'rbf'------------
              precision    recall  f1-score   support

      normal       1.00      1.00      1.00     47260
         dos       1.00      1.00      1.00     32102
       probe       0.98      0.99      0.99      8116
         r2l       0.94      0.94      0.94       668
         u2r       0.75      0.43      0.55        35

    accuracy                           1.00     88181
   macro avg       0.93      0.87      0.89     88181
weighted avg       1.00      1.00      1.00     88181



In [55]:
test_predict_rfe = classifier.predict(test_X_rfe)
tgt = ['normal','dos','probe','r2l','u2r']
print("------------Parameters: C=1.0,kernel = 'rbf'------------")
print(classification_report(test_y_rfe, test_predict_rfe,target_names=tgt))

------------Parameters: C=1.0,kernel = 'rbf'------------
              precision    recall  f1-score   support

      normal       1.00      0.99      1.00     20083
         dos       1.00      1.00      1.00     13825
       probe       0.98      0.99      0.98      3540
         r2l       0.93      0.93      0.93       327
         u2r       0.71      0.29      0.42        17

    accuracy                           0.99     37792
   macro avg       0.92      0.84      0.86     37792
weighted avg       0.99      0.99      0.99     37792



### Testing the actual Test Data using the RFE variables

In [57]:
#1. Creating the required dummy columns
protocol_type_dummy= pd.get_dummies(nid_test['protocol_type'], prefix='protocol', drop_first=True)
flag_dummy = pd.get_dummies(nid_test['flag'], prefix='flag', drop_first=True)
nid_test = pd.concat([nid_test, protocol_type_dummy,flag_dummy], axis=1)

#2. Getting rid of unnecessary columns
nid_test_trim = nid_test[imp_rfe]
nid_test_trim["Attack_Multiclass"] = nid_test["Attack_Multiclass"]

#3. Splitting test data into X and Y vars (since we have Y var in the dataset)
t_test_x_rfe = nid_test_trim[nid_test_trim.columns.difference(['Attack_Multiclass'])]
t_test_y_rfe = nid_test_trim[['Attack_Multiclass']]

#4. Standardizing the dataset
t_test_x_rfe = sc.transform(t_test_x_rfe)

#5. Fitting the Model
t_test_predict_rfe = classifier.predict(t_test_x_rfe)
tgt = ['normal','dos','probe','r2l','u2r']
print("------------Parameters: C=1.0,kernel = 'rbf'------------")
print(classification_report(t_test_y_rfe, t_test_predict_rfe,target_names=tgt))

------------Parameters: C=1.0,kernel = 'rbf'------------
              precision    recall  f1-score   support

      normal       0.73      0.97      0.83     10004
         dos       0.90      0.82      0.86      7166
       probe       0.67      0.75      0.71      2421
         r2l       0.67      0.02      0.04      2885
         u2r       0.44      0.24      0.31        67

    accuracy                           0.77     22543
   macro avg       0.68      0.56      0.55     22543
weighted avg       0.77      0.77      0.73     22543



In [61]:
classifier = SVC(C=15, kernel = 'linear')
classifier.fit(train_X_rfe, train_y_rfe)
train_predict_rfe = classifier.predict(train_X_rfe)
tgt = ['normal','dos','probe','r2l','u2r']
print("------------Train Parameters: C=15.0,kernel = 'linear'------------")
print(classification_report(train_y_rfe, train_predict_rfe,target_names=tgt))

test_predict_rfe = classifier.predict(test_X_rfe)
tgt = ['normal','dos','probe','r2l','u2r']
print("------------Test Parameters: C=15.0,kernel = 'linear'------------")
print(classification_report(test_y_rfe, test_predict_rfe,target_names=tgt))

------------Train Parameters: C=15.0,kernel = 'linear'------------
              precision    recall  f1-score   support

      normal       0.98      0.99      0.99     47260
         dos       0.99      0.99      0.99     32102
       probe       0.97      0.94      0.96      8116
         r2l       0.88      0.94      0.90       668
         u2r       0.81      0.37      0.51        35

    accuracy                           0.98     88181
   macro avg       0.93      0.85      0.87     88181
weighted avg       0.98      0.98      0.98     88181

------------Test Parameters: C=15.0,kernel = 'linear'------------
              precision    recall  f1-score   support

      normal       0.98      0.99      0.98     20083
         dos       0.99      0.99      0.99     13825
       probe       0.96      0.94      0.95      3540
         r2l       0.87      0.91      0.89       327
         u2r       0.50      0.18      0.26        17

    accuracy                           0.98     3779

In [59]:
# Fitting the Model to actual test data
t_test_predict_rfe = classifier.predict(t_test_x_rfe)
tgt = ['normal','dos','probe','r2l','u2r']
print("------------Parameters: C=10.0,kernel = 'linear'------------")
print(classification_report(t_test_y_rfe, t_test_predict_rfe,target_names=tgt))

------------Parameters: C=10.0,kernel = 'linear'------------
              precision    recall  f1-score   support

      normal       0.78      0.95      0.85     10004
         dos       0.86      0.86      0.86      7166
       probe       0.64      0.78      0.70      2421
         r2l       0.58      0.02      0.03      2885
         u2r       0.08      0.24      0.12        67

    accuracy                           0.78     22543
   macro avg       0.59      0.57      0.51     22543
weighted avg       0.76      0.78      0.73     22543



In [62]:
# Fitting the Model to actual test data - C=15
t_test_predict_rfe = classifier.predict(t_test_x_rfe)
tgt = ['normal','dos','probe','r2l','u2r']
print("------------Parameters: C=15.0,kernel = 'linear'------------")
print(classification_report(t_test_y_rfe, t_test_predict_rfe,target_names=tgt))

------------Parameters: C=15.0,kernel = 'linear'------------
              precision    recall  f1-score   support

      normal       0.78      0.95      0.85     10004
         dos       0.87      0.86      0.86      7166
       probe       0.64      0.78      0.70      2421
         r2l       0.58      0.02      0.03      2885
         u2r       0.08      0.24      0.12        67

    accuracy                           0.78     22543
   macro avg       0.59      0.57      0.51     22543
weighted avg       0.76      0.78      0.73     22543



In [63]:
classifier = SVC(C=3, kernel = 'rbf')
classifier.fit(train_X_rfe, train_y_rfe)
train_predict_rfe = classifier.predict(train_X_rfe)
tgt = ['normal','dos','probe','r2l','u2r']
print("------------Train Parameters: C=3.0,kernel = 'rbf'------------")
print(classification_report(train_y_rfe, train_predict_rfe,target_names=tgt))

test_predict_rfe = classifier.predict(test_X_rfe)
tgt = ['normal','dos','probe','r2l','u2r']
print("------------Test Parameters: C=3.0,kernel = 'rbf'------------")
print(classification_report(test_y_rfe, test_predict_rfe,target_names=tgt))

------------Train Parameters: C=3.0,kernel = 'rbf'------------
              precision    recall  f1-score   support

      normal       1.00      1.00      1.00     47260
         dos       1.00      1.00      1.00     32102
       probe       0.99      0.99      0.99      8116
         r2l       0.96      0.96      0.96       668
         u2r       0.84      0.60      0.70        35

    accuracy                           1.00     88181
   macro avg       0.96      0.91      0.93     88181
weighted avg       1.00      1.00      1.00     88181

------------Test Parameters: C=3.0,kernel = 'rbf'------------
              precision    recall  f1-score   support

      normal       1.00      1.00      1.00     20083
         dos       1.00      1.00      1.00     13825
       probe       0.98      0.99      0.99      3540
         r2l       0.94      0.94      0.94       327
         u2r       0.75      0.35      0.48        17

    accuracy                           1.00     37792
   mac

In [64]:
# Fitting the Model to actual test data - C=3 rbf kernel
t_test_predict_rfe = classifier.predict(t_test_x_rfe)
tgt = ['normal','dos','probe','r2l','u2r']
print("------------Parameters: C=3.0,kernel = 'rbf'------------")
print(classification_report(t_test_y_rfe, t_test_predict_rfe,target_names=tgt))

------------Parameters: C=3.0,kernel = 'rbf'------------
              precision    recall  f1-score   support

      normal       0.74      0.97      0.84     10004
         dos       0.89      0.82      0.86      7166
       probe       0.68      0.75      0.71      2421
         r2l       0.66      0.02      0.04      2885
         u2r       0.33      0.24      0.28        67

    accuracy                           0.78     22543
   macro avg       0.66      0.56      0.54     22543
weighted avg       0.77      0.78      0.73     22543



## `Model 1 Conclusion` :
For **RFE based variable, best classifier params: C=1,kernel='rbf'**

## Fitting `Model 2` using Select K-Best variables 

In [65]:
nid_skb = nid_train[imp_vars_SKB]
nid_skb['Attack_Multiclass']=nid_train.Attack_Multiclass
nid_skb.shape

(125973, 21)

In [66]:
target_skb = nid_skb[['Attack_Multiclass']]
features_skb = nid_skb[nid_skb.columns.difference(['Attack_Multiclass'])]
train_y_skb, test_y_skb, train_X_skb, test_X_skb = train_test_split(target_skb,features_skb, test_size=0.3, random_state=42)

In [67]:
sc = StandardScaler()
train_X_skb = sc.fit_transform(train_X_skb)
test_X_skb = sc.transform(test_X_skb)

In [68]:
classifier = SVC(kernel = 'rbf', random_state = 42)
classifier.fit(train_X_skb, train_y_skb)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [69]:
train_predict_skb = classifier.predict(train_X_skb)
tgt = ['normal','dos','probe','r2l','u2r']
print("------------SKB Parameters: C=1.0,kernel = 'rbf'------------")
print(classification_report(train_y_skb, train_predict_skb,target_names=tgt))

------------SKB Parameters: C=1.0,kernel = 'rbf'------------
              precision    recall  f1-score   support

      normal       1.00      0.99      1.00     47260
         dos       1.00      1.00      1.00     32102
       probe       0.98      0.98      0.98      8116
         r2l       0.94      0.94      0.94       668
         u2r       0.86      0.54      0.67        35

    accuracy                           0.99     88181
   macro avg       0.96      0.89      0.92     88181
weighted avg       0.99      0.99      0.99     88181



In [70]:
test_predict_skb = classifier.predict(test_X_skb)
tgt = ['normal','dos','probe','r2l','u2r']
print("------------SKB Parameters: C=1.0,kernel = 'rbf'------------")
print(classification_report(test_y_skb, test_predict_skb,target_names=tgt))

------------SKB Parameters: C=1.0,kernel = 'rbf'------------
              precision    recall  f1-score   support

      normal       1.00      0.99      0.99     20083
         dos       0.99      1.00      1.00     13825
       probe       0.98      0.99      0.98      3540
         r2l       0.92      0.92      0.92       327
         u2r       0.71      0.29      0.42        17

    accuracy                           0.99     37792
   macro avg       0.92      0.84      0.86     37792
weighted avg       0.99      0.99      0.99     37792



### Testing the actual Test Data using the RFE variables

In [71]:
#1. Culling to keep relevant columns
nid_test_trim = nid_test[imp_vars_SKB]
nid_test_trim["Attack_Multiclass"] = nid_test["Attack_Multiclass"]

#2. Splitting test data into X and Y vars (since we have Y var in the dataset)
t_test_x_skb = nid_test_trim[nid_test_trim.columns.difference(['Attack_Multiclass'])]
t_test_y_skb = nid_test_trim[['Attack_Multiclass']]

#3. Standardizing the dataset
t_test_x_skb = sc.transform(t_test_x_skb)

#4. Fitting the Model
t_test_predict_skb = classifier.predict(t_test_x_skb)
tgt = ['normal','dos','probe','r2l','u2r']
print("------------SKB Parameters: C=1.0,kernel = 'rbf'------------")
print(classification_report(t_test_y_skb, t_test_predict_skb,target_names=tgt))

------------SKB Parameters: C=1.0,kernel = 'rbf'------------
              precision    recall  f1-score   support

      normal       0.73      0.96      0.83     10004
         dos       0.91      0.82      0.86      7166
       probe       0.67      0.67      0.67      2421
         r2l       0.94      0.15      0.26      2885
         u2r       0.73      0.28      0.41        67

    accuracy                           0.78     22543
   macro avg       0.80      0.58      0.61     22543
weighted avg       0.81      0.78      0.75     22543



In [73]:
# Fitting the Model to actual test data - C=10 kernel= linear
classifier = SVC(C=10, kernel = 'linear')
classifier.fit(train_X_skb, train_y_skb)
train_predict_skb = classifier.predict(train_X_skb)
tgt = ['normal','dos','probe','r2l','u2r']
print("------------SKB Train Parameters: C=10,kernel = 'linear'------------")
print(classification_report(train_y_skb, train_predict_skb,target_names=tgt))

test_predict_skb = classifier.predict(test_X_skb)
tgt = ['normal','dos','probe','r2l','u2r']
print("------------SKB Test Parameters: C=10,kernel = 'linear'------------")
print(classification_report(test_y_skb, test_predict_skb,target_names=tgt))

------------SKB Train Parameters: C=10,kernel = 'linear'------------
              precision    recall  f1-score   support

      normal       0.98      0.98      0.98     47260
         dos       0.98      0.98      0.98     32102
       probe       0.95      0.90      0.92      8116
         r2l       0.87      0.96      0.91       668
         u2r       0.88      0.63      0.73        35

    accuracy                           0.97     88181
   macro avg       0.93      0.89      0.91     88181
weighted avg       0.97      0.97      0.97     88181

------------SKB Test Parameters: C=10,kernel = 'linear'------------
              precision    recall  f1-score   support

      normal       0.97      0.98      0.98     20083
         dos       0.98      0.98      0.98     13825
       probe       0.94      0.90      0.92      3540
         r2l       0.88      0.94      0.91       327
         u2r       0.82      0.53      0.64        17

    accuracy                           0.97     

In [74]:
# Fitting the Model to actual test data - C=10 kernel= linear
t_test_predict_skb = classifier.predict(t_test_x_skb)
tgt = ['normal','dos','probe','r2l','u2r']
print("------------SKB Parameters: C=10,kernel = 'linear'------------")
print(classification_report(t_test_y_skb, t_test_predict_skb,target_names=tgt))

------------SKB Parameters: C=10,kernel = 'linear'------------
              precision    recall  f1-score   support

      normal       0.76      0.93      0.83     10004
         dos       0.84      0.88      0.86      7166
       probe       0.77      0.79      0.78      2421
         r2l       0.89      0.09      0.17      2885
         u2r       0.59      0.30      0.40        67

    accuracy                           0.79     22543
   macro avg       0.77      0.60      0.61     22543
weighted avg       0.80      0.79      0.75     22543



## `Model 2 Conclusion` :
For **SKB based variable, best classifier params: C=10,kernel='linear'**

## Fitting `Model 3` using both sets of best variables 

In [75]:
nid_all = nid_train[Final_list]
nid_all['Attack_Multiclass']=nid_train.Attack_Multiclass
nid_all.shape

(125973, 25)

In [77]:
target_all = nid_all[['Attack_Multiclass']]
features_all = nid_all[nid_all.columns.difference(['Attack_Multiclass'])]
train_y_all, test_y_all, train_X_all, test_X_all = train_test_split(target_all,features_all, test_size=0.3, random_state=42)

In [78]:
sc = StandardScaler()
train_X_all = sc.fit_transform(train_X_all)
test_X_all = sc.transform(test_X_all)

In [79]:
classifier = SVC(kernel = 'rbf', random_state = 42)
classifier.fit(train_X_all, train_y_all)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [80]:
train_predict_all = classifier.predict(train_X_all)
tgt = ['normal','dos','probe','r2l','u2r']
print("------------all Parameters: C=1.0,kernel = 'rbf'------------")
print(classification_report(train_y_all, train_predict_all,target_names=tgt))

------------all Parameters: C=1.0,kernel = 'rbf'------------
              precision    recall  f1-score   support

      normal       1.00      1.00      1.00     47260
         dos       1.00      1.00      1.00     32102
       probe       0.98      0.99      0.99      8116
         r2l       0.93      0.95      0.94       668
         u2r       0.86      0.51      0.64        35

    accuracy                           1.00     88181
   macro avg       0.95      0.89      0.91     88181
weighted avg       1.00      1.00      1.00     88181



In [81]:
test_predict_all = classifier.predict(test_X_all)
tgt = ['normal','dos','probe','r2l','u2r']
print("------------all Parameters: C=1.0,kernel = 'rbf'------------")
print(classification_report(test_y_all, test_predict_all,target_names=tgt))

------------all Parameters: C=1.0,kernel = 'rbf'------------
              precision    recall  f1-score   support

      normal       1.00      0.99      1.00     20083
         dos       1.00      1.00      1.00     13825
       probe       0.97      0.99      0.98      3540
         r2l       0.92      0.93      0.93       327
         u2r       0.71      0.29      0.42        17

    accuracy                           0.99     37792
   macro avg       0.92      0.84      0.86     37792
weighted avg       0.99      0.99      0.99     37792



### Testing the actual Test Data using the RFE variables

In [82]:
#1. Culling to keep relevant columns
nid_test_trim = nid_test[Final_list]
nid_test_trim["Attack_Multiclass"] = nid_test["Attack_Multiclass"]

#2. Splitting test data into X and Y vars (since we have Y var in the dataset)
t_test_x_all = nid_test_trim[nid_test_trim.columns.difference(['Attack_Multiclass'])]
t_test_y_all = nid_test_trim[['Attack_Multiclass']]

#3. Standardizing the dataset
t_test_x_all = sc.transform(t_test_x_all)

#4. Fitting the Model
t_test_predict_all = classifier.predict(t_test_x_all)
tgt = ['normal','dos','probe','r2l','u2r']
print("------------SKB Parameters: C=1.0,kernel = 'rbf'------------")
print(classification_report(t_test_y_all, t_test_predict_all,target_names=tgt))

------------SKB Parameters: C=1.0,kernel = 'rbf'------------
              precision    recall  f1-score   support

      normal       0.73      0.97      0.83     10004
         dos       0.96      0.82      0.89      7166
       probe       0.66      0.70      0.68      2421
         r2l       0.94      0.14      0.25      2885
         u2r       0.73      0.28      0.41        67

    accuracy                           0.79     22543
   macro avg       0.80      0.58      0.61     22543
weighted avg       0.82      0.79      0.76     22543



In [83]:
# Fitting the Model to actual test data - C=10 kernel= linear
classifier = SVC(C=10, kernel = 'linear')
classifier.fit(train_X_all, train_y_all)
train_predict_all = classifier.predict(train_X_all)
tgt = ['normal','dos','probe','r2l','u2r']
print("------------all Train Parameters: C=10,kernel = 'linear'------------")
print(classification_report(train_y_all, train_predict_all,target_names=tgt))

test_predict_all = classifier.predict(test_X_all)
tgt = ['normal','dos','probe','r2l','u2r']
print("------------all Test Parameters: C=10,kernel = 'linear'------------")
print(classification_report(test_y_all, test_predict_all,target_names=tgt))

------------all Train Parameters: C=10,kernel = 'linear'------------
              precision    recall  f1-score   support

      normal       0.99      0.99      0.99     47260
         dos       0.99      1.00      1.00     32102
       probe       0.97      0.94      0.95      8116
         r2l       0.90      0.94      0.92       668
         u2r       0.88      0.63      0.73        35

    accuracy                           0.99     88181
   macro avg       0.94      0.90      0.92     88181
weighted avg       0.99      0.99      0.99     88181

------------all Test Parameters: C=10,kernel = 'linear'------------
              precision    recall  f1-score   support

      normal       0.99      0.99      0.99     20083
         dos       0.99      1.00      1.00     13825
       probe       0.96      0.94      0.95      3540
         r2l       0.89      0.93      0.91       327
         u2r       0.82      0.53      0.64        17

    accuracy                           0.99     

In [84]:
# Fitting the Model to actual test data - C=10 kernel= linear
t_test_predict_all = classifier.predict(t_test_x_all)
tgt = ['normal','dos','probe','r2l','u2r']
print("------------all Parameters: C=10,kernel = 'linear'------------")
print(classification_report(t_test_y_all, t_test_predict_all,target_names=tgt))

------------all Parameters: C=10,kernel = 'linear'------------
              precision    recall  f1-score   support

      normal       0.77      0.96      0.85     10004
         dos       0.95      0.89      0.92      7166
       probe       0.67      0.82      0.74      2421
         r2l       0.92      0.10      0.18      2885
         u2r       0.59      0.30      0.40        67

    accuracy                           0.81     22543
   macro avg       0.78      0.61      0.62     22543
weighted avg       0.83      0.81      0.77     22543



# Conclusion:

**_Best Algo_:** SVC<br>
**_Best Params_:** C=10;kernel=linear<br>
**_Best predictive variables_:** 'protocol_udp','dst_host_srv_diff_host_rate','serror_rate','src_bytes','dst_host_count','srv_diff_host_rate','dst_host_diff_srv_rate','srv_count','flag_S0','flag_RSTR','dst_bytes','logged_in','dst_host_same_src_port_rate','rerror_rate','last_flag','root_shell','protocol_tcp','is_guest_login','hot','flag_REJ','wrong_fragment','flag_SF','count','diff_srv_rate'

## GridSearch CV

I ran the GridSearchCV once. It took me over 12 hours to run the below piece of GridSearch. Hence I have not ran it again and have instead taken an approximation for trial and error above

In [39]:
classifier = SVC(kernel = 'rbf', random_state = 42)
classifier.fit(train_X, train_y)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [40]:
train_predict = classifier.predict(train_X)
print("------------Parameters: C=1.0,kernel = 'rbf'------------")
print(classification_report(train_y, train_predict))

------------Parameters: C=1.0,kernel = 'rbf'------------
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     47260
           1       0.99      0.99      0.99     32102
           2       0.98      0.89      0.93      8116
           3       0.88      0.72      0.79       668
           4       0.00      0.00      0.00        35

    accuracy                           0.98     88181
   macro avg       0.76      0.72      0.74     88181
weighted avg       0.98      0.98      0.98     88181



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
test_predict = classifier.predict(test_X)
print("------------Test prediction: Parameters: C=1.0,kernel = 'rbf'------------")
print(classification_report(test_y, test_predict))

------------Test prediction: Parameters: C=1.0,kernel = 'rbf'------------
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     20083
           1       0.99      0.99      0.99     13825
           2       0.98      0.89      0.93      3540
           3       0.89      0.74      0.81       327
           4       0.00      0.00      0.00        17

    accuracy                           0.98     37792
   macro avg       0.76      0.72      0.74     37792
weighted avg       0.98      0.98      0.98     37792



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [44]:
# Running GridSearchCV to determine the best tuning parameter for the SVM classifier
parameters = [{'C': [10, 50, 100], 'kernel': ['linear']}]
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 5,
                           n_jobs = -1)

In [None]:
grid_search = grid_search.fit(train_X, train_y)

In [29]:
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 100.00 %
Best Parameters: {'C': 10, 'kernel': 'linear'}
