# Naive Bayes Classifier 
# Data preparation
## Reading .csv file into Data Frame

In [1]:
from sklearn import preprocessing, metrics
from sklearn.model_selection import train_test_split,KFold, cross_val_score,cross_val_predict,RandomizedSearchCV,GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import export_graphviz
from statistics import mean
from subprocess import call
from IPython.display import Image

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import time

# Turn off pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None

# Labels applied to the CSV columns; with header=0 they replace the header
# row that is already present in the file.
col_names = [
    'Time', 'Source', 'Destination', 'Protocol',
    'Length', 'Info', 'src_port', 'dst_port',
]
data_frame = pd.read_csv(
    "Samples/smallTraffic-_backup.csv",
    names=col_names,
    header=0,
    encoding="ISO-8859-1",
)

# Preview the first 20 rows of the loaded capture.
data_frame.head(20)

Unnamed: 0,Time,Source,Destination,Protocol,Length,Info,src_port,dst_port
1,0.0,192.168.3.131,72.14.213.138,HTTP,997,GET /complete/search?client=chrome&hl=en-US&q=...,57011.0,80.0
2,0.029841,72.14.213.138,192.168.3.131,HTTP,440,HTTP/1.1 200 OK (text/javascript),80.0,57011.0
3,0.223883,192.168.3.131,72.14.213.102,TCP,66,55950 > 80 [SYN] Seq=0 Win=8192 Len=0 MSS=14...,55950.0,80.0
4,0.229423,192.168.3.131,72.14.213.138,TCP,54,57011 > 80 [ACK] Seq=944 Ack=387 Win=16192 L...,57011.0,80.0
5,0.242649,72.14.213.102,192.168.3.131,TCP,66,"80 > 55950 [SYN, ACK] Seq=0 Ack=1 Win=5720 L...",80.0,55950.0
6,0.242696,192.168.3.131,72.14.213.102,TCP,54,55950 > 80 [ACK] Seq=1 Ack=1 Win=65780 Len=0,55950.0,80.0
7,0.24305,192.168.3.131,72.14.213.102,HTTP,998,GET /complete/search?client=chrome&hl=en-US&q=...,55950.0,80.0
8,0.262981,192.168.3.131,72.14.213.102,TCP,54,"55950 > 80 [FIN, ACK] Seq=945 Ack=1 Win=6578...",55950.0,80.0
9,0.264074,72.14.213.102,192.168.3.131,TCP,60,80 > 55950 [ACK] Seq=1 Ack=945 Win=7616 Len=0,80.0,55950.0
10,0.276671,72.14.213.102,192.168.3.131,HTTP,541,HTTP/1.1 200 OK (text/javascript),80.0,55950.0


# Data cleaning
## Set the source and destination ports of ICMP packets to -1, because these values are NaN for ICMP

In [5]:
# Replace the ports of every ICMP row with the sentinel value -1.
icmp = ['ICMP']

# Boolean mask selecting the rows whose protocol is listed in `icmp`.
is_icmp = data_frame['Protocol'].isin(icmp)

# Write the sentinel with a single .loc assignment on the frame itself.
# The chained form (data_frame.src_port[mask] = -1) assigns through an
# intermediate Series, which is not guaranteed to reach data_frame and never
# does when pandas Copy-on-Write is active.
data_frame.loc[is_icmp, ['src_port', 'dst_port']] = -1 #0

# Show the first rows that now carry the sentinel in both port columns.
data_frame.loc[(data_frame['src_port'] == -1) & (data_frame['dst_port'] == -1)].head(10)

Unnamed: 0,Time,Source,Destination,Protocol,Length,Info,src_port,dst_port
5960,116.676994,10.0.2.2,10.0.2.15,ICMP,202,Time-to-live exceeded (Time to live exceeded i...,-1.0,-1.0
5961,116.67706,10.0.2.2,10.0.2.15,ICMP,203,Time-to-live exceeded (Time to live exceeded i...,-1.0,-1.0
6035,117.265243,10.0.2.2,10.0.2.15,ICMP,202,Time-to-live exceeded (Time to live exceeded i...,-1.0,-1.0
6036,117.265322,10.0.2.2,10.0.2.15,ICMP,203,Time-to-live exceeded (Time to live exceeded i...,-1.0,-1.0
6133,117.812397,10.0.2.2,10.0.2.15,ICMP,202,Time-to-live exceeded (Time to live exceeded i...,-1.0,-1.0
6134,117.812489,10.0.2.2,10.0.2.15,ICMP,203,Time-to-live exceeded (Time to live exceeded i...,-1.0,-1.0
6142,118.248598,10.0.2.2,10.0.2.15,ICMP,202,Time-to-live exceeded (Time to live exceeded i...,-1.0,-1.0
6143,118.248695,10.0.2.2,10.0.2.15,ICMP,203,Time-to-live exceeded (Time to live exceeded i...,-1.0,-1.0
6164,118.702913,10.0.2.2,10.0.2.15,ICMP,202,Time-to-live exceeded (Time to live exceeded i...,-1.0,-1.0
6165,118.702988,10.0.2.2,10.0.2.15,ICMP,203,Time-to-live exceeded (Time to live exceeded i...,-1.0,-1.0


# Each ICMP packet has been successfully changed

In [6]:
# Sanity check: list ICMP rows in which at least one port still differs from
# the -1 sentinel. An empty result means every ICMP packet was updated.
# The two port tests are combined with "or" so that a row where only one of
# the two ports was replaced is reported as well; requiring both ports to
# differ would let such a row slip through unnoticed.
is_icmp_row = data_frame['Protocol'] == "ICMP"
port_not_replaced = (data_frame['src_port'] != -1) | (data_frame['dst_port'] != -1)
data_frame.loc[is_icmp_row & port_not_replaced]

Unnamed: 0,Time,Source,Destination,Protocol,Length,Info,src_port,dst_port


# Data encoding
## We simplify our data from complex types into simple numbers
## Example:  "HTTP" -> 3
## This operation will speed up our algorithms

In [7]:
# A single LabelEncoder is re-fitted for each column in turn, so once this
# cell has run it only holds the classes of the column fitted last
# ('Protocol').
label_enc = preprocessing.LabelEncoder()

# Encode the five feature columns in the order Source, Destination, src_port,
# dst_port, Length. Each fit_transform call returns a fresh integer array.
# NOTE(review): Source and Destination are fitted independently, so the same
# address may receive different codes in the two columns - confirm intended.
# NOTE(review): the port and Length columns are already numeric; label
# encoding replaces each value by an integer class code - confirm intended.
src_encoded, dst_encoded, src_port_encoded, dst_port_encoded, len_encoded = (
    label_enc.fit_transform(data_frame[column])
    for column in ('Source', 'Destination', 'src_port', 'dst_port', 'Length')
)

# Target: the protocol name as an integer class code (fitted last on purpose,
# see the comment at the top of the cell).
protocol_encoded = label_enc.fit_transform(data_frame['Protocol'])

# One tuple of five encoded values per packet.
feature_columns = (
    src_encoded,
    dst_encoded,
    src_port_encoded,
    dst_port_encoded,
    len_encoded,
)
features = list(zip(*feature_columns))

# Naive Bayes - Traditional training

In [18]:
# Train GaussianNB on a 70/30 hold-out split and report test-set metrics.
start_time = time.time()

# A fixed random_state makes the split, and therefore every metric printed
# below, reproducible across kernel restarts. Without it each run draws a
# different test set and the numbers cannot be compared between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features, protocol_encoded, test_size=0.3, random_state=42
)

gnb = GaussianNB()
gnb.fit(X_train, y_train)

y_pred = gnb.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Balanced_accuracy:",metrics.balanced_accuracy_score(y_test, y_pred))
# NOTE(review): zero_division=1 scores a class that is never predicted as
# precision 1, which can raise the weighted average - confirm this is wanted.
print("Precision:",metrics.precision_score(y_test,y_pred,average='weighted',zero_division=1))
print("F-measure:",metrics.f1_score(y_test,y_pred,average='weighted',zero_division=1))
# NOTE(review): the targets are label-encoder class codes, so the absolute
# difference between two codes has no natural meaning - consider dropping.
print("Mean_absolute_error:",metrics.mean_absolute_error(y_test,y_pred))

# Elapsed wall-clock time for split, fit, predict and metric computation.
print("\n\nTime: %s seconds" % (time.time() - start_time))

Accuracy: 0.7618602477214302
Balanced_accuracy: 0.6443705548581198
Precision: 0.8160229027192167
F-measure: 0.675992722194538
Mean_absolute_error: 1.460855340032718


Time: 0.03699970245361328 seconds


# Naive Bayes - Cross validation training

In [17]:
# Evaluate GaussianNB with 10-fold cross validation over the full data set.
start_time = time.time()

X = features
y = protocol_encoded

# Ten contiguous folds in row order (no shuffling).
# NOTE(review): if the rows are in capture order, each fold is a contiguous
# slice of the capture - confirm this is the intended evaluation scheme.
k_fold = KFold(n_splits=10)

# The estimator is passed unfitted: cross_val_score and cross_val_predict fit
# their own clones on each training fold, so fitting gaussClf on the whole
# data set beforehand would not influence any number below and would only
# add to the measured time.
gaussClf = GaussianNB()

# Per-fold accuracy, and out-of-fold predictions for the aggregate metrics.
our_cross_valid_score = cross_val_score(gaussClf, X, y, cv=k_fold, n_jobs=1,scoring='accuracy')
y_predict = cross_val_predict(gaussClf, X, y, cv=k_fold, n_jobs=1)

print("NAIVE BAYES CLASSIFIER")
print(our_cross_valid_score)
print("\nmean : ",mean(our_cross_valid_score))
print("accuracy:",metrics.accuracy_score(y, y_predict))
print("balanced_accuracy:",metrics.balanced_accuracy_score(y, y_predict))
print("standard deviation:",np.std(our_cross_valid_score))
print("precision:",metrics.precision_score(y,y_predict,average='weighted',zero_division=1))
print("f-measure:",metrics.f1_score(y,y_predict,average='weighted',zero_division=1))
# NOTE(review): the targets are label-encoder class codes, so the absolute
# difference between two codes has no natural meaning - consider dropping.
print("mean_absolute_error:",metrics.mean_absolute_error(y,y_predict))

# Elapsed wall-clock time for both cross-validation passes and the metrics.
print("\n\nTime: %s seconds" % (time.time() - start_time))

NAIVE BAYES CLASSIFIER
[0.6923616  0.87377279 0.83099579 0.63744741 0.62552595 0.76367461
 0.83730715 0.84221599 0.79242637 0.63604488]

mean :  0.7531772537448977
accuracy: 0.7531729892714396
balanced_accuracy: 0.4862839659738398
standard deviation: 0.09184891025833479
precision: 0.6883585113304798
f-measure: 0.668959762141944
mean_absolute_error: 1.502068578641049


Time: 0.5084240436553955 seconds
