In [1]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
import joblib

In [2]:
# Read the semicolon-separated dataset; the first row of the file holds the column names.
DATASET_PATH = "datasets/Colcom2018CleanedScaled.csv"
data = pd.read_csv(DATASET_PATH, sep=";", header=0)

In [3]:
# Dimensions of the loaded frame as (rows, columns).
data.shape




(7832, 11)

In [4]:
# Display the frame; pandas abbreviates a long frame to its leading and trailing rows.
data


Unnamed: 0,tcp_packets,dist_port_tcp,external_ips,vulume_bytes,udp_packets,source_app_packets,remote_app_packets,source_app_bytes,remote_app_bytes,dns_query_times,type
0,0.127907,6.0,0.333333,0.036782,0.0,0.100000,0.105882,0.041652,0.029865,0.00,benign
1,1.069767,0.0,2.333333,1.786894,0.0,1.088889,0.976471,0.897275,1.787684,2.00,benign
2,1.988372,0.0,1.333333,1.843764,0.0,1.944444,2.235294,6.465984,1.831939,1.50,benign
3,-0.220930,0.0,-0.333333,-0.233015,0.0,-0.255556,-0.211765,-0.131552,-0.245310,-0.50,benign
4,-0.220930,0.0,-0.333333,-0.233640,0.0,-0.255556,-0.211765,-0.131552,-0.245919,-0.50,benign
...,...,...,...,...,...,...,...,...,...,...,...
7827,-0.290698,0.0,-0.666667,-0.312383,0.0,-0.311111,-0.258824,-0.154290,-0.317647,-0.25,malicious
7828,-0.244186,4.0,-0.333333,-0.285957,0.0,-0.277778,-0.270588,-0.161208,-0.296868,-0.50,malicious
7829,-0.290698,0.0,-0.666667,-0.312383,0.0,-0.311111,-0.258824,-0.154290,-0.317647,-0.25,malicious
7830,-0.290698,0.0,-0.666667,-0.312383,0.0,-0.311111,-0.258824,-0.154290,-0.317647,-0.25,malicious


In [5]:
# Separate features from the label by column *name* instead of by position, so
# that a reordered or extended CSV cannot silently drop a feature or leak the
# label into the feature matrix.
TARGET_COLUMN = "type"
features = data.drop(columns=[TARGET_COLUMN])
labels = data[TARGET_COLUMN]

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.25, random_state=45
)

In [6]:
# Shape of the held-out feature matrix as (rows, columns).
X_test.shape


(1958, 10)

In [7]:
# Shape of the held-out label vector; its length should match the row count of X_test.
y_test.shape

(1958,)

## Random forest

In [8]:
# Random forest baseline: 250 trees, depth capped at 50, seeded for reproducibility.
rdF = RandomForestClassifier(n_estimators=250, max_depth=50, random_state=45)
rdF.fit(X_train, y_train)

# A single predict() pass is enough: the hard labels feed both the overall
# accuracy and the per-class report below.
pred = rdF.predict(X_test)

accuracy = accuracy_score(y_test, pred)
print(rdF)
print(accuracy)
print(classification_report(y_test, pred))

RandomForestClassifier(max_depth=50, n_estimators=250, random_state=45)
0.9172625127681308
              precision    recall  f1-score   support

      benign       0.93      0.94      0.93      1190
   malicious       0.90      0.88      0.89       768

    accuracy                           0.92      1958
   macro avg       0.91      0.91      0.91      1958
weighted avg       0.92      0.92      0.92      1958



In [9]:
# Optional: persist the fitted random forest to disk. Left disabled so that
# re-running the notebook does not write a model file as a side effect.
#joblib.dump(rdF, 'rfDefense2021.sav')

## Naive Bayes

In [15]:
# Gaussian Naive Bayes baseline: fit on the training split, predict the held-out rows.
gnb = GaussianNB().fit(X_train, y_train)
pred = gnb.predict(X_test)

# Emit the model repr, the overall accuracy, and the per-class report, one per line.
accuracy = accuracy_score(y_test, pred)
print(gnb, accuracy, classification_report(y_test, pred), sep="\n")

GaussianNB()
0.44688457609805926
              precision    recall  f1-score   support

      benign       0.81      0.12      0.20      1190
   malicious       0.41      0.96      0.58       768

    accuracy                           0.45      1958
   macro avg       0.61      0.54      0.39      1958
weighted avg       0.66      0.45      0.35      1958



## Decision tree

In [16]:
# Decision tree baseline. The seed is fixed (same value as the split and the
# random forest) because scikit-learn permutes the candidate features at every
# split even with splitter="best", so an unseeded tree can change between runs.
clf = tree.DecisionTreeClassifier(random_state=45)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

# Overall accuracy plus per-class precision/recall on the held-out split.
accuracy = accuracy_score(y_test, pred)
print(clf)
print(accuracy)
print(classification_report(y_test, pred, labels=None))

DecisionTreeClassifier()
0.8774259448416751
              precision    recall  f1-score   support

      benign       0.90      0.90      0.90      1190
   malicious       0.85      0.84      0.84       768

    accuracy                           0.88      1958
   macro avg       0.87      0.87      0.87      1958
weighted avg       0.88      0.88      0.88      1958



## K-nearest neighbors

In [18]:
# Sweep several neighbourhood sizes. Everything is keyed by the actual k so a
# variable name can never disagree with the n_neighbors it was built with.
K_VALUES = (2, 3, 4, 6, 12)
REPORT_K = 4  # only this k gets the full per-class report

# fit
knn_models = {
    k: KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    for k in K_VALUES
}
# prediction
knn_preds = {k: model.predict(X_test) for k, model in knn_models.items()}
# accuracy
knn_accuracy = {k: accuracy_score(y_test, y_hat) for k, y_hat in knn_preds.items()}

for k in K_VALUES:
    print(f"with {k} neighbors accuracy: ")
    print(knn_accuracy[k])
    if k == REPORT_K:
        print(classification_report(y_test, knn_preds[k]))

with 2 neighbors accuracy: 
0.8912155260469867
with 3 neighbors accuracy: 
0.8871297242083759
with 4 neighbors accuracy: 
0.8922369765066395
              precision    recall  f1-score   support

      benign       0.89      0.93      0.91      1190
   malicious       0.89      0.83      0.86       768

    accuracy                           0.89      1958
   macro avg       0.89      0.88      0.89      1958
weighted avg       0.89      0.89      0.89      1958

with 6 neighbors accuracy: 
0.8784473953013279
with 12 neighbors accuracy: 
0.8615934627170582


## Support vector machine

In [19]:
# RBF-kernel support vector classifier with gamma="scale": fit, then predict the held-out rows.
Sv = svm.SVC(kernel="rbf", gamma="scale").fit(X_train, y_train)
pred = Sv.predict(X_test)

# Model repr, overall accuracy, and per-class report, one per line.
accuracy = accuracy_score(y_test, pred)
print(Sv, accuracy, classification_report(y_test, pred), sep="\n")

SVC()
0.627170582226762
              precision    recall  f1-score   support

      benign       0.62      1.00      0.76      1190
   malicious       0.90      0.06      0.11       768

    accuracy                           0.63      1958
   macro avg       0.76      0.53      0.43      1958
weighted avg       0.73      0.63      0.51      1958



## Logistic regression

In [20]:
# Logistic regression baseline with the solver's iteration cap set to 1000.
logisticRegr = LogisticRegression(max_iter=1000)
pred = logisticRegr.fit(X_train, y_train).predict(X_test)

# Model repr, overall accuracy, and per-class report, one per line.
accuracy = accuracy_score(y_test, pred)
print(logisticRegr, accuracy, classification_report(y_test, pred), sep="\n")

LogisticRegression(max_iter=1000)
0.7063329928498467
              precision    recall  f1-score   support

      benign       0.72      0.86      0.78      1190
   malicious       0.68      0.47      0.56       768

    accuracy                           0.71      1958
   macro avg       0.70      0.66      0.67      1958
weighted avg       0.70      0.71      0.69      1958

