In [7]:
import os
import random
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

DATA_DIR = '../local_data/features/'

In [4]:
%%time
## Decision Tree Classifier w/ Max Depth of 3

X = np.load(os.path.join(DATA_DIR, 'features.npy'))
y = np.load(os.path.join(DATA_DIR, 'labels.npy'))[0]

clf = tree.DecisionTreeClassifier(max_depth=3)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Fit the model
clf.fit(X_train, y_train)

# Perform the predictions
y_predicted = clf.predict(X_test)

from sklearn.metrics import accuracy_score
print("Accuracy = {} %".format(accuracy_score(y_test, y_predicted)*100))

from sklearn.metrics import classification_report
print("Classification Report \n{}".format(classification_report(y_test, y_predicted)))

Accuracy = 98.1848372802487 %
Classification Report 
                 precision    recall  f1-score   support

         Normal       0.99      0.99      0.99    500751
    brute_force       0.00      0.00      0.00       548
           ddos       0.00      0.00      0.00       900
       internal       0.50      0.60      0.54      5101
irc_botnet_ddos       0.92      0.89      0.91      9279
ssh_brute_force       0.00      0.00      0.00      1336

    avg / total       0.98      0.98      0.98    517915

CPU times: user 4.92 s, sys: 448 ms, total: 5.36 s
Wall time: 5.37 s


  'precision', 'predicted', average, warn_for)


In [5]:
%%time
## Decision Tree Classifier w/ no Max Depth

X = np.load(os.path.join(DATA_DIR, 'features.npy'))
y = np.load(os.path.join(DATA_DIR, 'labels.npy'))[0]

clf = tree.DecisionTreeClassifier()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Fit the model
clf.fit(X_train, y_train)

# Perform the predictions
y_predicted = clf.predict(X_test)

from sklearn.metrics import accuracy_score
print("Accuracy = {} %".format(accuracy_score(y_test, y_predicted)*100))

from sklearn.metrics import classification_report
print("Classification Report \n{}".format(classification_report(y_test, y_predicted)))

Accuracy = 99.37306314742767 %
Classification Report 
                 precision    recall  f1-score   support

         Normal       1.00      1.00      1.00    500588
    brute_force       0.97      0.99      0.98       564
           ddos       0.93      0.53      0.68       974
       internal       0.83      0.72      0.77      5085
irc_botnet_ddos       0.96      0.95      0.96      9393
ssh_brute_force       0.98      0.98      0.98      1311

    avg / total       0.99      0.99      0.99    517915

CPU times: user 10.8 s, sys: 488 ms, total: 11.3 s
Wall time: 11.3 s


In [6]:
%%time
## Random Forests 5,10,25,100 Trees Classifier

X = np.load(os.path.join(DATA_DIR, 'features.npy'))
y = np.load(os.path.join(DATA_DIR, 'labels.npy'))[0]

n = 0
num_trees = [5, 10, 25, 100]
while n < 4:
    print("N = ", n)
    clf = RandomForestClassifier(n_estimators=num_trees[n], n_jobs=-1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    # Fit the model
    clf.fit(X_train, y_train)

    # Perform the predictions
    y_predicted = clf.predict(X_test)

    from sklearn.metrics import accuracy_score
    print("Accuracy = {} %".format(accuracy_score(y_test, y_predicted)*100))

    from sklearn.metrics import classification_report
    print("Classification Report \n{}".format(classification_report(y_test, y_predicted)))
    
    n = n+1

N =  0
Accuracy = 99.39758454572662 %
Classification Report 
                 precision    recall  f1-score   support

         Normal       1.00      1.00      1.00    500537
    brute_force       1.00      0.98      0.99       517
           ddos       0.93      0.52      0.67       934
       internal       0.84      0.72      0.78      5186
irc_botnet_ddos       0.96      0.96      0.96      9461
ssh_brute_force       0.99      0.97      0.98      1280

    avg / total       0.99      0.99      0.99    517915

N =  1
Accuracy = 99.41496191459989 %
Classification Report 
                 precision    recall  f1-score   support

         Normal       1.00      1.00      1.00    500956
    brute_force       0.98      0.99      0.98       525
           ddos       0.94      0.49      0.65       938
       internal       0.83      0.73      0.78      5025
irc_botnet_ddos       0.97      0.95      0.96      9191
ssh_brute_force       1.00      0.97      0.99      1280

    avg / total   

In [14]:
%%time
## Naive Bayes

X = np.load(os.path.join(DATA_DIR, 'features.npy'))
y = np.load(os.path.join(DATA_DIR, 'labels.npy'))[0]

clf = GaussianNB()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Fit the model
clf.fit(X_train, y_train)

# Perform the predictions
y_predicted = clf.predict(X_test)

from sklearn.metrics import accuracy_score
print("Accuracy = {} %".format(accuracy_score(y_test, y_predicted)*100))

from sklearn.metrics import classification_report
print("Classification Report \n{}".format(classification_report(y_test, y_predicted)))

Accuracy = 5.9252966220325725 %
Classification Report 
                 precision    recall  f1-score   support

         Normal       1.00      0.04      0.08    500710
    brute_force       0.46      0.98      0.62       518
           ddos       0.00      0.00      0.00       969
       internal       0.00      0.00      0.00      5086
irc_botnet_ddos       0.36      0.99      0.53      9292
ssh_brute_force       0.00      0.82      0.01      1340

    avg / total       0.97      0.06      0.08    517915

CPU times: user 3.6 s, sys: 188 ms, total: 3.79 s
Wall time: 3.79 s
