In [1]:
import os
import random
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.model_selection import cross_val_score
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Directory (relative to the notebook's working directory) holding the saved arrays.
DATA_DIR = '../local_data/features/'

# Feature array, used as-is by the evaluation cells below.
X = np.load(os.path.join(DATA_DIR, 'features.npy'))

# Label array: only the entry at index 0 along the first axis is kept as the target.
# NOTE(review): presumably labels.npy is stored with a leading singleton axis -- confirm.
y = np.load(os.path.join(DATA_DIR, 'labels.npy'))
y = y[0]

In [2]:
def classify(clf, n_fold=False, test_size=0.25, cv=3, random_state=None):
    """Evaluate ``clf`` on the notebook-level ``X`` / ``y`` arrays and print the result.

    Two evaluation modes are available:

    * hold-out (default): split ``X`` / ``y`` once, fit ``clf`` on the training
      part, predict the test part, then print the accuracy (in percent) and the
      per-class classification report.
    * cross-validation (``n_fold`` truthy): score ``clf`` with
      ``cross_val_score`` and print the mean accuracy together with a
      +/- two-standard-deviation spread, both expressed in percent.

    Parameters
    ----------
    clf : estimator
        Object handed to ``cross_val_score`` or used through ``fit`` / ``predict``.
        In hold-out mode it is fitted in place.
    n_fold : bool, default False
        Select cross-validation mode when truthy, hold-out mode otherwise.
    test_size : float or int, default 0.25
        Forwarded to ``train_test_split`` (hold-out mode only).
    cv : int or splitter, default 3
        Forwarded to ``cross_val_score`` (cross-validation mode only).
    random_state : int or None, default None
        Forwarded to ``train_test_split``. Pass a fixed integer to score
        different classifiers on the same test rows; with ``None`` every call
        draws a fresh split.

    Returns
    -------
    None
        Results are printed only. Nothing is returned so that a cell ending in
        ``classify(...)`` does not display an extra output value.
    """
    if n_fold:
        scores = cross_val_score(clf, X, y, cv=cv)
        # mean * 100 -> percent; std * 200 -> two standard deviations, in percent.
        print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean()*100, scores.std() * 200))
    else:
        # Imported here (not at module level) so this cell stays self-contained.
        from sklearn.metrics import accuracy_score, classification_report

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        # Fit the model
        clf.fit(X_train, y_train)

        # Perform the predictions
        y_predicted = clf.predict(X_test)

        print("Accuracy = {} %".format(accuracy_score(y_test, y_predicted)*100))
        print("Classification Report \n{}".format(classification_report(y_test, y_predicted)))

In [3]:
%%time
## Decision tree capped at a maximum depth of 3
# One hold-out evaluation first, then cross-validation using the same estimator object.
clf = tree.DecisionTreeClassifier(max_depth=3)
classify(clf, n_fold=False)
print("3 Fold CV")
classify(clf, n_fold=True)

Accuracy = 98.18927816340519 %


  'precision', 'predicted', average, warn_for)


Classification Report 
                 precision    recall  f1-score   support

         Normal       0.99      0.99      0.99    500738
    brute_force       0.00      0.00      0.00       550
           ddos       0.00      0.00      0.00       908
       internal       0.49      0.59      0.54      5032
irc_botnet_ddos       0.92      0.90      0.91      9380
ssh_brute_force       0.00      0.00      0.00      1307

    avg / total       0.98      0.98      0.98    517915

3 Fold CV
Accuracy: 97.8036 (+/- 0.6133)
CPU times: user 30 s, sys: 1.86 s, total: 31.9 s
Wall time: 31.9 s


In [4]:
%%time
## Decision tree with no max_depth argument supplied
# Same two-step evaluation as the depth-limited tree: hold-out, then cross-validation.

clf = tree.DecisionTreeClassifier()
classify(clf, n_fold=False)
print("3 Fold CV")
classify(clf, n_fold=True)

Accuracy = 99.37634553932595 %
Classification Report 
                 precision    recall  f1-score   support

         Normal       1.00      1.00      1.00    500741
    brute_force       0.99      0.98      0.98       506
           ddos       0.90      0.51      0.65       963
       internal       0.82      0.73      0.77      5065
irc_botnet_ddos       0.96      0.95      0.96      9357
ssh_brute_force       0.98      0.98      0.98      1283

    avg / total       0.99      0.99      0.99    517915

3 Fold CV
Accuracy: 98.7920 (+/- 0.4264)
CPU times: user 1min 14s, sys: 1.89 s, total: 1min 15s
Wall time: 1min 15s


In [5]:
%%time
## Random forests: sweep over 5, 10, 25 and 100 trees
# Each forest size gets a hold-out evaluation followed by cross-validation.

num_trees = [5, 10, 25, 100]
for n in num_trees:
    print("N = {} trees".format(n))
    # A fresh forest is built for every size; n_jobs=-1 is passed through unchanged.
    clf = RandomForestClassifier(n_jobs=-1, n_estimators=n)
    classify(clf, n_fold=False)
    print("3 Fold CV")
    classify(clf, n_fold=True)

N = 5 trees
Accuracy = 99.38291032312252 %
Classification Report 
                 precision    recall  f1-score   support

         Normal       1.00      1.00      1.00    500685
    brute_force       0.98      0.99      0.99       564
           ddos       0.93      0.50      0.65       925
       internal       0.89      0.67      0.76      5110
irc_botnet_ddos       0.96      0.95      0.96      9322
ssh_brute_force       0.99      0.97      0.98      1309

    avg / total       0.99      0.99      0.99    517915

3 Fold CV
Accuracy: 98.8497 (+/- 0.4202)
N = 10 trees
Accuracy = 99.41785814274543 %
Classification Report 
                 precision    recall  f1-score   support

         Normal       1.00      1.00      1.00    500734
    brute_force       0.98      0.99      0.98       528
           ddos       0.93      0.51      0.66       939
       internal       0.83      0.74      0.78      5069
irc_botnet_ddos       0.97      0.96      0.96      9327
ssh_brute_force       0.

In [6]:
%%time
## Multinomial Naive Bayes with default settings
# Evaluated the same way as the tree-based models: hold-out, then cross-validation.

clf = MultinomialNB()
classify(clf, n_fold=False)
print("3 Fold CV")
classify(clf, n_fold=True)

Accuracy = 16.728613768668605 %
Classification Report 
                 precision    recall  f1-score   support

         Normal       1.00      0.14      0.25    500640
    brute_force       0.00      0.95      0.01       520
           ddos       0.00      0.00      0.00       901
       internal       0.06      0.89      0.11      5113
irc_botnet_ddos       0.09      0.99      0.17      9422
ssh_brute_force       0.01      0.29      0.01      1319

    avg / total       0.97      0.17      0.25    517915

3 Fold CV
Accuracy: 25.8579 (+/- 14.3068)
CPU times: user 28 s, sys: 2.44 s, total: 30.5 s
Wall time: 30.5 s
