# Purpose

Attempt to perform an adaboost classifier on the cves/metasploit dataset

In [44]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, fbeta_score, classification_report
from sklearn.grid_search import GridSearchCV

from sklearn.metrics import fbeta_score, make_scorer


%matplotlib inline


In [3]:
CVES_METASPLOIT_ENCODED = '../../data/processed/cves_metasplot_encoded.json'

In [4]:
cves_df = pd.read_json(CVES_METASPLOIT_ENCODED)
cves_df.head()

Unnamed: 0,access_ADJACENT_NETWORK,access_LOCAL,access_NETWORK,access_PHYSICAL,auth_HIGH,auth_LOW,auth_MULTIPLE,auth_NONE,auth_SINGLE,avail_COMPLETE,...,conf_LOW,conf_NONE,conf_PARTIAL,id,integ_COMPLETE,integ_HIGH,integ_LOW,integ_NONE,integ_PARTIAL,metasploit
0,0,0,1,0,0,0,0,1,0,0,...,0,1,0,CVE-1999-0001,0,0,0,1,0,0
1,0,0,1,0,0,0,0,1,0,1,...,0,0,0,CVE-1999-0002,1,0,0,0,0,0
2,0,0,1,0,0,0,0,1,0,1,...,0,0,0,CVE-1999-0003,1,0,0,0,0,0
3,0,0,1,0,0,0,0,1,0,0,...,0,1,0,CVE-1999-0004,0,0,0,1,0,0
4,0,0,1,0,0,0,0,1,0,1,...,0,0,0,CVE-1999-0005,1,0,0,0,0,0


In [5]:
X = cves_df.drop(['metasploit', 'id'], axis=1)
y = cves_df['metasploit']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [14]:
y_train.mean()

0.01563269006605362

In [15]:
y_test.mean()

0.01670746396238878

In [26]:
model = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth=20), n_estimators = 100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

display(pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True))
print("Training Accuracy:", model.score(X_train, y_train))
print("Test Accuracy    :", model.score(X_test, y_test))
print("F10 Score        :", fbeta_score(y_test, y_pred, 10))

Predicted,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,25306,1,25307
1,430,0,430
All,25736,1,25737


Training Accuracy: 0.9843673099339464
Test Accuracy    : 0.9832536814702568
F10 Score        : 0.0


Fighting an imbalance problem here, going to keep above for notes and keep experimenting below

In [41]:
n_estimators = [x for x in range(1, 101)]

param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
              "base_estimator__splitter" :   ["best", "random"],
              "n_estimators": n_estimators
             }

model = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(
    random_state = 11, 
    max_features = "auto", 
    class_weight = "balanced",
    max_depth = None))

# run grid search
grid_search_model = GridSearchCV(model, param_grid=param_grid, scoring = 'roc_auc', verbose=1)

grid_search_model.fit(X_train, y_train)
print(grid_search_model)
# summarize the results of the grid search
print(grid_search_model.best_score_)

Fitting 3 folds for each of 400 candidates, totalling 1200 fits


[Parallel(n_jobs=1)]: Done 1200 out of 1200 | elapsed: 27.1min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=11,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'base_estimator__criterion': ['gini', 'entropy'], 'base_estimator__splitter': ['best', 'random'], 'n_estimators': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46...6, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]}

AttributeError: 'AdaBoostClassifier' object has no attribute 'alpha'

In [42]:
model = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=11,
            splitter='best'))

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

display(pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True))
print("Training Accuracy:", model.score(X_train, y_train))
print("Test Accuracy    :", model.score(X_test, y_test))
print("F10 Score        :", fbeta_score(y_test, y_pred, 10))



Predicted,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,16150,9157,25307
1,108,322,430
All,16258,9479,25737


Training Accuracy: 0.6380391141043906
Test Accuracy    : 0.6400124334615535
F10 Score        : 0.6197145524876617


In [45]:
n_estimators = [x for x in range(1, 101)]

param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
              "base_estimator__splitter" :   ["best", "random"],
              "n_estimators": n_estimators
             }


model = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(
    random_state = 11, 
    max_features = "auto", 
    class_weight = "balanced",
    max_depth = None))

f10_scorer = make_scorer(fbeta_score, beta=10)
grid_search_model = GridSearchCV(model, param_grid=param_grid, scoring = f10_scorer, verbose=1)

grid_search_model.fit(X_train, y_train)
print(grid_search_model)
# summarize the results of the grid search
print(grid_search_model.best_score_)

Fitting 3 folds for each of 400 candidates, totalling 1200 fits


[Parallel(n_jobs=1)]: Done 1200 out of 1200 | elapsed: 28.1min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=11,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'base_estimator__criterion': ['gini', 'entropy'], 'base_estimator__splitter': ['best', 'random'], 'n_estimators': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46...6, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]}

In [46]:
model = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=11,
            splitter='best'))

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

display(pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True))
print("Training Accuracy:", model.score(X_train, y_train))
print("Test Accuracy    :", model.score(X_test, y_test))
print("F10 Score        :", fbeta_score(y_test, y_pred, 10))


Predicted,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,16150,9157,25307
1,108,322,430
All,16258,9479,25737


Training Accuracy: 0.6380391141043906
Test Accuracy    : 0.6400124334615535
F10 Score        : 0.6197145524876617
