# Purpose

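Attempt to train a classifier on the CVEs/Metasploit dataset. Note: although the original intent was an AdaBoost classifier, the cells below fit a support vector machine (`SVC`) tuned with a grid search.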

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
import numpy as np

from sklearn.model_selection import cross_val_score  
from sklearn.cross_validation import train_test_split

from sklearn.metrics import confusion_matrix, fbeta_score, classification_report
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import fbeta_score, make_scorer


%matplotlib inline




In [2]:
# Relative path to the encoded CVE/Metasploit dataset (JSON) loaded by the next cell.
# NOTE(review): the file name is spelled "metasplot" (no "i") — confirm it matches the file on disk.
CVES_METASPLOIT_ENCODED = '../../data/processed/cves_metasplot_encoded.json'

In [3]:
# Read the encoded dataset from the JSON file named by CVES_METASPLOIT_ENCODED.
cves_df = pd.read_json(path_or_buf=CVES_METASPLOIT_ENCODED)

# Show the first five rows as a quick sanity check of the loaded columns.
cves_df.head(n=5)

Unnamed: 0,access_ADJACENT_NETWORK,access_LOCAL,access_NETWORK,access_PHYSICAL,auth_HIGH,auth_LOW,auth_MULTIPLE,auth_NONE,auth_SINGLE,avail_COMPLETE,...,conf_LOW,conf_NONE,conf_PARTIAL,id,integ_COMPLETE,integ_HIGH,integ_LOW,integ_NONE,integ_PARTIAL,metasploit
0,0,0,1,0,0,0,0,1,0,0,...,0,1,0,CVE-1999-0001,0,0,0,1,0,0
1,0,0,1,0,0,0,0,1,0,1,...,0,0,0,CVE-1999-0002,1,0,0,0,0,0
2,0,0,1,0,0,0,0,1,0,1,...,0,0,0,CVE-1999-0003,1,0,0,0,0,0
3,0,0,1,0,0,0,0,1,0,0,...,0,1,0,CVE-1999-0004,0,0,0,1,0,0
4,0,0,1,0,0,0,0,1,0,1,...,0,0,0,CVE-1999-0005,1,0,0,0,0,0


In [4]:
# Label to predict: the 'metasploit' column
# (presumably 1 when a Metasploit module exists for the CVE — TODO confirm).
y = cves_df['metasploit']

# Features: every remaining column except the label itself and the CVE identifier.
X = cves_df.drop(labels=['metasploit', 'id'], axis=1)

# Hold-out split with a fixed seed; the test fraction is left at the library default.
# NOTE(review): train_test_split is imported from sklearn.cross_validation, which was
# removed in scikit-learn 0.20 — import it from sklearn.model_selection when upgrading.
# NOTE(review): the split is not stratified even though the classes are heavily
# imbalanced; consider stratify=y (this would change the numbers reported below).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [9]:
# One sub-grid per kernel, so kernel-specific parameters (degree, gamma) are only
# searched for the kernel that declares them.
param_grid = [
    dict(C=[1, 10, 100, 1000], degree=[1, 2, 4, 8, 10], kernel=['poly']),
    dict(C=[1, 10, 100, 1000], kernel=['linear']),
    dict(C=[1, 10, 100, 1000], gamma=[0.001, 0.0001], kernel=['rbf']),
]

# Balanced class weights, because negatives outnumber positives roughly 60:1.
model = SVC(class_weight='balanced')

# F-beta scorer with beta=10 (beta > 1 weights recall more heavily than precision).
f10_scorer = make_scorer(fbeta_score, beta=10)

# 5-fold cross-validated search over all sub-grids, using every available core.
# NOTE(review): GridSearchCV is imported from sklearn.grid_search, which was removed
# in scikit-learn 0.20 — import it from sklearn.model_selection when upgrading.
grid_search_model = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=f10_scorer,
    cv=5,
    n_jobs=-1,
)
grid_search_model.fit(X_train, y_train)

# Summarize the search: its configuration, the best mean F10 score, and the
# parameters that achieved it (one item per line).
print(
    grid_search_model,
    grid_search_model.best_score_,
    grid_search_model.best_params_,
    sep='\n',
)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'C': [1, 10, 100, 1000], 'degree': [1, 2, 4, 8, 10], 'kernel': ['poly']}, {'C': [1, 10, 100, 1000], 'kernel': ['linear']}, {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}],
       pre_dispatch='2*n_jobs', refit=True,
       scoring=make_scorer(fbeta_score, beta=10), verbose=0)
0.6117500376837378
{'C': 1000, 'kernel': 'linear'}


In [5]:

# Refit the configuration the grid search selected (C=1000, linear kernel, balanced
# class weights); the remaining SVC arguments are spelled out explicitly.
model = SVC(C=1000, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

# Train on the training split and predict the held-out test split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Confusion matrix (rows: true label, columns: predicted label) with marginal totals.
display(pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True))

# Plain accuracy on both splits, to compare training and test performance.
print("Training Accuracy:", model.score(X_train, y_train))
print("Test Accuracy    :", model.score(X_test, y_test))

# beta is passed by keyword: newer scikit-learn releases reject it as a positional
# argument, and the keyword form works on older releases as well.
print("F10 Score        :", fbeta_score(y_test, y_pred, beta=10))

Predicted,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,15284,10023,25307
1,93,337,430
All,15377,10360,25737


Training Accuracy: 0.6081984198937962
Test Accuracy    : 0.6069471966429654
F10 Score        : 0.6378748125937033


In [6]:
# Inspect the learned weights of the linear-kernel SVC fitted above
# (the displayed result is a 1 x 27 array).
# presumably one weight per column of X, in X's column order — TODO confirm
model.coef_

array([[-4.99421323e-01, -5.00024894e-01,  1.49982724e+00,
        -5.00380716e-01,  4.00041663e-01,  4.00013234e-01,
        -1.60037041e+00,  4.00191192e-01,  4.00124621e-01,
         3.99686178e-01,  3.99974455e-01, -5.99622322e-01,
        -5.99855509e-01,  3.99817474e-01, -3.16624064e-05,
         4.21013683e-05, -1.01160258e-05,  1.00011300e+00,
         1.69193372e-05, -1.78976232e-04, -9.99791274e-01,
        -1.59390271e-04, -3.99608089e-01,  5.99812462e-01,
        -4.00065649e-01, -3.99928513e-01,  5.99790044e-01]])