In [13]:
#import libs
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
import pickle

In [14]:
#open x abd y df's
with open('x_df', 'rb') as x:
    x_df = pickle.load(x)

In [15]:
with open('y_df', 'rb') as y:
    y_df = pickle.load(y)

In [16]:
#split x and y data into train/test sets
X_train, X_test, y_train, y_test = train_test_split(x_df,
                                  y_df,
                                  test_size=0.2,
                                  random_state=1)

In [17]:
#scoring metric to optimize
score = 'roc_auc'

# Model 1: SVM

In [18]:
#setup grid search
grid_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]}]

#run grid search to find best gamma
clf = GridSearchCV(SVC(probability=True),
                   grid_parameters,
                   cv=3,
                   scoring=score,
                   n_jobs = -1)

#fit SVM
#NOTE: This will take a long time!!!!!!
clf.fit(X_train, y_train)


GridSearchCV(cv=3, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'kernel': ['rbf'], 'gamma': [0.001, 0.0001], 'C': [1, 10, 100, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [19]:
#Generate predicted probabilites
clf_probs = clf.predict_proba(X_test)
print('AUC: ', roc_auc_score(y_test, clf_probs[:,1]))
print('Accuracy: ', clf.score(X_test, y_test))

AUC:  0.9916754740287814
Accuracy:  0.9916754740287813


In [20]:
#Pickle model 1
with open('clf', 'wb') as c:
    pickle.dump(clf, c, pickle.HIGHEST_PROTOCOL)

# Model 2: Logistic Regression w/ Feature Reduction

In [21]:
#Setup recursive feature reduction w/ cross validation
clf2 = RFECV(LogisticRegression(),
      scoring = score,
      n_jobs = -1,
      cv = 3,
      step = 5)

In [22]:
clf2.fit(X_train, y_train)

RFECV(cv=3,
   estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
   n_jobs=-1, scoring='roc_auc', step=5, verbose=0)

In [23]:
#Generate predicted probabilites
clf2_probs = clf2.predict_proba(X_test)
print('AUC: ', roc_auc_score(y_test, clf2_probs[:,1]))
print('Accuracy: ', clf2.score(X_test, y_test))

AUC:  0.9088576122537212
Accuracy:  0.893625


In [24]:
#Pickle model 2
with open('clf2', 'wb') as c:
    pickle.dump(clf2, c, pickle.HIGHEST_PROTOCOL)

In [27]:
clf2_probs[:,1]

array([0.37702643, 0.0238426 , 0.025249  , ..., 0.52471461, 0.2553194 ,
       0.04357831])