In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn import grid_search
from sklearn import cross_validation
import matplotlib.pyplot as plt
from sklearn import metrics
import seaborn as sns
# Switch seaborn's global plot style to 'whitegrid'.
sns.set_style('whitegrid') 

In [2]:
# Load the two pickled inputs: the frame used as X below and the frame that
# carries the 'Outcome' column.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# come from a trusted source.
X_df, outcomes_df = (
    pd.read_pickle(pickle_name)
    for pickle_name in ('new_X_df.save', 'new_outcomes_df.save')
)

In [3]:
# Hold out 25% of the rows as a test set; the fixed random_state makes the
# split repeatable (the seed used to be 0).
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X_df,
    outcomes_df['Outcome'].values,
    test_size=0.25,
    random_state=2)

In [4]:
# Candidate values for C, the inverse regularization strength, roughly one per decade.
# NOTE(review): the list jumps from 1 straight to 100, so C=10 is never tried --
# confirm that gap is intentional.
C = [0.0001, 0.001, 0.01, 0.1, 1, 100.0, 1000.0, 10000.0]

# Search both penalty types against every C value.
param_grid = dict(penalty=['l1', 'l2'], C=C)

In [5]:
# Grid search over `param_grid` with 5-fold cross-validation, ranking the
# candidates by ROC AUC. n_jobs=-1 runs the fits in parallel and verbose=1
# prints progress while fitting.
# NOTE(review): the LogisticRegression is created without a random_state, and
# identical (penalty='l2', C) settings score slightly differently between the
# two searches in this notebook -- consider fixing a seed; confirm the cause.
logistic_regressions = grid_search.GridSearchCV(
    estimator=LogisticRegression(class_weight='auto'),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1)

In [6]:
# Fit every candidate in the grid on the training split only; the held-out
# test split is not used here.
logistic_regressions.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done  66 out of  80 | elapsed:   18.6s remaining:    4.0s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   21.7s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight='auto', dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.0001, 0.001, 0.01, 0.1, 1, 100.0, 1000.0, 10000.0]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='roc_auc', verbose=1)

In [7]:
# Show one line per candidate: mean and std of the cross-validated ROC AUC
# together with the parameters that produced it.
for score in logistic_regressions.grid_scores_:
    # Function-call form: prints the same text under Python 2 and, unlike the
    # `print score` statement, is also valid Python 3.
    print(score)

mean: 0.75752, std: 0.00348, params: {'penalty': 'l1', 'C': 0.0001}
mean: 0.76151, std: 0.00380, params: {'penalty': 'l2', 'C': 0.0001}
mean: 0.77220, std: 0.00328, params: {'penalty': 'l1', 'C': 0.001}
mean: 0.76244, std: 0.00379, params: {'penalty': 'l2', 'C': 0.001}
mean: 0.79051, std: 0.00227, params: {'penalty': 'l1', 'C': 0.01}
mean: 0.76325, std: 0.00325, params: {'penalty': 'l2', 'C': 0.01}
mean: 0.79069, std: 0.00237, params: {'penalty': 'l1', 'C': 0.1}
mean: 0.76230, std: 0.00344, params: {'penalty': 'l2', 'C': 0.1}
mean: 0.79051, std: 0.00237, params: {'penalty': 'l1', 'C': 1}
mean: 0.76150, std: 0.00294, params: {'penalty': 'l2', 'C': 1}
mean: 0.79048, std: 0.00236, params: {'penalty': 'l1', 'C': 100.0}
mean: 0.76297, std: 0.00389, params: {'penalty': 'l2', 'C': 100.0}
mean: 0.79047, std: 0.00237, params: {'penalty': 'l1', 'C': 1000.0}
mean: 0.76179, std: 0.00350, params: {'penalty': 'l2', 'C': 1000.0}
mean: 0.79048, std: 0.00237, params: {'penalty': 'l1', 'C': 10000.0}
mea

In [8]:
# Parameter combination with the highest mean cross-validated ROC AUC.
logistic_regressions.best_params_

{'C': 0.1, 'penalty': 'l1'}

In [9]:
# Mean cross-validated ROC AUC achieved by the best parameter combination.
logistic_regressions.best_score_

0.79068810523060462

In [10]:
# Keep a reference to the winning model before `logistic_regressions` is
# reassigned by the l2-only search further down.
logistic_best = logistic_regressions.best_estimator_

In [12]:
import os

from sklearn.externals import joblib

# joblib.dump does not create missing directories, so make sure the target
# folder exists first; otherwise this cell fails on a fresh checkout.
if not os.path.isdir('logistic_best_june28'):
    os.makedirs('logistic_best_june28')

# Persist the best estimator from the l1/l2 search. The list of files that
# joblib wrote is the cell's displayed output.
joblib.dump(logistic_best, 'logistic_best_june28/logistic_best.pkl')

['logistic_best_june28/logistic_best.pkl',
 'logistic_best_june28/logistic_best.pkl_01.npy',
 'logistic_best_june28/logistic_best.pkl_02.npy',
 'logistic_best_june28/logistic_best.pkl_03.npy']

In [13]:
# Now restrict the search to L2 regularization of the predictor coefficients.

In [25]:
# Wider C range for the l2-only search, with an extra point at 0.05.
# NOTE(review): as in the first grid, C=10 is not included (the list goes from
# 1 to 100) -- confirm that gap is intentional.
C = [1e-05, 0.0001, 0.001, 0.01, 0.05, 0.1, 1, 100.0, 1000.0, 10000.0, 100000.0]

# Only the l2 penalty is searched this time.
param_grid = dict(penalty=['l2'], C=C)

In [26]:
# Same search setup as before (5-fold CV, ROC AUC scoring, parallel fits,
# progress output), now driven by the l2-only `param_grid`.
# NOTE(review): this rebinds `logistic_regressions`, so the earlier l1/l2
# search object is no longer reachable under that name after this cell.
logistic_regressions = grid_search.GridSearchCV(
    estimator=LogisticRegression(class_weight='auto'),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1)

In [27]:
# Fit every l2 candidate on the training split only; the held-out test split
# is not used here.
logistic_regressions.fit(X_train, y_train)

Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:   20.0s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight='auto', dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid={'penalty': ['l2'], 'C': [1e-05, 0.0001, 0.001, 0.01, 0.05, 0.1, 1, 100.0, 1000.0, 10000.0, 100000.0]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='roc_auc', verbose=1)

In [28]:
# Show one line per l2 candidate: mean and std of the cross-validated ROC AUC
# together with the parameters that produced it.
for score in logistic_regressions.grid_scores_:
    # Function-call form: prints the same text under Python 2 and, unlike the
    # `print score` statement, is also valid Python 3.
    print(score)

mean: 0.76040, std: 0.00335, params: {'penalty': 'l2', 'C': 1e-05}
mean: 0.76255, std: 0.00303, params: {'penalty': 'l2', 'C': 0.0001}
mean: 0.76194, std: 0.00295, params: {'penalty': 'l2', 'C': 0.001}
mean: 0.76109, std: 0.00320, params: {'penalty': 'l2', 'C': 0.01}
mean: 0.76174, std: 0.00284, params: {'penalty': 'l2', 'C': 0.05}
mean: 0.76173, std: 0.00347, params: {'penalty': 'l2', 'C': 0.1}
mean: 0.76142, std: 0.00335, params: {'penalty': 'l2', 'C': 1}
mean: 0.76247, std: 0.00362, params: {'penalty': 'l2', 'C': 100.0}
mean: 0.76190, std: 0.00345, params: {'penalty': 'l2', 'C': 1000.0}
mean: 0.76160, std: 0.00297, params: {'penalty': 'l2', 'C': 10000.0}
mean: 0.76179, std: 0.00314, params: {'penalty': 'l2', 'C': 100000.0}


In [30]:
# Best parameter combination found by the l2-only search.
logistic_regressions.best_params_

{'C': 0.0001, 'penalty': 'l2'}

In [33]:
# Keep the winning l2 model under its own name, separate from `logistic_best`.
logistic_best_l2 = logistic_regressions.best_estimator_

In [34]:
import os

from sklearn.externals import joblib

# joblib.dump does not create missing directories, so make sure the target
# folder exists first; otherwise this cell fails on a fresh checkout.
if not os.path.isdir('logistic_best_l2_june28'):
    os.makedirs('logistic_best_l2_june28')

# Persist the best l2-only estimator. The list of files that joblib wrote is
# the cell's displayed output.
joblib.dump(logistic_best_l2, 'logistic_best_l2_june28/logistic_best_l2.pkl')

['logistic_best_l2_june28/logistic_best_l2.pkl',
 'logistic_best_l2_june28/logistic_best_l2.pkl_01.npy',
 'logistic_best_l2_june28/logistic_best_l2.pkl_02.npy',
 'logistic_best_l2_june28/logistic_best_l2.pkl_03.npy']