In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn import grid_search
from sklearn import cross_validation
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
# Seaborn presentation defaults applied to the figures drawn in this
# notebook: the 'talk' context and the 'whitegrid' style.
sns.set_context('talk')
sns.set_style('whitegrid') 


In [2]:
# Select matplotlib's Qt backend through the IPython magic.
# NOTE(review): presumably figures then open in separate GUI windows instead
# of inline, which needs a display and Qt bindings — confirm before running
# this notebook headless.
%matplotlib qt

In [3]:
def plotROCCurve(y_test, prediction_proba, filename='random_forest_aucroc.png'):
    """Plot the ROC curve of a binary classifier and save it as an image.

    Parameters
    ----------
    y_test : array-like
        True labels of the evaluated samples.
    prediction_proba : 2-D array
        Per-class scores such as the output of ``predict_proba``; column 1
        is used as the score passed to ``roc_curve``.
    filename : str, optional
        Path the figure is written to. Defaults to the historical
        'random_forest_aucroc.png' so existing calls keep writing the same
        file; pass another name when plotting a different model.

    Returns
    -------
    None
        The figure is drawn and saved as a side effect.
    """
    # Work on an explicit figure/axes pair so nothing depends on which
    # figure pyplot currently considers "active".
    fig = plt.figure(figsize=(4, 4))
    ax = fig.add_subplot(111)

    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = roc_curve(y_test, prediction_proba[:, 1])
    roc_auc = auc(fpr, tpr)

    ax.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    ax.plot([0, 1], [0, 1], 'k--')  # dashed diagonal from (0, 0) to (1, 1)
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    # Anchor the legend in figure coordinates rather than axes coordinates.
    ax.legend(bbox_to_anchor=(0.9, 0.2),
              bbox_transform=fig.transFigure)
    fig.savefig(filename, bbox_inches='tight', dpi=200)

In [4]:
# Load the feature table and the outcomes table from pickle files addressed
# by relative path.
# NOTE(review): unpickling can execute arbitrary code — only load .save
# files that come from a trusted source.
X_df = pd.read_pickle('new_X_df.save')
outcomes_df = pd.read_pickle('new_outcomes_df.save')

In [5]:
# Size check of the feature table: (rows, columns).
X_df.shape

(115196, 38)

In [6]:
# Hold out 25% of the rows as a test set; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X_df,
    outcomes_df['Outcome'].values,
    test_size=0.25,
    random_state=2)  # was 0

In [8]:
from sklearn import linear_model

# Fit a logistic regression on the training split. C is the inverse
# regularization strength, so C=1e5 leaves the penalty very weak.
clf = linear_model.LogisticRegression(C=1e5).fit(X_train, y_train)
clf  # display the fitted estimator and its parameters

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0)

In [10]:
# Score the held-out test set with the fitted model: hard class predictions
# in y_1 and per-class probabilities in y_1_proba.
y_1 = clf.predict(X_test)
y_1_proba = clf.predict_proba(X_test)

In [11]:
# ROC AUC on the test set, scored with probability column index 1.
metrics.roc_auc_score(y_test, y_1_proba[:,1]) # average='macro',

0.75837382543665977

In [12]:
# Accuracy of the hard predictions on the test set; normalize=True reports
# it as a fraction rather than a count.
metrics.accuracy_score(y_test,y_1, normalize=True)

0.68189867703739715

In [13]:
# Per-class precision, recall, F-score and support on the test set,
# returned as four arrays with one entry per class.
metrics.precision_recall_fscore_support(y_test, y_1)

(array([ 0.68172629,  0.68238576]),
 array([ 0.85845371,  0.43142689]),
 array([ 0.75995074,  0.52863391]),
 array([16892, 11907]))

In [14]:
# Text summary of per-class precision / recall / f1-score / support.
# The parenthesized single-argument form prints the same text under the
# Python 2 print statement and the Python 3 print function.
print(metrics.classification_report(y_test, y_1))

             precision    recall  f1-score   support

          0       0.68      0.86      0.76     16892
          1       0.68      0.43      0.53     11907

avg / total       0.68      0.68      0.66     28799



In [15]:
# Draw and save the ROC curve for the logistic-regression probabilities.
# Note: the image is written to 'random_forest_aucroc.png' even though the
# model scored here is a logistic regression.
plotROCCurve(y_test, y_1_proba)

In [21]:
# First row of the fitted coefficient matrix: one weight per feature column.
# The parenthesized single-argument form prints the same text under the
# Python 2 print statement and the Python 3 print function.
print(clf.coef_[0])

[ -2.52178455e-05  -2.53854855e-02   3.09377221e-03  -3.48570989e-03
   3.93891683e-03   4.53088616e-02  -5.48147038e-03   2.89949307e-04
  -1.72157706e-06  -8.81601777e-04   2.47565450e-02   5.12392773e-03
   4.31433757e-04   3.11666613e-02   5.65758137e-03   3.70736040e-04
   6.47317685e-05  -3.04592767e-04   4.66418758e-04  -4.96512615e-04
  -9.52234881e-04   6.32852058e-04  -6.25482820e-04  -1.19143186e-03
  -1.28094499e-04   3.55266147e-03  -3.94250945e-04  -1.76284739e-03
  -5.97344276e-04   1.07860303e-03   3.48564689e-04   4.71123939e-06
   1.20432353e-04   6.21226158e-05   9.76574202e-05  -4.68909561e-04
  -1.92976091e-04   9.01730928e-05]


In [24]:
# Pull every public name of the plot_suggestions module into the notebook
# namespace.
# NOTE(review): presumably this is where plot_coefficients (called in the
# next cell) comes from — an explicit import would make that traceable.
from plot_suggestions import *
# Load IPython's autoreload extension and switch it to mode 2.
# NOTE(review): presumably so edits to plot_suggestions are picked up without
# restarting the kernel — confirm. Running this cell when the extension is
# already loaded prints an "already loaded" notice.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
# Hand the coefficient vector, the feature column labels and the number of
# coefficients to the plotting helper.
plot_coefficients(clf.coef_[0], X_df.columns, len(clf.coef_[0]))

In [63]:
# Keep the best estimator exposed by the `clf_grid` search object.
# NOTE(review): `clf_grid` is not defined in any cell visible here and the
# execution counts jump from 25 to 63 — confirm the cell that builds it still
# exists, otherwise this line fails on a fresh kernel.
best_clf = clf_grid.best_estimator_

In [64]:
# Persist the selected estimator to disk; the call returns the list of files
# it wrote (the main .pkl plus companion .npy files).
# NOTE(review): sklearn.externals.joblib is only available in older
# scikit-learn releases — confirm the installed version before upgrading;
# newer releases need the standalone joblib package instead.
from sklearn.externals import joblib
joblib.dump(best_clf, 'clf_grid_june24pm.pkl') 

['clf_grid_june24pm.pkl',
 'clf_grid_june24pm.pkl_01.npy',
 'clf_grid_june24pm.pkl_02.npy',
 'clf_grid_june24pm.pkl_03.npy',
 'clf_grid_june24pm.pkl_04.npy',
 'clf_grid_june24pm.pkl_05.npy',
 'clf_grid_june24pm.pkl_06.npy',
 'clf_grid_june24pm.pkl_07.npy',
 'clf_grid_june24pm.pkl_08.npy',
 'clf_grid_june24pm.pkl_09.npy',
 'clf_grid_june24pm.pkl_10.npy',
 'clf_grid_june24pm.pkl_11.npy',
 'clf_grid_june24pm.pkl_12.npy',
 'clf_grid_june24pm.pkl_13.npy',
 'clf_grid_june24pm.pkl_14.npy',
 'clf_grid_june24pm.pkl_15.npy',
 'clf_grid_june24pm.pkl_16.npy',
 'clf_grid_june24pm.pkl_17.npy',
 'clf_grid_june24pm.pkl_18.npy',
 'clf_grid_june24pm.pkl_19.npy',
 'clf_grid_june24pm.pkl_20.npy',
 'clf_grid_june24pm.pkl_21.npy',
 'clf_grid_june24pm.pkl_22.npy',
 'clf_grid_june24pm.pkl_23.npy',
 'clf_grid_june24pm.pkl_24.npy',
 'clf_grid_june24pm.pkl_25.npy',
 'clf_grid_june24pm.pkl_26.npy',
 'clf_grid_june24pm.pkl_27.npy',
 'clf_grid_june24pm.pkl_28.npy',
 'clf_grid_june24pm.pkl_29.npy',
 'clf_grid_june24