In [1]:
import pickle
import pandas as pd
import seaborn as sns
import numpy as np
%matplotlib inline

import matplotlib.pyplot as plt




In [2]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV



In [3]:
# Suppress unnecessary warnings so that
# presentation looks clean
import warnings
warnings.filterwarnings('ignore')

In [4]:
with open('data4.pkl', 'r') as picklefile:
    data = pickle.load(picklefile)

In [5]:
print data['readmitted'].value_counts()/len(data)

0    0.656136
1    0.343864
Name: readmitted, dtype: float64


In [6]:
data2 = data.ix[:, data.columns != 'patient_nbr']

In [7]:
data2.head()

Unnamed: 0,gender,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,change,...,rosiglitazone_Steady,rosiglitazone_Up,insulin_Down,insulin_No,insulin_Steady,insulin_Up,glyburide-metformin_Down,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up
0,0,-0.715412,-0.681967,3.509109,-0.175898,-0.036208,-0.030102,0.226905,0.869955,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1,-0.50336,0.047059,-0.524282,-0.598981,19.567446,0.002662,-0.111221,0.869955,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0,-0.317815,1.862124,-0.524282,0.008523,-0.08534,-0.030102,-0.111221,0.869955,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0,1.405105,-0.557763,0.484066,-0.338622,-0.08534,-0.030102,-0.22393,-0.370254,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1,-0.715412,-1.233987,-0.524282,-0.696615,0.356847,0.002662,-0.22393,-0.874089,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [8]:
X = data2.ix[:, data2.columns != 'readmitted']
y = data['readmitted']



# Random Forest

In [9]:
#Random Forest
PREDICTOR = RandomForestClassifier(n_estimators=100).fit(X,y)
PREDICTOR.score(X,y)

1.0

In [10]:
#Grid search parameters
PARAMETERS = {'max_depth':[1,2,3,8,14,20,30]}

#Grid Search for RF
model = RandomForestClassifier()
rfc = GridSearchCV(model, PARAMETERS, verbose=True, n_jobs=-1)
rfc.fit(X, y)

#After completion, show the final best results and scores
print rfc.best_estimator_
print rfc.best_score_


Fitting 3 folds for each of 7 candidates, totalling 21 fits


[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:   13.9s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=14, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
0.678783202595


In [14]:
pd.DataFrame({'feature':X.columns,
              'importance':PREDICTOR.feature_importances_}).sort_values('importance',
                                                                      ascending=False).head(10)

Unnamed: 0,feature,importance
2,num_lab_procedures,0.085975
4,num_medications,0.076199
1,time_in_hospital,0.05545
7,number_inpatient,0.045864
8,number_diagnoses,0.041034
3,num_procedures,0.038882
0,gender,0.019748
5,number_outpatient,0.018839
41,diag_3_Circulatory,0.01656
33,diag_2_Circulatory,0.014741


# PCA

In [15]:
pca = PCA(n_components=30)
pca.fit(X)
x = pca.explained_variance_ratio_
sum(x)

0.82778658866421928

In [17]:
#Random Forest with PCA
PREDICTORpca = RandomForestClassifier(n_estimators=100).fit(X,y)

In [19]:
#Parameters for PCA Grid Search
PARAMETERSpca = {'max_depth':[7,8,9,10], 'max_features':[5,10], 
              'max_leaf_nodes':[5,10], 'min_samples_leaf':[2],
              'min_samples_split':[2,4,6]}

#Grid Search for PCA
model = RandomForestClassifier()
clf = GridSearchCV(model, PARAMETERSpca, verbose=True, n_jobs=-1)
clf.fit(X, y)

#After completion, show the final best results and scores
print clf.best_estimator_
print clf.best_score_


Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.7s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   42.4s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features=10, max_leaf_nodes=10,
            min_impurity_split=1e-07, min_samples_leaf=2,
            min_samples_split=6, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
0.660336919714
