In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn import ensemble
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
# Shared label encoder; it is re-fit on each categorical column in the encoding loop further down.
le = preprocessing.LabelEncoder()
from sklearn.model_selection import train_test_split
import pickle
import os


# Load the dataset from a CSV that sits next to the notebook.
data = pd.read_csv('./data_rf_ready.csv')

In [2]:
data.head()


Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),6,25,1,1,41,0,1,...,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),1,1,7,3,59,0,18,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),1,1,7,2,11,5,13,...,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),1,1,7,2,44,1,16,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),1,1,7,1,51,0,8,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [3]:
# These three columns hold integer ids, so they are cast to categoricals
# explicitly; the dtype-based selection below would not pick them up.
for id_col in ('admission_type_id', 'discharge_disposition_id', 'admission_source_id'):
    data[id_col] = pd.Categorical(data[id_col])

# Every remaining object-dtype column is converted to a categorical as well.
cat_cols = data.select_dtypes(include=[object]).columns.tolist()

for col in cat_cols:
    data[col] = pd.Categorical(data[col])


In [4]:
# Sub-frame containing only the categorical columns; iterating over it yields
# the names of the columns that still need integer encoding.
col_to_encode = data.select_dtypes(include=['category'])


In [5]:
# Replace every categorical column with integer codes. The single shared
# encoder is re-fit per column, so afterwards it only remembers the classes
# of the last column it processed.
for column_name in col_to_encode.columns:
    encoded = le.fit_transform(data[column_name])
    data[column_name] = encoded


In [6]:
# Split off the target: `readmitted` becomes the label vector and `data`
# keeps only the feature columns.
readmitted = data.loc[:, 'readmitted']
data = data.drop(labels=['readmitted'], axis='columns')

In [7]:
data.dtypes

race                        int32
gender                      int32
age                         int32
admission_type_id           int64
discharge_disposition_id    int64
admission_source_id         int64
time_in_hospital            int64
num_lab_procedures          int64
num_procedures              int64
num_medications             int64
number_outpatient           int64
number_emergency            int64
number_inpatient            int64
number_diagnoses            int64
max_glu_serum               int32
A1Cresult                   int32
metformin                   int32
repaglinide                 int32
nateglinide                 int32
chlorpropamide              int32
glimepiride                 int32
acetohexamide               int32
glipizide                   int32
glyburide                   int32
tolbutamide                 int32
pioglitazone                int32
rosiglitazone               int32
acarbose                    int32
miglitol                    int32
troglitazone  

In [8]:
data

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
0,2,0,0,5,23,0,1,41,0,1,...,0,0,1,1,0,0,0,0,1,0
1,2,0,1,0,0,6,3,59,0,18,...,0,0,3,1,0,0,0,0,0,1
2,0,0,2,0,0,6,2,11,5,13,...,0,0,1,1,0,0,0,0,1,1
3,2,1,3,0,0,6,2,44,1,16,...,0,0,3,1,0,0,0,0,0,1
4,2,1,4,0,0,6,1,51,0,8,...,0,0,2,1,0,0,0,0,0,1
5,2,1,5,1,0,1,3,31,6,16,...,0,0,2,1,0,0,0,0,1,1
6,2,1,6,2,0,1,4,70,1,21,...,0,0,2,1,0,0,0,0,0,1
7,2,1,7,0,0,6,5,73,0,12,...,0,0,1,1,0,0,0,0,1,1
8,2,0,8,1,0,3,13,68,2,28,...,0,0,2,1,0,0,0,0,0,1
9,2,0,9,2,2,3,12,33,3,18,...,0,0,2,1,0,0,0,0,0,1


In [30]:

## Random forest on `readmitted`: cross-validate, fit, and rank the features.
# n_jobs=-1 grows the trees on all cores; with a fixed random_state the fitted
# forest is the same regardless of the number of workers.
model = ensemble.RandomForestClassifier(random_state=0, n_jobs=-1)

### Grid search
# NOTE: every hyper-parameter lists a single value, so this "search" is a
# 4-fold cross-validation of one configuration. Add values to the lists to
# actually tune the forest.
grid_para_forest = [{
    "n_estimators": [650],
    "min_samples_leaf": [2],
    "min_samples_split": [2],
    "random_state": [0]}]
grid_search_forest = GridSearchCV(model, grid_para_forest, cv=4)
grid_search_forest.fit(data, readmitted)

bestparam = grid_search_forest.best_params_
bestscore = grid_search_forest.best_score_  # mean cross-validated accuracy

### Final model
# `readmitted` holds label-encoded classes, so the final model has to be a
# classifier: a regressor would treat the arbitrary integer codes as a
# continuous quantity. GridSearchCV (refit=True by default) has already refit
# the best configuration on all rows, so that estimator is reused instead of
# training a second forest.
model = grid_search_forest.best_estimator_
train_score = model.score(data, readmitted)  # in-sample accuracy, optimistic by construction
print("CV accuracy: {:.3f} | training accuracy: {:.3f}".format(bestscore, train_score))

### Feature importance
# Feature names are kept as a pandas index. A fixed-width 'S10' field would
# cut them to 10 bytes, so that e.g. 'glimepiride' and
# 'glimepiride-pioglitazone' end up with the same label.
feature_importance = pd.Series(model.feature_importances_, index=data.columns)
fea_i = (
    feature_importance
    .sort_values(ascending=False)
    .rename_axis('name')
    .reset_index(name='score')
)

# Plot the ten most important features once and save the figure to disk.
my_fig = fea_i.head(10).plot.bar(x='name', y='score', legend=False)
my_fig.set(title='Top 10 random-forest feature importances',
           xlabel='feature', ylabel='importance')
my_fig.tick_params(labelsize=14)
my_fig.figure.savefig('my_fig.png', dpi=300, bbox_inches='tight')


KeyboardInterrupt: 