# Train and Test Machine Learning Model on Diabetic Patients Data

This is done by training a Random Forest Classifier model which will be used to predict whether a diabetic patient will likely be re-admitted or not

In [12]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [13]:
# Preprocessing
df_diabetic = pd.read_csv('data/diabetic.csv')
df_diabetic.replace('?', np.nan, inplace=True)
df_diabetic.drop(['weight', 'medical_specialty','payer_code','encounter_id','patient_nbr','admission_type_id','discharge_disposition_id','admission_source_id'], axis=1, inplace=True)
df_diabetic = df_diabetic[df_diabetic['gender']!='Unknow/Invalid']
df_diabetic.dropna(inplace=True)





In [14]:
df_diabetic.head()

Unnamed: 0,race,gender,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
1,Caucasian,Female,[10-20),3,59,0,18,0,0,0,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),2,11,5,13,2,0,1,...,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),2,44,1,16,0,0,0,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),1,51,0,8,0,0,0,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
5,Caucasian,Male,[50-60),3,31,6,16,0,0,0,...,No,Steady,No,No,No,No,No,No,Yes,>30


In [15]:
X = df_diabetic.drop('readmitted', axis=1)
y = df_diabetic['readmitted']

In [16]:
# Encode non-numeric columns
cols = list(X.select_dtypes('object').columns)
class_dict = {}
for col in cols:
    X = pd.concat([X.drop(col, axis=1), pd.get_dummies(X[col])], axis=1)

In [17]:
def convert_readmitted(series):
    if series.lower() == 'NO':
        return 0
    elif series == '<30':
        return 1
    else:
        return 2

In [18]:
y = df_diabetic['readmitted'].apply(convert_readmitted)

In [20]:
X.head()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,AfricanAmerican,Asian,...,Steady,No,Steady.1,No.1,No.2,Steady.2,Ch,No.3,No.4,Yes
1,3,59,0,18,0,0,0,9,0,0,...,0,1,0,1,1,0,1,0,0,1
2,2,11,5,13,2,0,1,6,1,0,...,0,1,0,1,1,0,0,1,0,1
3,2,44,1,16,0,0,0,7,0,0,...,0,1,0,1,1,0,1,0,0,1
4,1,51,0,8,0,0,0,5,0,0,...,0,1,0,1,1,0,1,0,0,1
5,3,31,6,16,0,0,0,9,0,0,...,0,1,0,1,1,0,0,1,0,1


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size = 0.2)

In [23]:
random_forest = RandomForestClassifier(n_estimators=300).fit(X_train, y_train)
y_pred = random_forest.predict(X_test)
acc_score = metrics.accuracy_score(y_test, y_pred)

In [24]:
acc_score

0.8903676508082199