In [1]:
#Import our dependencies
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

## Data preprocessing and feature engineering

In [2]:
# # Import our data set
# df = pd.read_csv('diabetic_data.csv')

In [3]:
# Database connection setup: build the SQLAlchemy connection URL for the local
# "diabetic_data" Postgres database, using the password kept in the local config module.

from sqlalchemy import create_engine
import psycopg2
from config import db_password

# Use the "postgresql://" scheme: SQLAlchemy 1.4+ no longer accepts the legacy
# "postgres://" alias and fails with NoSuchModuleError when the engine is created.
# NOTE(review): db_password is interpolated as-is; if it can contain characters such
# as "@", ":" or "/", it needs URL-escaping before being placed in the URL — confirm.
database_S = f"postgresql://postgres:{db_password}@localhost/diabetic_data"

In [4]:
# Create the SQLAlchemy engine from the connection URL (database_S) built in the previous cell;
# the engine is passed as `con` to the pandas SQL call below.
engine = create_engine(database_S)

In [5]:
# #Import diabetic_Data
# diabetic_data.to_sql(name="diabetic_data", con=engine,  if_exists='replace')

In [6]:
# Pull every row of the "diabetic_data" table into a DataFrame, using the table's
# "index" column as the DataFrame index, then display the result.
df = pd.read_sql_query(
    sql='select * FROM "diabetic_data"',
    con=engine,
    index_col="index",
)
df

Unnamed: 0_level_0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,AfricanAmerican,Male,[70-80),?,1,3,7,3,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,443847782,74694222,AfricanAmerican,Female,[80-90),?,1,4,5,5,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,443854148,41088789,Caucasian,Male,[70-80),?,1,1,7,1,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,443857166,31693671,Caucasian,Female,[80-90),?,2,3,7,10,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [7]:
# Turn this into a binary classification by collapsing the three `readmitted` labels into
# two with a mapping dictionary: "NO" is merged into "<30", while ">30" stays its own class.
# Our labels will therefore be "<30" and ">30".
# NOTE(review): an earlier version of this comment read the labels as a "less than 30%"
# chance of readmission; the public dataset description appears to define them as days
# until readmission (within 30 days / after 30 days / none) — confirm before interpreting
# the model's results.

replace_dict = {
    "NO": "<30",
    ">30": ">30",
    "<30": "<30"
}

# Assign the replaced column back instead of calling .replace(..., inplace=True) on the
# selected column: the in-place form acts on an intermediate Series, which raises a
# chained-assignment warning on recent pandas and leaves `df` unchanged under Copy-on-Write.
df['readmitted'] = df['readmitted'].replace(replace_dict)

In [8]:
# Columns left out of the feature matrix: some were picked out by hand, the rest were
# scored at 0% importance by the RFC ranking (further pruning is still being considered).
# 'readmitted' is removed here because it is the target, not a feature.
columns_to_drop = [
    'encounter_id',
    'patient_nbr',
    'payer_code',
    'medical_specialty',
    'readmitted',
    'diag_1',
    'diag_2',
    'diag_3',
    'metformin-rosiglitazone',
    'examide',
    'citoglipton',
    'tolazamide',
    'metformin-pioglitazone',
    'acetohexamide',
    'chlorpropamide',
    'glimepiride-pioglitazone',
]

# Build a reduced frame without those columns (df itself is left untouched)
dropped_df = df.drop(columns=columns_to_drop)

In [9]:
# Collect the names of the object-dtype (categorical) columns
object_columns = [
    name for name, dtype in dropped_df.dtypes.items() if dtype == "object"
]

# One-hot encode just those columns
dummy_columns = pd.get_dummies(dropped_df[object_columns])

# Append the encoded columns to the frame, then remove the original categorical columns
clean_df = pd.concat([dropped_df, dummy_columns], axis=1)
clean_df = clean_df.drop(columns=object_columns)

clean_df.head()

Unnamed: 0_level_0,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,glyburide-metformin_Down,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_No,glipizide-metformin_Steady,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,6,25,1,1,41,0,1,0,0,0,...,0,1,0,0,1,0,0,1,1,0
1,1,1,7,3,59,0,18,0,0,0,...,0,1,0,0,1,0,1,0,0,1
2,1,1,7,2,11,5,13,2,0,1,...,0,1,0,0,1,0,0,1,0,1
3,1,1,7,2,44,1,16,0,0,0,...,0,1,0,0,1,0,1,0,0,1
4,1,1,7,1,51,0,8,0,0,0,...,0,1,0,0,1,0,1,0,0,1


## Data preparation (scaling and train/test split)

In [10]:
# Features come from the encoded frame; the target is the binned `readmitted` column
X = clean_df.values
y = df['readmitted'].values

# Split into train and test partitions (random_state=1 keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Fit the min-max scaler on the training partition only, then apply that same
# scaling to both partitions
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model creation and assessment 

In [11]:
# Random forest is our first model to assess, chosen mainly because it can rank
# the importance of the features it is trained on.

rf_model = RandomForestClassifier(n_estimators=128, random_state=78)
rf_model.fit(X_train_scaled, y_train)

In [12]:

# Predict a readmission label for every row of the scaled test partition
predictions = rf_model.predict(X=X_test_scaled)

In [13]:

# Share of test rows whose predicted label equals the true label, reported as a percentage
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
print(f'the accuracy for our model was {acc_score * 100:.2f}%')

the accuracy for our model was 65.24%


In [14]:
# Pair every feature's importance score with its column name and list the pairs
# from most to least important
importances = rf_model.feature_importances_
sorted(zip(importances, clean_df.columns), reverse=True)

[(0.13185498092137893, 'num_lab_procedures'),
 (0.11566082552309402, 'num_medications'),
 (0.0852397850599413, 'time_in_hospital'),
 (0.055654402190867, 'number_diagnoses'),
 (0.05242512834772306, 'discharge_disposition_id'),
 (0.05150139837449093, 'num_procedures'),
 (0.04172578321019531, 'number_inpatient'),
 (0.03413355197986164, 'admission_type_id'),
 (0.025865199075388368, 'admission_source_id'),
 (0.02402595584805963, 'number_outpatient'),
 (0.016904830699008333, 'number_emergency'),
 (0.016524548480446054, 'age_[60-70)'),
 (0.016161933788213022, 'age_[70-80)'),
 (0.015621517250862069, 'gender_Male'),
 (0.015528692293682577, 'gender_Female'),
 (0.014452246432689907, 'age_[50-60)'),
 (0.01396380465005364, 'age_[80-90)'),
 (0.012729963511555569, 'race_Caucasian'),
 (0.011612503785873413, 'race_AfricanAmerican'),
 (0.01075203784345621, 'A1Cresult_None'),
 (0.010733496252289847, 'insulin_Steady'),
 (0.010726571447939854, 'age_[40-50)'),
 (0.009657163523390088, 'insulin_No'),
 (0.0091

In [15]:
# As a group we decided that recall is the more important metric in this situation.

# Count actual vs. predicted classes on the test partition.
cm = confusion_matrix(y_test, predictions)

# Wrap the counts in a labelled DataFrame for display. confusion_matrix orders the
# classes by sorted label value, so 0 stands for "<30" and 1 stands for ">30".
row_labels = ["Actual 0", "Actual 1"]
col_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(cm, index=row_labels, columns=col_labels)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,14803,1783
Actual 1,7060,1796


In [16]:
# Per-class precision, recall and f1-score for the test-set predictions
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         <30       0.68      0.89      0.77     16586
         >30       0.50      0.20      0.29      8856

    accuracy                           0.65     25442
   macro avg       0.59      0.55      0.53     25442
weighted avg       0.62      0.65      0.60     25442



In [17]:
# A decent start overall, but recall for the >30 class is only 0.20, so raising it is our main priority.