In [297]:
#Import our dependencies
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

## Data preprocessing and feature engineering

In [298]:
# Load the raw data set from the CSV that sits next to this notebook
DATA_PATH = 'diabetic_data.csv'
df = pd.read_csv(DATA_PATH)

In [299]:
# Turn this into a binary classification problem by folding the "NO" readmission label into "<30"
# using the pandas .replace() function and a dictionary, leaving two labels: "<30" and ">30".
# NOTE(review): an earlier version of this comment described the labels as a "less than 30%" likelihood
# of readmission; the raw values look like day windows (readmitted within / after 30 days) rather than
# probabilities -- confirm against the data dictionary, and confirm that grouping "NO" with "<30" is
# the binning we actually want.

replace_dict = {
    "NO": "<30",
    ">30": ">30",
    "<30": "<30"
}

# Assign the result back rather than calling .replace(inplace=True) on the selected column:
# an in-place edit on a column pulled out with df[...] is not guaranteed to reach `df`
# (pandas warns about this pattern, and with Copy-on-Write enabled it silently does nothing).
df['readmitted'] = df['readmitted'].replace(to_replace=replace_dict)

In [300]:
# Columns excluded from the feature matrix: ones we picked out by hand plus ones the RFC importance
# ranking scored at 0% importance (further pruning is still under consideration).
# 'readmitted' is listed here as well, so the target column is not part of the features.
columns_to_drop = [
    'encounter_id', 'patient_nbr', 'payer_code',
    'medical_specialty', 'readmitted',
    'diag_1', 'diag_2', 'diag_3',
    'metformin-rosiglitazone', 'examide',
    'citoglipton', 'tolazamide', 'metformin-pioglitazone',
    'acetohexamide', 'chlorpropamide', 'glimepiride-pioglitazone',
]

# `columns=` already selects the column axis, so no separate axis argument is needed
dropped_df = df.drop(columns=columns_to_drop)

In [301]:
# Collect the object-dtype (categorical) columns and one-hot encode them with pd.get_dummies
object_columns = [name for name, dtype in dropped_df.dtypes.items() if dtype == "object"]
dummy_columns = pd.get_dummies(dropped_df[object_columns])

# Keep the remaining columns untouched and append the dummy columns to their right
non_categorical_df = dropped_df.drop(columns=object_columns)
clean_df = pd.concat([non_categorical_df, dummy_columns], axis=1)

clean_df.head()

Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,glyburide-metformin_Down,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_No,glipizide-metformin_Steady,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
0,6,25,1,1,41,0,1,0,0,0,...,0,1,0,0,1,0,0,1,1,0
1,1,1,7,3,59,0,18,0,0,0,...,0,1,0,0,1,0,1,0,0,1
2,1,1,7,2,11,5,13,2,0,1,...,0,1,0,0,1,0,0,1,0,1
3,1,1,7,2,44,1,16,0,0,0,...,0,1,0,0,1,0,1,0,0,1
4,1,1,7,1,51,0,8,0,0,0,...,0,1,0,0,1,0,1,0,0,1


## Data preparation (scaling and train/test split)

In [302]:
# Features come from the encoded frame; the target is the binned 'readmitted' column of the raw frame
X = clean_df.to_numpy()
y = df['readmitted'].to_numpy()

# Hold out a test set (default split proportions); random_state=1 keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Fit the min-max scaler on the training rows only, then apply it to both partitions
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

## Model creation and assessment 

In [303]:
# We start our model assessment with a random forest classifier, chiefly because it can rank the
# importance of the features we feed it

# Hyper-parameters kept in one place; random_state pins the forest so reruns give the same model
rf_params = {"n_estimators": 128, "random_state": 78}
rf_model = RandomForestClassifier(**rf_params).fit(X_train_scaled, y_train)

In [304]:

# Predict a readmission label for every row of the scaled test set
predictions = rf_model.predict(X_test_scaled)

In [305]:

# Fraction of test rows whose predicted label matches the true label
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

# Report it as a percentage with two decimals
accuracy_pct = acc_score * 100
print(f'the accuracy for our model was {accuracy_pct:.2f}%')

the accuracy for our model was 65.24%


In [306]:
# Pair each feature's importance with its column name and list them from most to least important
importances = rf_model.feature_importances_

feature_ranking = list(zip(importances, clean_df.columns))
feature_ranking.sort(reverse=True)
feature_ranking

In [291]:
# As a group we have decided recall is more important in this situation


# Derive the class labels from the data (sorted union of true and predicted labels) so the rows and
# columns of the matrix are named after the real classes instead of placeholder 0/1 labels, and so
# the table still builds if the number of classes changes.
class_labels = np.unique(np.concatenate([y_test, predictions]))

# Calculating a confusion matrix; passing `labels` pins the row/column order to `class_labels`.
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Create a DataFrame from the confusion matrix (rows = actual class, columns = predicted class).
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,14792,1791
Actual 1,7052,1807


In [292]:
# Print per-class precision, recall and F1 for the test-set predictions
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         <30       0.68      0.89      0.77     16583
         >30       0.50      0.20      0.29      8859

    accuracy                           0.65     25442
   macro avg       0.59      0.55      0.53     25442
weighted avg       0.62      0.65      0.60     25442



In [None]:
# A decent start on overall accuracy, but the >30 recall is only 0.20 -- raising it is our priority for the next iteration!