In [63]:
#Import our dependencies
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

## Data preprocessing and feature engineering

In [64]:
# Import our data set
df = pd.read_csv('diabetic_data.csv')

In [65]:
df['readmitted'].value_counts()

NO     54864
>30    35545
<30    11357
Name: readmitted, dtype: int64

In [66]:
# Turn this into a binary classification by binning patients with no chance and a less than 30% of hospital
# readmission together using the pandas .replace() function and a dictionary
# Our labels will be <30 and >30 in regards to patient's likelyhood of readmission

replace_dict = {
    "NO": "<30",
    ">30": ">30",
    "<30": "<30"
}

df['readmitted'].replace(to_replace=replace_dict, inplace=True)
df['race'].replace(to_replace={'?':'Other'}, inplace=True)

In [67]:
# Columns we manually identified, and columns we identified through RFC importance ranking that would not
# offer much information to our model (considering further optimizations, but we removed columns with 0% importance)
columns_to_drop = ['encounter_id', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
                   'patient_nbr', 'payer_code',
                   'medical_specialty', 'readmitted',
                   'diag_1', 'diag_2', 'diag_3',
                   'metformin-rosiglitazone', 'examide',
                   'citoglipton', 'tolazamide', 'metformin-pioglitazone',
                   'acetohexamide', 'chlorpropamide', 'glimepiride-pioglitazone',
                  'tolbutamide', 'nateglinide', 'glipizide-metformin', 'acarbose',
                  'troglitazone', 'glyburide-metformin', 'miglitol', 'acarbose']

# drop the identified columns
dropped_df = df.drop(columns=columns_to_drop, axis=1)

In [68]:
# Identify categorical columns and create a pd.get_dummies data frame based on those columns
object_columns = [i for i in dropped_df.columns if dropped_df[i].dtype == "object"]
dummy_columns = pd.get_dummies(dropped_df[object_columns])

# Merge the dropped data frame and the dummy dataframe together, and drop the original categorical columns
clean_df = pd.concat([dropped_df, dummy_columns], axis=1).drop(object_columns, axis=1)

clean_df.head()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_AfricanAmerican,race_Asian,...,rosiglitazone_Steady,rosiglitazone_Up,insulin_Down,insulin_No,insulin_Steady,insulin_Up,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
0,1,41,0,1,0,0,0,1,0,0,...,0,0,0,1,0,0,0,1,1,0
1,3,59,0,18,0,0,0,9,0,0,...,0,0,0,0,0,1,1,0,0,1
2,2,11,5,13,2,0,1,6,1,0,...,0,0,0,1,0,0,0,1,0,1
3,2,44,1,16,0,0,0,7,0,0,...,0,0,0,0,0,1,1,0,0,1
4,1,51,0,8,0,0,0,5,0,0,...,0,0,0,0,1,0,1,0,0,1


## Data preparation (scaling and train/test split)

In [69]:
# Declare our X and y variables
X = clean_df.values
y = df['readmitted'].values

from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

#Perform a train test split with random state = 1 for consistency
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=1)

# Scale our training and testing data using the min max scaler fit to our training data
scaler = MinMaxScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model creation and assessment 

In [70]:
# We have decided to use the RFC model to begin our assessment of models, namely for its ability to rank the
# importance of the features we give it

rf_model = RandomForestClassifier(n_estimators=128, random_state=78).fit(X_train_scaled, y_train)

In [71]:

predictions = rf_model.predict(X_test_scaled)

In [72]:

acc_score = accuracy_score(y_test, predictions)

print(f'the accuracy for our model was {acc_score * 100:.2f}%')

the accuracy for our model was 82.82%


In [73]:
importances = rf_model.feature_importances_

sorted(zip(rf_model.feature_importances_, clean_df.columns), reverse=True)

[(0.12322044227994267, 'num_lab_procedures'),
 (0.1179223553419886, 'num_medications'),
 (0.08180341594273688, 'time_in_hospital'),
 (0.06788298243337237, 'number_diagnoses'),
 (0.06754014628635008, 'num_procedures'),
 (0.038076896716176714, 'number_inpatient'),
 (0.0240511745865383, 'gender_Female'),
 (0.022388911860648637, 'number_outpatient'),
 (0.020156978369852755, 'gender_Male'),
 (0.01942654825224997, 'age_[70-80)'),
 (0.018862936369799188, 'age_[60-70)'),
 (0.018850273727638277, 'insulin_Steady'),
 (0.01840576695170172, 'age_[50-60)'),
 (0.01729106979191556, 'diabetesMed_No'),
 (0.01686182084683674, 'age_[80-90)'),
 (0.015570348706312662, 'race_Caucasian'),
 (0.014893497624536917, 'insulin_No'),
 (0.014473353241854801, 'number_emergency'),
 (0.014275589764695598, 'race_AfricanAmerican'),
 (0.013960385247265486, 'age_[40-50)'),
 (0.013804726428486496, 'change_No'),
 (0.013475650758546315, 'metformin_Steady'),
 (0.011981518091245829, 'A1Cresult_None'),
 (0.011956704703568655, 'ch

In [74]:
# as a group we have decided recall is more important in this situation


# Calculating a confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2376,1567
Actual 1,695,8528


In [75]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         <30       0.77      0.60      0.68      3943
         >30       0.84      0.92      0.88      9223

    accuracy                           0.83     13166
   macro avg       0.81      0.76      0.78     13166
weighted avg       0.82      0.83      0.82     13166



## Neural Network Test

In [117]:
nn_y = df['readmitted'].replace(to_replace={"<30": 0, ">30":1}, inplace=False)
nn_X = clean_df.values

nn_X_resampled, nn_y_resampled = smote_enn.fit_resample(nn_X, nn_y)

nn_X_train, nn_X_test, nn_y_train, nn_y_test = train_test_split(nn_X_resampled, nn_y_resampled, random_state=1)


# Scale our training and testing data using the min max scaler fit to our training data
scaler = MinMaxScaler().fit(nn_X_train)

nn_X_train_scaled = scaler.transform(nn_X_train)
nn_X_test_scaled = scaler.transform(nn_X_test)

In [118]:
import tensorflow as tf

number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 240
hidden_nodes_layer2 = 100

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=50, activation="softmax"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_26 (Dense)             (None, 240)               19440     
_________________________________________________________________
dense_27 (Dense)             (None, 100)               24100     
_________________________________________________________________
dense_28 (Dense)             (None, 50)                5050      
_________________________________________________________________
dense_29 (Dense)             (None, 1)                 51        
Total params: 48,641
Trainable params: 48,641
Non-trainable params: 0
_________________________________________________________________


In [119]:
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [120]:
nn.fit(nn_X_train_scaled, nn_y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7fb97fde0910>

In [121]:
model_loss, model_accuracy = nn.evaluate(nn_X_test_scaled,nn_y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

412/412 - 0s - loss: 0.6111 - accuracy: 0.7005
Loss: 0.6110865473747253, Accuracy: 0.7005164623260498
