In [1]:
#Import our dependencies
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

## Data preprocessing and feature engineering

In [2]:
# Import our data set
df = pd.read_csv('diabetic_data.csv')

In [3]:
df.columns

# ['race', 'gender', 'age','time_in_hospital']

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [5]:
df['change']

0         No
1         Ch
2         No
3         Ch
4         Ch
          ..
101761    Ch
101762    No
101763    Ch
101764    Ch
101765    No
Name: change, Length: 101766, dtype: object

In [186]:
df['readmitted'].value_counts()

NO     54864
>30    35545
<30    11357
Name: readmitted, dtype: int64

In [187]:
# Turn this into a binary classification by binning patients with no chance and a less than 30% of hospital
# readmission together using the pandas .replace() function and a dictionary
# Our labels will be NO, <30, and >30 in regards to the amount of days a patient takes to be readmitted (if at all)
df['race'].replace(to_replace={'?':'Other'}, inplace=True)

In [188]:
# Columns we manually identified, and columns we identified through RFC importance ranking that would not
# offer much information to our model (considering further optimizations, but we removed columns with 0% importance)
columns_to_drop = ['encounter_id', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
                   'patient_nbr', 'payer_code',
                   'medical_specialty', 'readmitted',
                   'diag_1', 'diag_2', 'diag_3',
                   'metformin-rosiglitazone', 'examide',
                   'citoglipton', 'tolazamide', 'metformin-pioglitazone',
                   'acetohexamide', 'chlorpropamide', 'glimepiride-pioglitazone',
                  'tolbutamide', 'nateglinide', 'glipizide-metformin', 'acarbose',
                  'troglitazone', 'glyburide-metformin', 'miglitol', 'acarbose', 'gender', 'weight']

# drop the identified columns
#dropped_df = df.drop(columns=columns_to_drop, axis=1)

dropped_df = df[['race', 'gender', 'age','time_in_hospital']]

In [189]:
# Identify categorical columns and create a pd.get_dummies data frame based on those columns
object_columns = [i for i in dropped_df.columns if dropped_df[i].dtype == "object"]
dummy_columns = pd.get_dummies(dropped_df[object_columns])

# Merge the dropped data frame and the dummy dataframe together, and drop the original categorical columns
clean_df = pd.concat([dropped_df, dummy_columns], axis=1).drop(object_columns, axis=1)


clean_df

Unnamed: 0,time_in_hospital,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,gender_Female,gender_Male,gender_Unknown/Invalid,age_[0-10),age_[10-20),age_[20-30),age_[30-40),age_[40-50),age_[50-60),age_[60-70),age_[70-80),age_[80-90),age_[90-100)
0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0
1,3,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
2,2,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0
3,2,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
4,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,3,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
101762,5,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
101763,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
101764,10,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0


## Data preparation (scaling and train/test split)

In [190]:
# Declare our X and y variables
X = clean_df.values
y = df['readmitted'].values

from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

#Perform a train test split with random state = 1 for consistency
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=1, test_size=0.5)

# Scale our training and testing data using the min max scaler fit to our training data
scaler = MinMaxScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [191]:
len(y)

101766

## Model creation and assessment 

In [192]:
# We have decided to use the RFC model to begin our assessment of models, namely for its ability to rank the
# importance of the features we give it

rf_model = RandomForestClassifier(n_estimators=128, random_state=78).fit(X_train_scaled, y_train)

In [193]:

predictions = rf_model.predict(X_test_scaled)

In [194]:

acc_score = accuracy_score(y_test, predictions)

print(f'the accuracy for our model was {acc_score * 100:.2f}%')

the accuracy for our model was 99.85%


In [195]:
importances = rf_model.feature_importances_

sorted(zip(rf_model.feature_importances_, clean_df.columns), reverse=True)

[(0.4338812548617777, 'time_in_hospital'),
 (0.07558281883050044, 'age_[70-80)'),
 (0.06117045549418373, 'gender_Female'),
 (0.05636757157483222, 'age_[60-70)'),
 (0.05511375977994534, 'gender_Male'),
 (0.04672452346156652, 'race_Caucasian'),
 (0.04442381776364731, 'age_[50-60)'),
 (0.03317842769332179, 'race_Other'),
 (0.03197557464493517, 'race_AfricanAmerican'),
 (0.02994300428025407, 'age_[80-90)'),
 (0.02976524114676486, 'age_[40-50)'),
 (0.02203164051419476, 'age_[90-100)'),
 (0.019188957553409602, 'age_[30-40)'),
 (0.01562001737778435, 'race_Asian'),
 (0.013619423136943356, 'race_Hispanic'),
 (0.012127056858246133, 'age_[0-10)'),
 (0.00964510564373435, 'age_[20-30)'),
 (0.009481734310119647, 'age_[10-20)'),
 (0.00015961507383859805, 'gender_Unknown/Invalid')]

In [196]:
# as a group we have decided recall is more important in this situation


# Calculating a confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual <30", "Actual >30", "Actual NO"], columns=["Predicted <30", "Predicted >30", "Predicted NP"])

cm_df

Unnamed: 0,Predicted <30,Predicted >30,Predicted NP
Actual <30,1605,0,0
Actual >30,1,1227,4
Actual NO,1,0,1134


In [197]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         <30       1.00      1.00      1.00      1605
         >30       1.00      1.00      1.00      1232
          NO       1.00      1.00      1.00      1135

    accuracy                           1.00      3972
   macro avg       1.00      1.00      1.00      3972
weighted avg       1.00      1.00      1.00      3972



In [198]:
test_data = clean_df.loc[0].values.reshape(-1,19)

In [199]:
test_data_scaled = scaler.transform(test_data)

test_data_scaled

array([[0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]])

In [200]:
rf_model.predict(test_data_scaled)

array(['NO'], dtype=object)

In [201]:
len(predictions)

3972

In [202]:
y_test[0]

'>30'

In [203]:
predictions[0]

'>30'

In [204]:
for i in range(11):
    print(f'{predictions[i]}, {y_test[i]}')

>30, >30
>30, >30
NO, NO
<30, <30
NO, NO
NO, NO
>30, >30
>30, >30
NO, NO
>30, >30
NO, NO
