In [699]:
import pandas as pd
import numpy as np

## General Preprocessing

In [700]:
df = pd.read_csv('diabetic_data.csv')

In [701]:
# Drop additional patient encounters
df.drop_duplicates(subset=['patient_nbr'], keep ='first', inplace=True)

In [702]:
# Drop identifying columns
identifying_columns = ['encounter_id', 'patient_nbr']
df.drop(identifying_columns, axis=1, inplace=True)

In [703]:
# Drop columns with high % of missing data
incomplete_columns = ['weight', 'payer_code', 'medical_specialty']
df.drop(incomplete_columns, axis=1, inplace=True)

In [704]:
# Drop rows with unknown values
df.drop(df[df.race == "?"].index, inplace=True)
df.drop(df[df.gender == "Unknown/Invalid"].index, inplace=True)

# Drop rows with patients who are still in the hospital or expired (dead).

dispos_to_drop = [20, 21, 22, 23, 24, 25, 26 , 27, 28, 29, 40, 41, 49, 11, 13, 14, 19]
for i in dispos_to_drop:
    df.drop(df[df.discharge_disposition_id == i].index, inplace=True)

## Binning diagnoses

In [705]:

df = df[pd.to_numeric(df['diag_1'], errors='coerce').notnull()]
df = df[pd.to_numeric(df['diag_2'], errors='coerce').notnull()]
df = df[pd.to_numeric(df['diag_3'], errors='coerce').notnull()]

# Convert to Float64 dtypes because we want the decimals
df.diag_1 = df.diag_1.astype('float64')
df.diag_2 = df.diag_2.astype('float64')
df.diag_3 = df.diag_3.astype('float64')

In [706]:
# Converting the first diagnosis

for i, row in df.iterrows():
    if (row['diag_1'] >= 1 and row['diag_1'] <= 139): # infectious and parasitic diseases
        df.loc[i, 'diag_1'] = 1
    elif (row['diag_1'] >= 140 and row['diag_1'] <= 239): # neoplasms
        df.loc[i, 'diag_1'] = 2
    elif (row['diag_1'] >= 240 and row['diag_1'] <= 279): # endocrine, nutritional, metabolic & immunity disorders
        df.loc[i, 'diag_1'] = 3
    elif (row['diag_1'] >= 280 and row['diag_1'] <= 289): # diseases of the blood and blood-forming organis
        df.loc[i, 'diag_1'] = 4
    elif (row['diag_1'] >= 290 and row['diag_1'] <= 319): # mental disorders
        df.loc[i, 'diag_1'] = 5
    elif (row['diag_1'] >= 320 and row['diag_1'] <= 389): # diseases of the nervous system and sense organs
        df.loc[i, 'diag_1'] = 6
    elif (row['diag_1'] >= 390 and row['diag_1'] <= 459): # diseases of the circulatory system
        df.loc[i, 'diag_1'] = 7
    elif (row['diag_1'] >= 460 and row['diag_1'] <= 519): # diseases of the respiratory system
        df.loc[i, 'diag_1'] = 8
    elif (row['diag_1'] >= 520 and row['diag_1'] <= 579): # diseases of the digestive system
        df.loc[i, 'diag_1'] = 9
    elif (row['diag_1'] >= 580 and row['diag_1'] <= 629): # diseases of the genitourinary system
        df.loc[i, 'diag_1'] = 10
    elif (row['diag_1'] >= 630 and row['diag_1'] <= 679): # complications of pregnancy, childbirth and the puerperium
        df.loc[i, 'diag_1'] = 11
    elif (row['diag_1'] >= 680 and row['diag_1'] <= 709): # diseases of the skin and subcutaneuous tissue
        df.loc[i, 'diag_1'] = 12
    elif (row['diag_1'] >= 710 and row['diag_1'] <= 739): # diseases of the muscuskeletal system and connective tissue
        df.loc[i, 'diag_1'] = 13
    elif (row['diag_1'] >= 740 and row['diag_1'] <= 759): # congenital anomalies
        df.loc[i, 'diag_1'] = 14
    elif (row['diag_1'] >= 760 and row['diag_1'] <= 779): # certain condtions originating in the perinatal period
        df.loc[i, 'diag_1'] = 15
    elif (row['diag_1'] >= 780 and row['diag_1'] <= 799): # symptoms, signs and ill-defined conditions
        df.loc[i, 'diag_1'] = 16
    elif (row['diag_1'] >= 800 and row['diag_1'] <= 999): # injury and poisoning
        df.loc[i, 'diag_1'] = 17

In [707]:
# Converting the second diagnosis


for i, row in df.iterrows():
    if (row['diag_2'] >= 1 and row['diag_2'] <= 139): # infectious and parasitic diseases
        df.loc[i, 'diag_2'] = 1
    elif (row['diag_2'] >= 140 and row['diag_2'] <= 239): # neoplasms
        df.loc[i, 'diag_2'] = 2
    elif (row['diag_2'] >= 240 and row['diag_2'] <= 279): # endocrine, nutritional, metabolic & immunity disorders
        df.loc[i, 'diag_2'] = 3
    elif (row['diag_2'] >= 280 and row['diag_2'] <= 289): # diseases of the blood and blood-forming organis
        df.loc[i, 'diag_2'] = 4
    elif (row['diag_2'] >= 290 and row['diag_2'] <= 319): # mental disorders
        df.loc[i, 'diag_2'] = 5
    elif (row['diag_2'] >= 320 and row['diag_2'] <= 389): # diseases of the nervous system and sense organs
        df.loc[i, 'diag_2'] = 6
    elif (row['diag_2'] >= 390 and row['diag_2'] <= 459): # diseases of the circulatory system
        df.loc[i, 'diag_2'] = 7
    elif (row['diag_2'] >= 460 and row['diag_2'] <= 519): # diseases of the respiratory system
        df.loc[i, 'diag_2'] = 8
    elif (row['diag_2'] >= 520 and row['diag_2'] <= 579): # diseases of the digestive system
        df.loc[i, 'diag_2'] = 9
    elif (row['diag_2'] >= 580 and row['diag_2'] <= 629): # diseases of the genitourinary system
        df.loc[i, 'diag_2'] = 10
    elif (row['diag_2'] >= 630 and row['diag_2'] <= 679): # complications of pregnancy, childbirth and the puerperium
        df.loc[i, 'diag_2'] = 11
    elif (row['diag_2'] >= 680 and row['diag_2'] <= 709): # diseases of the skin and subcutaneuous tissue
        df.loc[i, 'diag_2'] = 12
    elif (row['diag_2'] >= 710 and row['diag_2'] <= 739): # diseases of the muscuskeletal system and connective tissue
        df.loc[i, 'diag_2'] = 13
    elif (row['diag_2'] >= 740 and row['diag_2'] <= 759): # congenital anomalies
        df.loc[i, 'diag_2'] = 14
    elif (row['diag_2'] >= 760 and row['diag_2'] <= 779): # certain condtions originating in the perinatal period
        df.loc[i, 'diag_2'] = 15
    elif (row['diag_2'] >= 780 and row['diag_2'] <= 799): # symptoms, signs and ill-defined conditions
        df.loc[i, 'diag_2'] = 16
    elif (row['diag_2'] >= 800 and row['diag_2'] <= 999): # injury and poisoning
        df.loc[i, 'diag_2'] = 17

In [708]:
# Converting the third diagnosis


for i, row in df.iterrows():
    if (row['diag_3'] >= 1 and row['diag_3'] <= 139): # infectious and parasitic diseases
        df.loc[i, 'diag_3'] = 1
    elif (row['diag_3'] >= 140 and row['diag_3'] <= 239): # neoplasms
        df.loc[i, 'diag_3'] = 2
    elif (row['diag_3'] >= 240 and row['diag_3'] <= 279): # endocrine, nutritional, metabolic & immunity disorders
        df.loc[i, 'diag_3'] = 3
    elif (row['diag_3'] >= 280 and row['diag_3'] <= 289): # diseases of the blood and blood-forming organis
        df.loc[i, 'diag_3'] = 4
    elif (row['diag_3'] >= 290 and row['diag_3'] <= 319): # mental disorders
        df.loc[i, 'diag_3'] = 5
    elif (row['diag_3'] >= 320 and row['diag_3'] <= 389): # diseases of the nervous system and sense organs
        df.loc[i, 'diag_3'] = 6
    elif (row['diag_3'] >= 390 and row['diag_3'] <= 459): # diseases of the circulatory system
        df.loc[i, 'diag_3'] = 7
    elif (row['diag_3'] >= 460 and row['diag_3'] <= 519): # diseases of the respiratory system
        df.loc[i, 'diag_3'] = 8
    elif (row['diag_3'] >= 520 and row['diag_3'] <= 579): # diseases of the digestive system
        df.loc[i, 'diag_3'] = 9
    elif (row['diag_3'] >= 580 and row['diag_3'] <= 629): # diseases of the genitourinary system
        df.loc[i, 'diag_3'] = 10
    elif (row['diag_3'] >= 630 and row['diag_3'] <= 679): # complications of pregnancy, childbirth and the puerperium
        df.loc[i, 'diag_3'] = 11
    elif (row['diag_3'] >= 680 and row['diag_3'] <= 709): # diseases of the skin and subcutaneuous tissue
        df.loc[i, 'diag_3'] = 12
    elif (row['diag_3'] >= 710 and row['diag_3'] <= 739): # diseases of the muscuskeletal system and connective tissue
        df.loc[i, 'diag_3'] = 13
    elif (row['diag_3'] >= 740 and row['diag_3'] <= 759): # congenital anomalies
        df.loc[i, 'diag_3'] = 14
    elif (row['diag_3'] >= 760 and row['diag_3'] <= 779): # certain condtions originating in the perinatal period
        df.loc[i, 'diag_3'] = 15
    elif (row['diag_3'] >= 780 and row['diag_3'] <= 799): # symptoms, signs and ill-defined conditions
        df.loc[i, 'diag_3'] = 16
    elif (row['diag_3'] >= 800 and row['diag_3'] <= 999): # injury and poisoning
        df.loc[i, 'diag_3'] = 17

## Track the number of medications changed

In [709]:
medicine_columns = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone']

In [710]:
# Create a dataframe from the medication columns
medicine_df = df[medicine_columns]


In [711]:
# Define a function that returns 0 if there was no change and 1 if there was 
def is_changed(x):
    same = ['No', 'Steady']
    return 0 if x in same else 1

In [712]:
# Apply that function to the entire medicine df, creating a data frame of 1s and 0s

medicine_df_temp = medicine_df.apply(lambda x: [is_changed(i) for i in x])

# Create a new column that sums each row
medicine_df_temp['num_meds_changed'] = medicine_df_temp.apply(np.sum, axis=1)


In [713]:
# Validate values 

medicine_df_temp['num_meds_changed'].value_counts()

0    44479
1    13680
2      787
3       63
4        3
Name: num_meds_changed, dtype: int64

In [714]:
# Add num_meds_changed to original df

df['num_meds_changed'] = medicine_df_temp['num_meds_changed']

# Dropping the medicine columns and the change column as num_meds_changed is now measuring the same thing
df.drop(medicine_columns, axis=1, inplace=True)
df.drop('change', axis=1, inplace=True)

In [715]:
# Further validation
medicine_df_temp[medicine_df_temp['num_meds_changed']==4]

Unnamed: 0,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,...,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,num_meds_changed
70821,1,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,4
73369,1,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,4
96599,1,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,4


In [716]:
df.shape

(59012, 22)

## Track total number of visits in previous year

In [717]:
df['total_hosp_visits'] = df['number_emergency'] + df['number_inpatient'] + df['number_outpatient']

In [718]:
df.drop(['number_emergency', 'number_inpatient', 'number_outpatient'], axis=1, inplace=True)

In [719]:
df.shape

(59012, 20)

In [720]:
df.columns

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'diabetesMed', 'readmitted', 'num_meds_changed', 'total_hosp_visits'],
      dtype='object')

## Encoding 

In [721]:
# Encoding age

age_groups = {
    "[0-10)": 0,
    "[10-20)": 1,
    "[20-30)":2,
    "[30-40)":3,
    "[40-50)":4,
    "[50-60)":5,
    "[60-70)":6,
    "[70-80)":7,
    "[80-90)":8,
    "[90-100)":9
}

df['age'] = df['age'].apply(lambda x: age_groups[x])

In [722]:
# Encoding gender

gender_groups = {
    "Female": 0,
    "Male": 1
}

df['gender'] = df['gender'].apply(lambda x: gender_groups[x] )

In [723]:
df.dtypes

race                         object
gender                        int64
age                           int64
admission_type_id             int64
discharge_disposition_id      int64
admission_source_id           int64
time_in_hospital              int64
num_lab_procedures            int64
num_procedures                int64
num_medications               int64
diag_1                      float64
diag_2                      float64
diag_3                      float64
number_diagnoses              int64
max_glu_serum                object
A1Cresult                    object
diabetesMed                  object
readmitted                   object
num_meds_changed              int64
total_hosp_visits             int64
dtype: object

In [724]:
# A1c test result

result_groups = {
    "None": 0,
    "Norm": 1,
    ">7":2,
    ">8":3
}

df['A1Cresult'] = df['A1Cresult'].apply(lambda x: result_groups[x])

In [725]:
# Any diabetes medications prescribed?

meds = {
    "No": 0,
    "Yes": 1
}

df['diabetesMed'] = df['diabetesMed'].apply(lambda x: meds[x])

In [726]:
# Glucose serum test amount

serum_amounts = {
    "None": 0,
    "Norm": 1,
    ">200": 2,
    ">300": 3
}


df['max_glu_serum'] = df['max_glu_serum'].apply(lambda x: serum_amounts[x])

In [727]:
# Was the patient readmitted? If so in how long?

readmitted = {
    "NO": 0,
    "<30": 1,
    ">30": 2
}

df['readmitted'] = df['readmitted'].apply(lambda x: readmitted[x])

## Binning Admission Source Ids

In [728]:
def encode_source_type(x):
    encode_dict = {
        1: "Referral",
        7: "Emergency Room",
    }
    
    return encode_dict[x] if x in encode_dict else "Other"

# Based on this https://www.health.ny.gov/statistics/sparcs/sysdoc/elements_837/source_of_admission.htm

In [729]:
df['admission_source_id'] = df['admission_source_id'].apply(lambda x: encode_source_type(x))

## Binning Admission Type Ids

In [730]:
def encode_type(x):
    
    encode_dict = {
        1: "Emergency",
        2: "Emergency",
        3: "Elective"
    }
    
    return encode_dict[x] if x in encode_dict else "Other"

In [731]:
df['admission_type_id'] = df['admission_type_id'].apply(lambda x: encode_type(x))

In [732]:
def encode_dispos(x):
    
    encode_dict = {
        1: "Discharged to Home"
    }
    
    return encode_dict[x] if x in encode_dict else "Other"

In [733]:
df['discharge_disposition_id'] = df['discharge_disposition_id'].apply(lambda x: encode_dispos(x))

## Turning categorical variables to dummy variables

In [734]:
object_columns = [i for i in df.columns if df[i].dtype == "object"]
dummy_columns = pd.get_dummies(df[object_columns])

df = pd.concat([df, dummy_columns], axis=1).drop(object_columns, axis=1)

In [735]:
df.shape

(59012, 29)

In [736]:
df.columns

Index(['gender', 'age', 'time_in_hospital', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'diabetesMed',
       'readmitted', 'num_meds_changed', 'total_hosp_visits',
       'race_AfricanAmerican', 'race_Asian', 'race_Caucasian', 'race_Hispanic',
       'race_Other', 'admission_type_id_Elective',
       'admission_type_id_Emergency', 'admission_type_id_Other',
       'discharge_disposition_id_Discharged to Home',
       'discharge_disposition_id_Other', 'admission_source_id_Emergency Room',
       'admission_source_id_Other', 'admission_source_id_Referral'],
      dtype='object')

## RFC Test

In [737]:
from sklearn.ensemble import RandomForestClassifier

test_df = df.copy()

X = test_df.drop('readmitted', axis=1).values
y = test_df['readmitted'].values

In [738]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2)

In [739]:
# from imblearn.combine import SMOTEENN
# smote_enn = SMOTEENN(random_state=0)
# X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

In [740]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [741]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=300, random_state=1, max_depth=100, bootstrap=True, max_features=3).fit(X_train_scaled, y_train)

In [743]:
from sklearn.metrics import accuracy_score

predictions = rf_model.predict(X_test_scaled)
acc_score = accuracy_score(y_test, predictions)

acc_score

0.5971363212742523

In [744]:
from sklearn.metrics import confusion_matrix, classification_report

cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1", "Actual 2"], columns=["Predicted 0", "Predicted 1", "Predicted 2"])

cm_df


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,6283,1,638
Actual 1,841,1,254
Actual 2,3018,3,764


In [745]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.62      0.91      0.74      6922
           1       0.20      0.00      0.00      1096
           2       0.46      0.20      0.28      3785

    accuracy                           0.60     11803
   macro avg       0.43      0.37      0.34     11803
weighted avg       0.53      0.60      0.52     11803



In [746]:
importances = rf_model.feature_importances_

sorted(zip(rf_model.feature_importances_, test_df.drop('readmitted', axis=1).columns), reverse=True)

[(0.1357480974985422, 'num_lab_procedures'),
 (0.1190981963668748, 'num_medications'),
 (0.08700798926983289, 'time_in_hospital'),
 (0.08143739603993423, 'diag_2'),
 (0.07936731422351631, 'diag_1'),
 (0.07860367939760604, 'diag_3'),
 (0.06852201608706288, 'age'),
 (0.05949836881636681, 'num_procedures'),
 (0.05887781776804857, 'number_diagnoses'),
 (0.04339913468496559, 'total_hosp_visits'),
 (0.026287924048556857, 'gender'),
 (0.026136987125071075, 'A1Cresult'),
 (0.022823679471580753, 'num_meds_changed'),
 (0.013903380625841913, 'diabetesMed'),
 (0.012583680218589246, 'race_Caucasian'),
 (0.011528381734523486, 'race_AfricanAmerican'),
 (0.009027859046709287, 'admission_source_id_Emergency Room'),
 (0.008717303628821301, 'admission_source_id_Referral'),
 (0.007810367647914496, 'admission_type_id_Emergency'),
 (0.007318926527531439, 'discharge_disposition_id_Discharged to Home'),
 (0.007247173188577783, 'discharge_disposition_id_Other'),
 (0.007129009316899024, 'max_glu_serum'),
 (0.00