In [1]:
#importing libraries into python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sqlalchemy import create_engine
import re
import warnings
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn import ensemble
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score,auc,roc_curve,f1_score,roc_auc_score, precision_score, recall_score, classification_report, confusion_matrix
from scipy.stats import chi2_contingency
from sklearn.neural_network import MLPClassifier
from dask.distributed import Client, progress
import dask.dataframe as dd
import joblib
from dask_ml.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [55]:
client = Client(n_workers=4, threads_per_worker=2, memory_limit='2GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:53668  Dashboard: http://127.0.0.1:53667/status,Cluster  Workers: 4  Cores: 8  Memory: 8.00 GB


In [3]:
#load dataset into python
df = pd.read_csv('diabetic_data.csv')

In [4]:
#all the missing values in the data set are question marks. Replaces all ? with null values
df.replace('?', np.nan,inplace=True)
df.replace('Unknown/Invalid', np.nan,inplace=True)

In [5]:
#dropping columns that are 'ID' 
df = df.drop(['encounter_id', 'patient_nbr','admission_type_id', 'discharge_disposition_id',
         'admission_source_id'], axis=1)

#dropping columns that have high number of missing data
df = df.drop(['weight', 'payer_code', 'medical_specialty'], axis=1)

#drop rows with null values
df = df.dropna(axis=0)

In [6]:
#creating diabetes primary feature
diagnosis_col = ['diag_1', 'diag_2', 'diag_3']
for col in diagnosis_col:
    df[col] = df[col].str.replace('E', '')
    df[col] = df[col].str.replace('V', '')
    df.loc[df[col].str.contains('250'), col] = '250'

#turning diagnosis columns from integer into general diagnosis
for col in diagnosis_col:
    df.loc[df[col] == '250', col] = 'Diabetes'
    df.loc[((df[col] >= '390') & (df[col] <= '459')) | (df[col] == '785'), col] = 'Circulatory'
    df.loc[((df[col] >= '460') & (df[col] <= '519')) | (df[col] == '786'), col] = 'Respiratory'
    df.loc[((df[col] >= '520') & (df[col] <= '579')) | (df[col] == '787'), col] = 'Digestive'
    df.loc[((df[col] >= '800') & (df[col] <= '999')), col] = 'Injury/Poison'
    df.loc[((df[col] >= '710') & (df[col] <= '739')), col] = 'Musculoskeletal'
    df.loc[((df[col] >= '580') & (df[col] <= '629')) | (df[col] == '788'), col] = 'Genitourinary'
    df.loc[((df[col] >= '140') & (df[col] <= '239')), col] = 'Neoplasm'
    df[col] = df[col].where(df[col].str.isalpha(), 'Other')

In [7]:
#Will only be keeping insulin. all other medications are too skewed/dont occur enough
medication_list = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 
                   'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone',
                   'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin',
                   'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone',
                   'metformin-rosiglitazone', 'metformin-pioglitazone']

#creates a feature that combines how long a patient has stayed in the hospital previously
df['time_in_hospital_past'] = df['number_inpatient'] + df['number_emergency'] + df['number_outpatient']
med_vals = ['Steady', 'Up', 'Down']

#creates a feature that shows how many diabetes medication the patient is on
df['number_diabetes_meds'] = np.nan
for i in range(len(df)):
    num = 0
    for med in medication_list:
        if df[med].iloc[i] in med_vals:
            num += 1
    df['number_diabetes_meds'].iloc[i] = num

In [8]:
#data cleaning
df.loc[df['readmitted'] == '>30', 'readmitted'] = 'Yes'
df.loc[df['readmitted'] == '<30', 'readmitted'] = 'Yes'
df.loc[df['A1Cresult'] == '>8', 'A1Cresult'] = 'Taken'
df.loc[df['A1Cresult'] == '>7', 'A1Cresult'] = 'Taken'
df.loc[df['A1Cresult'] == 'Norm', 'A1Cresult'] = 'Taken'
df.loc[df['A1Cresult'] == 'None', 'A1Cresult'] = 'Not Taken'
df.loc[df['change'] == 'Ch', 'change'] = 'yes'
df.loc[df['age'] == '[0-10)', 'age'] = '0-29'
df.loc[df['age'] == '[10-20)', 'age'] = '0-29'
df.loc[df['age'] == '[20-30)', 'age'] = '0-29'
df.loc[df['age'] == '[30-40)', 'age'] = '30-59'
df.loc[df['age'] == '[40-50)', 'age'] = '30-59'
df.loc[df['age'] == '[50-60)', 'age'] = '30-59'
df.loc[df['age'] == '[60-70)', 'age'] = '60-99'
df.loc[df['age'] == '[70-80)', 'age'] = '60-99'
df.loc[df['age'] == '[80-90)', 'age'] = '60-99'
df.loc[df['age'] == '[90-100)', 'age'] = '60-99'


numeric_data = df.select_dtypes(include=[np.number])
categorical_data = df.select_dtypes(exclude=[np.number])

In [9]:
#drops all medcations except insulin. and other     
df = df.drop(['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 
              'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone',
              'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
              'glyburide-metformin', 'glipizide-metformin','glimepiride-pioglitazone',
              'metformin-rosiglitazone', 'metformin-pioglitazone','diag_2', 'diag_3', 'max_glu_serum', 
              'race','diabetesMed','number_inpatient', 'number_outpatient', 'number_emergency','age'], axis=1)

In [10]:
col_to_dummy = ['gender','change','insulin','A1Cresult','diag_1']

numeric_features = ['num_lab_procedures','number_diabetes_meds','time_in_hospital_past',
              'time_in_hospital', 'num_procedures', 'num_medications',
              'number_diagnoses']

final_df = df.copy()

final_df = pd.concat([df, pd.get_dummies(df[col_to_dummy], drop_first=True)], axis=1)
final_df = final_df.drop(col_to_dummy, axis=1)

In [11]:
final_df = dd.from_pandas(final_df, npartitions=4)
final_df.compute()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_diagnoses,readmitted,time_in_hospital_past,number_diabetes_meds,gender_Male,change_yes,...,insulin_Steady,insulin_Up,A1Cresult_Taken,diag_1_Diabetes,diag_1_Digestive,diag_1_Genitourinary,diag_1_Musculoskeletal,diag_1_Neoplasm,diag_1_Other,diag_1_Respiratory
1,3,59,0,18,9,Yes,0,1.0,0,1,...,0,1,0,0,0,0,0,0,1,0
2,2,11,5,13,6,NO,3,1.0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,2,44,1,16,7,NO,0,1.0,1,1,...,0,1,0,0,0,0,0,0,1,0
4,1,51,0,8,5,NO,0,2.0,1,1,...,1,0,0,0,0,0,0,1,0,0
5,3,31,6,16,9,Yes,0,1.0,1,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,3,51,0,16,9,Yes,0,2.0,1,1,...,0,0,1,1,0,0,0,0,0,0
101762,5,33,3,18,9,NO,1,1.0,0,0,...,1,0,0,0,1,0,0,0,0,0
101763,1,53,0,9,13,NO,1,2.0,1,1,...,0,0,0,0,0,0,0,0,1,0
101764,10,45,2,21,9,NO,1,3.0,0,1,...,0,1,0,0,0,0,0,0,1,0


In [12]:
y = final_df.readmitted
x = final_df.drop('readmitted', axis=1)

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=30)

In [14]:
x_train.persist()
x_test.persist()
y_train.persist()
y_test.persist()

Dask Series Structure:
npartitions=4
1         object
25575        ...
51370        ...
76380        ...
101765       ...
Name: readmitted, dtype: object
Dask Name: split, 4 tasks

In [15]:
from dask_ml.preprocessing import StandardScaler
scaler = StandardScaler()


x_train[numeric_features] = scaler.fit_transform(x_train[numeric_features])
x_test[numeric_features] = scaler.transform(x_test[numeric_features])

In [16]:
x_train.compute()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_diagnoses,time_in_hospital_past,number_diabetes_meds,gender_Male,change_yes,insulin_No,insulin_Steady,insulin_Up,A1Cresult_Taken,diag_1_Diabetes,diag_1_Digestive,diag_1_Genitourinary,diag_1_Musculoskeletal,diag_1_Neoplasm,diag_1_Other,diag_1_Respiratory
1,-0.477565,0.800595,-0.791821,0.228720,0.812541,-0.529134,-0.194540,0,1,0,0,1,0,0,0,0,0,0,1,0
2,-0.811207,-1.628682,2.134700,-0.386544,-0.822606,0.774052,-0.194540,0,0,1,0,0,0,0,0,0,0,0,1,0
3,-0.811207,0.041446,-0.206516,-0.017386,-0.277557,-0.529134,-0.194540,1,1,0,0,1,0,0,0,0,0,0,1,0
4,-1.144850,0.395715,-0.791821,-1.001808,-1.367655,-0.529134,0.889761,1,1,0,1,0,0,0,0,0,0,1,0,0
5,-0.477565,-0.616483,2.720004,-0.017386,0.812541,-0.529134,-0.194540,1,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101760,0.523363,0.092056,-0.206516,1.090090,0.812541,2.077239,0.889761,0,1,0,0,0,0,0,0,0,0,0,1,0
101761,-0.477565,0.395715,-0.791821,-0.017386,0.812541,-0.529134,0.889761,1,1,0,0,0,1,1,0,0,0,0,0,0
101762,0.189720,-0.515263,0.964092,0.228720,0.812541,-0.094739,-0.194540,0,0,0,1,0,0,0,1,0,0,0,0,0
101763,-1.144850,0.496935,-0.791821,-0.878755,2.992737,-0.094739,0.889761,1,1,0,0,0,0,0,0,0,0,0,1,0


In [17]:
neighbors = KNeighborsClassifier(n_neighbors=5)

with joblib.parallel_backend('dask'):
    neighbors.fit(x_train.compute(), y_train.compute())
    neighbors_score = neighbors.score(x_test.compute(),y_test.compute())
    neighbors_cv = cross_val_score(neighbors, x.compute(), y.compute(), cv=5)

In [18]:
print('Accuracy score: ', neighbors_score)
print('CV score: ', neighbors_cv)

Accuracy score:  0.5656956610307599
CV score:  [0.50456377 0.5577482  0.54783274 0.55935747 0.56236614]


In [19]:
neighbors_weighted = KNeighborsClassifier(n_neighbors=313, weights = 'distance')

with joblib.parallel_backend('dask'):
    neighbors_weighted.fit(x_train.compute(), y_train.compute())
    neighbors_weighted_score = neighbors_weighted.score(x_test.compute(),y_test.compute())
    neighbors_weighted_cv = cross_val_score(neighbors_weighted, x.compute(), y.compute(), cv=5)

In [24]:
%%time
print('Accuracy score: ', neighbors_weighted_score)
print('CV score: ', neighbors_weighted_cv)

Accuracy score:  0.6088307190873905
CV score:  [0.52552139 0.6000204  0.58220296 0.60249873 0.60790413]
CPU times: user 945 µs, sys: 610 µs, total: 1.56 ms
Wall time: 1.08 ms


In [27]:
svm = SVC()


In [28]:
%%time

with joblib.parallel_backend('dask'):
    svm.fit(x_train.compute(), y_train.compute())
    svm_score = svm.score(x_test.compute(), y_test.compute())
    svm_cv = cross_val_score(svm,x.compute(),y.compute(),cv=5)
    y_pred_svm = svm.predict(x_test.compute())
    


CPU times: user 12min 23s, sys: 13.5 s, total: 12min 36s
Wall time: 21min 3s


In [36]:
print('Accuracy score: ', svm_score)
print('CV score: ', svm_cv)

Accuracy score:  0.6229374618048482
CV score:  [0.58043955 0.5969099  0.58021418 0.61351351 0.61922489]


In [29]:
print("Accuracy is {0:.2f}".format(accuracy_score(y_test.compute(), y_pred_svm)))
print(classification_report(y_test, y_pred_svm))

Accuracy is 0.62
              precision    recall  f1-score   support

          NO       0.62      0.74      0.68     10387
         Yes       0.63      0.49      0.55      9249

    accuracy                           0.62     19636
   macro avg       0.62      0.62      0.61     19636
weighted avg       0.62      0.62      0.62     19636



In [56]:
#boosted classifier
params = {'n_estimators': 100}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)

In [61]:
%%time

with joblib.parallel_backend('dask'):
    clf.fit(x_train.compute(), y_train.compute())
    clf_score = clf.score(x_train.compute(), y_train.compute())
    clf_cv = cross_val_score(clf,x.compute(),y.compute(),cv=5)

CPU times: user 14.3 s, sys: 827 ms, total: 15.1 s
Wall time: 29.5 s


In [65]:
print('Accuracy score: ', clf_score)
print('CV score: ', clf_cv)

Accuracy score:  0.622066925117323
CV score:  [0.61898934 0.61526694 0.60015298 0.62437532 0.627231  ]


In [66]:
feature_importances = pd.DataFrame(clf.feature_importances_,
                                   index = x_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances)

                        importance
time_in_hospital_past     0.699170
number_diagnoses          0.067719
num_medications           0.043004
num_lab_procedures        0.037242
num_procedures            0.034116
number_diabetes_meds      0.033938
time_in_hospital          0.028129
diag_1_Neoplasm           0.017593
diag_1_Diabetes           0.012636
insulin_Steady            0.007679
diag_1_Other              0.005544
A1Cresult_Taken           0.003054
diag_1_Musculoskeletal    0.002595
diag_1_Respiratory        0.002333
diag_1_Genitourinary      0.001655
gender_Male               0.001421
diag_1_Digestive          0.000922
insulin_Up                0.000574
insulin_No                0.000489
change_yes                0.000185


In [67]:
y_pred_clf = clf.predict(x_test.compute())
print("Accuracy is {0:.2f}".format(accuracy_score(y_test.compute(), y_pred_clf)))
print(classification_report(y_test.compute(), y_pred_clf))

Accuracy is 0.62
              precision    recall  f1-score   support

          NO       0.62      0.74      0.67     10387
         Yes       0.63      0.49      0.55      9249

    accuracy                           0.62     19636
   macro avg       0.62      0.61      0.61     19636
weighted avg       0.62      0.62      0.62     19636



In [109]:
params = {'n_estimators': [100,300],
          'max_features':[3,5,7],}

clf_hp = ensemble.GradientBoostingClassifier()
grid_search = GridSearchCV(estimator = clf_hp, param_grid = params, 
                          cv = 5, n_jobs = -1, verbose = 2,scoring='roc_auc')

In [110]:
%%time

with joblib.parallel_backend('dask'):
    grid_search.fit(x_train.compute(), y_train.compute())

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend DaskDistributedBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   55.4s finished


CPU times: user 52.7 s, sys: 4.5 s, total: 57.2 s
Wall time: 3min 47s


In [112]:
clf = ensemble.GradientBoostingClassifier(**grid_search.best_params_)

with joblib.parallel_backend('dask'):
    clf.fit(x_train.compute(), y_train.compute())
    clf_score = clf.score(x_train.compute(), y_train.compute())
    clf_cv = cross_val_score(clf,x.compute(),y.compute(),cv=5)

In [114]:
print('Accuracy score: ', clf_score)
print('CV score: ', clf_cv)
feature_importances = pd.DataFrame(clf.feature_importances_,
                                   index = x_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances)

Accuracy score:  0.6253188124872475
CV score:  [0.61419611 0.61751058 0.6027027  0.63074962 0.62707802]
                        importance
time_in_hospital_past     0.605764
number_diagnoses          0.084716
num_medications           0.063158
num_lab_procedures        0.048949
time_in_hospital          0.043043
number_diabetes_meds      0.042469
num_procedures            0.036423
diag_1_Neoplasm           0.018508
diag_1_Diabetes           0.015006
insulin_Steady            0.006375
A1Cresult_Taken           0.006097
diag_1_Other              0.005386
change_yes                0.004206
diag_1_Musculoskeletal    0.003837
insulin_Up                0.003457
diag_1_Respiratory        0.003346
gender_Male               0.002849
diag_1_Genitourinary      0.002563
diag_1_Digestive          0.002028
insulin_No                0.001820


In [115]:
y_pred_clf = clf.predict(x_test.compute())
print("Accuracy is {0:.2f}".format(accuracy_score(y_test.compute(), y_pred_clf)))
print(classification_report(y_test.compute(), y_pred_clf))

Accuracy is 0.62
              precision    recall  f1-score   support

          NO       0.62      0.74      0.67     10387
         Yes       0.63      0.49      0.55      9249

    accuracy                           0.62     19636
   macro avg       0.62      0.62      0.61     19636
weighted avg       0.62      0.62      0.62     19636

