# Load initial packages

In [None]:
from preprocess import PreProcess, Scale
import pandas as pd
import numpy as np
import warnings
# Silence every Python warning for the rest of the session. Note that this is a
# blanket filter: any deprecation or fitting warnings raised by the libraries
# used below will be hidden as well.
warnings.filterwarnings('ignore')


# Raw Data

In [7]:
# Show the first five rows of the untouched CSV so they can be compared with
# the processed frame further down.
pd.read_csv('diabetic_data.csv').iloc[:5]

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


# Processed data

In [8]:
# Build the project's PreProcess object (from the local `preprocess` module)
# on the raw CSV path.
x = PreProcess('diabetic_data.csv')
# Take its processed frame; the exact cleaning/encoding steps live in
# preprocess.py and are not visible from this notebook.
df = x.processed()
# Preview the first rows of the processed frame.
df.head()

Unnamed: 0,Hispanic,Caucasian,Asian,Other,AfricanAmerican,gender,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,diabetesMed,readmitted
0,0.0,0.0,1.0,0.0,0.0,1,1,1,41,0,1,0,0,0,1,0,0
1,0.0,0.0,1.0,0.0,0.0,1,2,3,59,0,18,0,0,0,9,1,0
2,0.0,0.0,0.0,0.0,1.0,1,3,2,11,5,13,2,0,1,6,1,0
3,0.0,0.0,1.0,0.0,0.0,0,4,2,44,1,16,0,0,0,7,1,0
4,0.0,0.0,1.0,0.0,0.0,0,5,1,51,0,8,0,0,0,5,1,0


# Check for null values

In [9]:
# Names of the columns that still hold at least one missing value
# (an empty list means the processed frame is fully populated).
df.isna().any().loc[lambda has_na: has_na].index.tolist()

[]

# Split into train/test sets, optionally oversampling to balance the classes

In [10]:
# Shuffle once with a fixed seed so the 75/25 split, and every metric computed
# from it below, is reproducible across kernel restarts.
SPLIT_SEED = 42
shuffled = df.sample(frac=1, random_state=SPLIT_SEED)

# Slice with .iloc instead of np.split: np.split on a DataFrame goes through
# DataFrame.swapaxes, which recent pandas releases deprecate.
n_train = int(0.75 * len(shuffled))
train, test = shuffled.iloc[:n_train], shuffled.iloc[n_train:]

# Scale is the project helper from the local `preprocess` module; scaling and
# oversampling are both switched off here.
# NOTE(review): presumably returns (frame, feature array, label array) -- confirm
# against preprocess.py.
train, X_train, y_train = Scale(train, scale=False, oversample=False)
test, X_test, y_test = Scale(test, scale=False, oversample=False)

# Decision tree with and without Grid search hyperparameter tuning

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# Tune a decision tree over tree size and split threshold with 3-fold
# cross-validation.
model = DecisionTreeClassifier()

# 98 values of max_leaf_nodes x 3 values of min_samples_split = 294 candidates,
# each fitted once per fold.
params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}

clf = GridSearchCV(model, param_grid=params, cv=3, verbose=1)
clf.fit(X_train, y_train)

# A bare expression in the middle of a cell is never displayed, so print the
# winning estimator explicitly.
print(clf.best_estimator_)

# predict() delegates to the best estimator found by the search.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Baseline: a decision tree with scikit-learn's default hyperparameters.
# fit() returns the estimator itself, so construction and training are chained.
dtc = DecisionTreeClassifier().fit(X_train, y_train)

# Score on the held-out set: per-class report first, then the confusion matrix.
y_pred = dtc.predict(X_test)
print(classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred), sep='\n')

              precision    recall  f1-score   support

           0       0.89      0.87      0.88     22097
           1       0.14      0.17      0.15      2776

    accuracy                           0.79     24873
   macro avg       0.51      0.52      0.52     24873
weighted avg       0.81      0.79      0.80     24873

[[19190  2907]
 [ 2313   463]]


In [None]:
# Ad-hoc inspection of the first entry of X_test.
# NOTE(review): assumes X_test supports positional indexing (e.g. a NumPy
# array) -- confirm against what Scale returns.
X_test[0]

# Logistic regression

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Logistic regression with the iteration cap set to 5000, well above
# scikit-learn's default of 100.
log_model = LogisticRegression(max_iter=5000).fit(X_train, y_train)

# Score on the held-out set: per-class report first, then the confusion matrix.
y_pred = log_model.predict(X_test)
print(classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred), sep='\n')

              precision    recall  f1-score   support

           0       0.89      1.00      0.94     22097
           1       0.54      0.01      0.03      2776

    accuracy                           0.89     24873
   macro avg       0.71      0.51      0.48     24873
weighted avg       0.85      0.89      0.84     24873

[[22064    33]
 [ 2738    38]]


# K nearest neighbors

In [13]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

# k-nearest neighbours classifier voting over the 10 closest training samples.
knn_model = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)

# Score on the held-out set: per-class report first, then the confusion matrix.
y_pred = knn_model.predict(X_test)
print(classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred), sep='\n')

              precision    recall  f1-score   support

           0       0.89      1.00      0.94     22097
           1       0.42      0.01      0.02      2776

    accuracy                           0.89     24873
   macro avg       0.66      0.50      0.48     24873
weighted avg       0.84      0.89      0.84     24873

[[22067    30]
 [ 2754    22]]


# Random forest classifier with and without hyperparameter tuning

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Baseline: a random forest with scikit-learn's default hyperparameters.
rf = RandomForestClassifier().fit(X_train, y_train)

# Score on the held-out set: per-class report first, then the confusion matrix.
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred), sep='\n')

              precision    recall  f1-score   support

           0       0.89      1.00      0.94     22097
           1       0.36      0.02      0.03      2776

    accuracy                           0.89     24873
   macro avg       0.63      0.51      0.48     24873
weighted avg       0.83      0.89      0.84     24873

[[22022    75]
 [ 2733    43]]


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.metrics import confusion_matrix, classification_report

# Randomized hyperparameter search for a random forest, fitted on a random
# 10,000-row subsample of the training data to keep the search affordable.
SEARCH_SEED = 42

rf = RandomForestClassifier()

# Sample the number of trees from [50, 500) and the maximum depth from [1, 20).
param_dist = {'n_estimators': randint(50, 500),
              'max_depth': randint(1, 20)}

rand_search = RandomizedSearchCV(rf,
                                 param_distributions=param_dist,
                                 n_iter=5,
                                 cv=5,
                                 random_state=SEARCH_SEED)

# Draw ONE permutation and apply it to both features and labels. Shuffling
# X_train and y_train with two independent np.random.shuffle calls pairs each
# row with a random label, and it also scrambles the shared training arrays in
# place for every other cell. Indexing into copies keeps both intact.
# NOTE(review): assumes X_train / y_train are array-like with matching length.
rng = np.random.default_rng(SEARCH_SEED)
subset_idx = rng.permutation(len(X_train))[:10000]
X_search = np.asarray(X_train)[subset_idx]
y_search = np.asarray(y_train)[subset_idx]

rand_search.fit(X_search, y_search)
y_pred = rand_search.predict(X_test)

best_rf = rand_search.best_estimator_

print('Best hyperparameters:',  rand_search.best_params_, best_rf)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))