In [15]:
import os
import pickle
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score, classification_report

In [16]:
import plotly.graph_objs as go
from plotly.offline import iplot

import matplotlib.pyplot as plt

In [17]:
# Read Dataset
data = pd.read_csv('../Datasets/diabetic_data.csv')

In [18]:
# Do OneHot Encoding
# List of columns that we do not need to one hot encode
no_OH = ['encounter_id', 'patient_nbr', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient',
         'number_emergency', 'number_inpatient', 'number_diagnoses', 'medical_specialty', 'payer_code', 'readmitted']

X_continuous = ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient',
         'number_emergency', 'number_inpatient', 'number_diagnoses']

In [19]:
# Do OneHot encoding of remaining columns
OH = OneHotEncoder()

X_OH = data.drop(no_OH, axis=1)
OH.fit(X_OH)
X_OH = OH.transform(X_OH)
X_OH_df = pd.DataFrame(X_OH.toarray(), columns=OH.get_feature_names())

In [20]:
pickle.dump(OH, open('OH_Encoder.pkl', 'wb'))

In [21]:
sscaler = MinMaxScaler().fit(data[X_continuous])
X_normed = sscaler.transform(data[X_continuous])
X_normed = pd.DataFrame(X_normed, columns=X_continuous)

In [22]:
pickle.dump(sscaler, open('scaler.pkl', 'wb'))

In [23]:
final_df = pd.concat([X_OH_df, X_normed], axis=1)

In [24]:
labels = data['readmitted'].apply(lambda val: 0 if val=='NO' else 1)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(final_df, labels, test_size=0.2)

In [26]:
# Train a random forest model

classifier = RandomForestClassifier(n_jobs=6)
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

# Print results
acc = accuracy_score(y_true=y_test, y_pred=predictions)
print('Acc score of RF Classifier model is: {}'.format(acc))
f1 = f1_score(y_true=y_test, y_pred=predictions, average='macro')
print('F1 score of RF Classifier model is: {}'.format(f1))
prec = precision_score(y_true=y_test, y_pred=predictions, average='macro')
print('Prec score of RF Classifier model is: {}'.format(prec))
recall = recall_score(y_true=y_test, y_pred=predictions, average='macro')
print('Recall score of RF Classifier model is: {}'.format(recall))
print(classification_report(y_true=y_test, y_pred=predictions))

Acc score of RF Classifier model is: 0.644885526186499
F1 score of RF Classifier model is: 0.635606484172873
Prec score of RF Classifier model is: 0.6440806153201699
Recall score of RF Classifier model is: 0.6363995613711011
              precision    recall  f1-score   support

           0       0.65      0.75      0.69     10956
           1       0.64      0.53      0.58      9398

    accuracy                           0.64     20354
   macro avg       0.64      0.64      0.64     20354
weighted avg       0.64      0.64      0.64     20354



In [27]:
pickle.dump(classifier, open('rf_model.pkl', 'wb'))

In [28]:
fi = pd.DataFrame(sorted(zip(final_df.columns, classifier.feature_importances_), key=lambda x: x[1], reverse=True)[:20])
fi.columns = ['feature', 'importance']

# Make plots
bar = go.Bar(x=fi.feature.values, 
             y=fi.importance.values,
             opacity=0.5,
             marker=dict(color="#007dcc", line=dict(color="#002f4c", width=1.5))
            )

layout = go.Layout(title='Random Forest Feature Importances',
                   xaxis=dict(tickangle=45),
                   yaxis=dict(title='Feature Importance'))

fig = go.Figure(data=[bar], layout=layout)
iplot(fig)

### Train AdaBoostClassifier

In [29]:
classifier = AdaBoostClassifier()
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

# Print results
f1 = f1_score(y_true=y_test, y_pred=predictions, average='macro')
print('F1 score of AdaBoost Classifier model is: {}'.format(f1))
prec = precision_score(y_true=y_test, y_pred=predictions, average='macro')
print('Prec score of AdaBoost Classifier model is: {}'.format(prec))
recall = recall_score(y_true=y_test, y_pred=predictions, average='macro')
print('Recall score of AdaBoost Classifier model is: {}'.format(recall))
acc = accuracy_score(y_true=y_test, y_pred=predictions)
print('Acc score of AdaBoost Classifier model is: {}'.format(acc))
print(classification_report(y_true=y_test, y_pred=predictions))

# Save model
pickle.dump(classifier, open('../Models/adaboost_model.pkl', 'wb'))

F1 score of AdaBoost Classifier model is: 0.6277432588159284
Prec score of AdaBoost Classifier model is: 0.6384015662037288
Recall score of AdaBoost Classifier model is: 0.6292725701700183
Acc score of AdaBoost Classifier model is: 0.6387442271789329
              precision    recall  f1-score   support

           0       0.64      0.75      0.69     10956
           1       0.64      0.51      0.56      9398

    accuracy                           0.64     20354
   macro avg       0.64      0.63      0.63     20354
weighted avg       0.64      0.64      0.63     20354



### Logistic Regression

In [30]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

# Print results
f1 = f1_score(y_true=y_test, y_pred=predictions, average='macro')
print('F1 score of Logistic Classifier model is: {}'.format(f1))
prec = precision_score(y_true=y_test, y_pred=predictions, average='macro')
print('Prec score of Logistic Classifier model is: {}'.format(prec))
recall = recall_score(y_true=y_test, y_pred=predictions, average='macro')
print('Recall score of Logistic Classifier model is: {}'.format(recall))
acc = accuracy_score(y_true=y_test, y_pred=predictions)
print('Acc score of Logistic Classifier model is: {}'.format(acc))
print(classification_report(y_true=y_test, y_pred=predictions))

# Save model
pickle.dump(classifier, open('../Models/log_reg_model.pkl', 'wb'))

F1 score of Logistic Classifier model is: 0.633417496172554
Prec score of Logistic Classifier model is: 0.6429811409652676
Recall score of Logistic Classifier model is: 0.6345086570041507
Acc score of Logistic Classifier model is: 0.6434607448167436
              precision    recall  f1-score   support

           0       0.64      0.75      0.69     10956
           1       0.64      0.52      0.57      9398

    accuracy                           0.64     20354
   macro avg       0.64      0.63      0.63     20354
weighted avg       0.64      0.64      0.64     20354




lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



### Neural Network Classifier

In [31]:
classifier = MLPClassifier(hidden_layer_sizes=(10,))
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

# Print results
f1 = f1_score(y_true=y_test, y_pred=predictions, average='macro')
print('F1 score of MLP Classifier model is: {}'.format(f1))
prec = precision_score(y_true=y_test, y_pred=predictions, average='macro')
print('Prec score of MLP Classifier model is: {}'.format(prec))
recall = recall_score(y_true=y_test, y_pred=predictions, average='macro')
print('Recall score of MLP Classifier model is: {}'.format(recall))
acc = accuracy_score(y_true=y_test, y_pred=predictions)
print('Acc score of MLP Classifier model is: {}'.format(acc))
print(classification_report(y_true=y_test, y_pred=predictions))

# Save model
pickle.dump(classifier, open('../Models/nn_model.pkl', 'wb'))

F1 score of MLP Classifier model is: 0.6184112239202084
Prec score of MLP Classifier model is: 0.6216363256276412
Recall score of MLP Classifier model is: 0.6184744588833385
Acc score of MLP Classifier model is: 0.6245946742655006
              precision    recall  f1-score   support

           0       0.64      0.70      0.67     10956
           1       0.61      0.54      0.57      9398

    accuracy                           0.62     20354
   macro avg       0.62      0.62      0.62     20354
weighted avg       0.62      0.62      0.62     20354




Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.



### Gaussian Naive Bayes

In [32]:
classifier = GaussianNB()
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

# Print results
f1 = f1_score(y_true=y_test, y_pred=predictions, average='macro')
print('F1 score of GaussianNB Classifier model is: {}'.format(f1))
prec = precision_score(y_true=y_test, y_pred=predictions, average='macro')
print('Prec score of GaussianNB Classifier model is: {}'.format(prec))
recall = recall_score(y_true=y_test, y_pred=predictions, average='macro')
print('Recall score of GaussianNB Classifier model is: {}'.format(recall))
acc = accuracy_score(y_true=y_test, y_pred=predictions)
print('Acc score of GaussianNB Classifier model is: {}'.format(acc))
print(classification_report(y_true=y_test, y_pred=predictions))

# Save model
pickle.dump(classifier, open('../Models/gnb_model.pkl', 'wb'))

F1 score of GaussianNB Classifier model is: 0.4003574818835807
Prec score of GaussianNB Classifier model is: 0.6071139460954091
Recall score of GaussianNB Classifier model is: 0.5270012220135548
Acc score of GaussianNB Classifier model is: 0.49371130981625233
              precision    recall  f1-score   support

           0       0.74      0.09      0.16     10956
           1       0.48      0.96      0.64      9398

    accuracy                           0.49     20354
   macro avg       0.61      0.53      0.40     20354
weighted avg       0.62      0.49      0.38     20354



### Decision Tree

In [33]:
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

# Print results
f1 = f1_score(y_true=y_test, y_pred=predictions, average='macro')
print('F1 score of DTree Classifier model is: {}'.format(f1))
prec = precision_score(y_true=y_test, y_pred=predictions, average='macro')
print('Prec score of DTree Classifier model is: {}'.format(prec))
recall = recall_score(y_true=y_test, y_pred=predictions, average='macro')
print('Recall score of DTree Classifier model is: {}'.format(recall))
acc = accuracy_score(y_true=y_test, y_pred=predictions)
print('Acc score of DTree Classifier model is: {}'.format(acc))
print(classification_report(y_true=y_test, y_pred=predictions))

# Save model
pickle.dump(classifier, open('../Models/dtree_model.pkl', 'wb'))

F1 score of DTree Classifier model is: 0.5740185202937516
Prec score of DTree Classifier model is: 0.5740780105840358
Recall score of DTree Classifier model is: 0.5739787780035385
Acc score of DTree Classifier model is: 0.5767908027906062
              precision    recall  f1-score   support

           0       0.61      0.61      0.61     10956
           1       0.54      0.54      0.54      9398

    accuracy                           0.58     20354
   macro avg       0.57      0.57      0.57     20354
weighted avg       0.58      0.58      0.58     20354



In [34]:
# Save the train and test datasets
X_train.to_csv('../Datasets/X_train.csv')
y_train.to_csv('../Datasets/y_train.csv')
X_test.to_csv('../Datasets/X_test.csv')
y_test.to_csv('../Datasets/y_test.csv')