In [206]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.models import model_from_json

In [207]:
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [208]:
df.shape

(768, 9)

In [209]:
y = df['Outcome']
df.drop('Outcome', axis=1, inplace=True)
X = df
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [210]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 50)

# this function helps in plotting the accuracy of the models
name_arr = []
acc_arr = []
def lst_append(name, acc):
  name_arr.append(name)
  acc_arr.append(acc)

model_lst = []  
name_lst = []
def model_perf(model, name):
  model.fit(X_train, y_train)
  model_lst.append(model)
  name_lst.append(name)
  preds = model.predict(X_test)
  acc = round(accuracy_score(y_test, preds), 5)*100
  acc_lst = lst_append(name, acc)
  print('======================================')
  print('Accuracy ' + f'{name}' + ':', acc,'\n')
  print(classification_report(y_test, preds))
  print('======================================')

Log_Reg = LogisticRegression()
model_perf(Log_Reg, 'Logistic Regression')

RFC = RandomForestClassifier()
model_perf(RFC, 'Random Forest')

SGDC = SGDClassifier()
model_perf(SGDC, 'Stochastic Gradient Descent')

svm = SVC()
model_perf(svm, 'Support Vector Machine')

LSVC = LinearSVC()
model_perf(LSVC, 'Linear Support Vector Classification')

GNB = GaussianNB()
model_perf(GNB, 'Naive Bayes')

BNB = BernoulliNB()
model_perf(BNB, 'Bernoulli Naive Bayes')

MNB = MultinomialNB()
model_perf(MNB, 'Multinomial Naive Bayes')

AdaB = AdaBoostClassifier()
model_perf(AdaB, 'AdaBoost')

LGBM = LGBMClassifier()
model_perf(LGBM, 'Light Gradient Boosting Machine')

GBC = GradientBoostingClassifier()
model_perf(GBC, 'Gradient Boost Classifier')

Accuracy Logistic Regression: 72.727 

              precision    recall  f1-score   support

           0       0.76      0.85      0.80       101
           1       0.63      0.49      0.55        53

    accuracy                           0.73       154
   macro avg       0.70      0.67      0.68       154
weighted avg       0.72      0.73      0.72       154

Accuracy Random Forest: 72.727 

              precision    recall  f1-score   support

           0       0.75      0.87      0.81       101
           1       0.65      0.45      0.53        53

    accuracy                           0.73       154
   macro avg       0.70      0.66      0.67       154
weighted avg       0.72      0.73      0.71       154

Accuracy Stochastic Gradient Descent: 65.584 

              precision    recall  f1-score   support

           0       0.66      0.99      0.79       101
           1       0.50      0.02      0.04        53

    accuracy                           0.66       154
   macro 

In [211]:
dict2 = {'Name of the model': name_arr, 'Accuracy': acc_arr}
perf = pd.DataFrame(dict2)
fig = px.bar(perf, x='Name of the model', y='Accuracy')
fig.show()

In [212]:
name_arr1 = []
acc_arr1 = []

def lst_append(name, acc):
  name_arr1.append(name)
  acc_arr1.append(acc)

def train_model(param_dict, model, name):
    grid = GridSearchCV(estimator=model, param_grid=param_dict, n_jobs=-1); 
    grid.fit(X_train, y_train) 
    print(grid.best_params_) 
    param_dict[model] = [grid.best_params_]
    preds = grid.predict(X_test)
    acc = round(accuracy_score(y_test, preds), 5)*100
    acc_lst = lst_append(name, acc)
    print('======================================')
    print('Accuracy ' + f'{name}' + ':', acc,'\n')
    print(classification_report(y_test, preds))
    print('======================================')

In [213]:
LogReg = LogisticRegression()
LogReg_params = {
  'C':[0.1, 1, 10], 
  'penalty':['l1', 'l2', 'elasticnet'], 
  'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
  'n_jobs':[-1], 'max_iter':[100, 150, 200, 250, 500]}
train_model(LogReg_params, LogReg, 'Logistic Regression')

RFC1 = RandomForestClassifier()
RFC1_params = {
  'n_estimators': [100, 150, 200, 250, 500], 
  'criterion': ['gini', 'entropy'], 
  'max_features': [3,5,7, 'auto', 'sqrt', 'log2']}
train_model(RFC1_params, RFC1, 'Random Forest Classifier')

SGDC1 = SGDClassifier()
SGDC1_params = {
  'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'], 
  'penalty':['l1', 'l2', 'elasticnet'], 
  'alpha':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 
  'n_jobs':[-1]}
train_model(SGDC1_params, SGDC1, 'SGDC1')

L_SVC = LinearSVC()
L_SVC_params = {
  'penalty':['l1', 'l2'], 
  'loss':['hinge', 'squared_hinge'], 
  'C':[1, 10, 100], 
  'multi_class':['ovr', 'crammer_singer']}
train_model(L_SVC_params, L_SVC, 'L_SVC')

MNB1 = MultinomialNB()
MNB1_params = {
  'alpha':[1, 10], 
  'fit_prior':['True', 'False']}
train_model(MNB1_params, MNB1, 'MNB')

AdaB1 = AdaBoostClassifier()
AdaB1_params = {
  'n_estimators':[1, 10, 50, 100, 200, 500, 1000], 
  'algorithm':['SAMME', 'SAMME.R']}
train_model(AdaB1_params, AdaB1, 'AdaB1')

GBC1 = GradientBoostingClassifier()
GBC1_params = {
  'loss':['deviance', 'exponential'], 
  'learning_rate':[0.1, 1, 10, 100], 
  'n_estimators':[1, 10, 100, 1000], 
  'criterion':['friedman_mse', 'squared_mse', 'mse', 'mae']}
train_model(GBC1_params, GBC1, 'GBC1')

{'C': 1, 'max_iter': 100, 'n_jobs': -1, 'penalty': 'l2', 'solver': 'newton-cg'}
Accuracy Logistic Regression: 73.37700000000001 

              precision    recall  f1-score   support

           0       0.76      0.87      0.81       101
           1       0.66      0.47      0.55        53

    accuracy                           0.73       154
   macro avg       0.71      0.67      0.68       154
weighted avg       0.72      0.73      0.72       154

{'criterion': 'gini', 'max_features': 3, 'n_estimators': 100}
Accuracy Random Forest Classifier: 68.182 

              precision    recall  f1-score   support

           0       0.73      0.82      0.77       101
           1       0.55      0.42      0.47        53

    accuracy                           0.68       154
   macro avg       0.64      0.62      0.62       154
weighted avg       0.67      0.68      0.67       154

{'alpha': 1000, 'loss': 'squared_hinge', 'n_jobs': -1, 'penalty': 'l2'}
Accuracy SGDC1: 73.37700000000001 

  

In [214]:
# svm1 = SVC()
# param_grid = {'C': [0.1, 1, 10], 'gamma':['scale', 'auto'], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}  
# train_model(param_grid, svm1, 'Support Vector Machine')

In [215]:
dict3 = {'Name of the model': name_arr1, 'Accuracy': acc_arr1}
perf = pd.DataFrame(dict3)
fig1 = px.bar(perf, x='Name of the model', y='Accuracy')
fig1.show()

In [216]:
Model = ['Logistic Regression', 'Random Forest Classifier', 'Stochastic Gradient Descent', 
        'Linear Support Vector Classification', 'Multinomial Naive Bayes',  
        'AdaBoost', 'Gradient Boost Classifier']
Accuracy = list(dict2['Accuracy'])
Accuracy1 = list(dict3['Accuracy'])

fig = go.Figure(data=[
    go.Bar(name='Normal', x=Model, y=[Accuracy[0], Accuracy[1], Accuracy[2], Accuracy[4], Accuracy[7], Accuracy[8], Accuracy[10]]),
    go.Bar(name='Hyperparameter tuned', x=Model, y=[Accuracy1[0], Accuracy1[1], Accuracy1[2], Accuracy1[3], Accuracy1[4], Accuracy1[5], Accuracy1[6]])
])
# Change the bar mode
fig.update_layout(barmode='group')
fig.update_layout(title_text='Comparing the Normally trained models to the Hyperparameter tuned models')
fig.show()

In [217]:
LogReg = LogisticRegression(C=1, max_iter=100, n_jobs=-1, penalty='l2', solver='newton-cg')
model_perf(LogReg, 'Logistic Regression (Hyperparameter tuned)')

RFC1 = RandomForestClassifier(class_weight='balanced_subsample', criterion='entropy', max_features=5, n_estimators=250)
model_perf(RFC1, 'Random Forest Classifier (Hyperparameter tuned)')

SGDC1 = SGDClassifier(alpha=100, loss='squared_hinge', n_jobs=-1, penalty='elasticnet')
model_perf(SGDC1, 'Stochastic Gradient Descent (Hyperparameter tuned)')

L_SVC = LinearSVC(C=1, loss='squared_hinge', multi_class='crammer_singer', penalty='l1')
model_perf(L_SVC, 'Linear Support Vector Machine')

MNB1 = MultinomialNB(alpha=1, fit_prior=True)
model_perf(MNB1, 'Multinomial Naive Bayes (Hyperparameter tuned)')

AdaB1 = AdaBoostClassifier(algorithm='SAMME', n_estimators=200)
model_perf(AdaB1, 'AdaBoost Classifier (Hyperparameter tuned)')

GBC1 = GradientBoostingClassifier(criterion='mse', learning_rate=1, loss='exponential', n_estimators=10)
model_perf(GBC1, 'Gradient Boosting Classifier (Hyperparameter tuned)')

Accuracy Logistic Regression (Hyperparameter tuned): 73.37700000000001 

              precision    recall  f1-score   support

           0       0.76      0.87      0.81       101
           1       0.66      0.47      0.55        53

    accuracy                           0.73       154
   macro avg       0.71      0.67      0.68       154
weighted avg       0.72      0.73      0.72       154

Accuracy Random Forest Classifier (Hyperparameter tuned): 73.37700000000001 

              precision    recall  f1-score   support

           0       0.75      0.88      0.81       101
           1       0.67      0.45      0.54        53

    accuracy                           0.73       154
   macro avg       0.71      0.67      0.68       154
weighted avg       0.72      0.73      0.72       154

Accuracy Stochastic Gradient Descent (Hyperparameter tuned): 69.48100000000001 

              precision    recall  f1-score   support

           0       0.75      0.81      0.78       101
     

In [218]:
network = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(1, 8)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(2, activation='sigmoid')
])

network.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

network.fit(X_train, y_train, epochs=100, batch_size=10)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x1635f8de3d0>

In [219]:
network.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_6 (Flatten)          (None, 8)                 0         
_________________________________________________________________
dense_18 (Dense)             (None, 128)               1152      
_________________________________________________________________
dense_19 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_20 (Dense)             (None, 2)                 130       
Total params: 9,538
Trainable params: 9,538
Non-trainable params: 0
_________________________________________________________________


In [None]:
for


In [220]:
network.evaluate(X_test, y_test)



[0.7778156995773315, 0.6688311696052551]

In [221]:
df1 = pd.DataFrame()
df1['Pregnancies'] = [6]
df1['Glucose'] = [148]
df1['BloodPressure'] = [72]
df1['SkinThickness'] = [35]
df1['Insulin'] = [0]
df1['BMI'] = [33.6]
df1['DiabetesPedigreeFunction'] = [0.627]
df1['Age'] = [50]
df1

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50


In [222]:
model_lst1 = [svm, GNB, BNB, LGBM, LogReg, RFC1, SGDC1, L_SVC, MNB, AdaB1, GBC1]

name_lst1 = ['Support Vector Machine', 'Naive Bayes',
 'Bernoulli Naive Bayes', 
 'Light Gradient Boosting Machine', 
 'Logistic Regression (Hyperparameter tuned)',
 'Random Forest Classifier (Hyperparameter tuned)',
 'Stochastic Gradient Descent (Hyperparameter tuned)',
 'Linear Support Vector Machine (Hyperparameter tuned)',
 'Multinomial Naive Bayes (Hyperparameter tuned)',
 'AdaBoost Classifier (Hyperparameter tuned)',
 'Gradient Boosting Classifier (Hyperparameter tuned)']

lst1 = []
for i in model_lst1:
    lst1.append(i.predict(df1))
m = network.predict(df1)
lst1.append(np.where(m[0] == max(m[0])))
if lst1.count(1)>lst1.count(0):
    print('Patient is likely to be diagnosed with Diabetes')
else:
    print('Patient is unlikely to be diagnosed with Diabetes') 

Patient is likely to be diagnosed with Diabetes


In [223]:
import pickle

for (model, name) in zip(model_lst1, name_lst1):
    filename = name+'.pkl'
    pickle.dump(model, open(filename, 'wb')) 

In [224]:
import h5py

network_json = network.to_json()
with open("network.json", "w") as json_file:
    json_file.write(network_json)
network.save_weights("network.h5")
print("Saved model to disk")

# load json and create model
json_file = open('network.json', 'r')
loaded_network_json = json_file.read()
json_file.close()
loaded_network = model_from_json(loaded_network_json)
# load weights into new model
loaded_network.load_weights("network.h5")
print("Loaded model from disk")

# evaluate loaded model on test data
loaded_network.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
score = loaded_network.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (loaded_network.metrics_names[1], score[1]*100))

Saved model to disk
Loaded model from disk
accuracy: 66.88%
