In [1]:
# Import libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import utils
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import warnings
# Suppress every warning raised after this point. The two demo
# `warnings.warn(...)` calls that used to follow were leftovers from a
# tutorial snippet and had no visible effect once the filter was installed.
warnings.filterwarnings('ignore')

In [2]:
# Read the diabetes CSV into a DataFrame and preview its first rows
csv_path = '../input/pima-indians-diabetes-database/diabetes.csv'
dd = pd.read_csv(csv_path)
dd.head()

In [3]:
# Replace 0 values with the column mean for all columns except 'Outcome'
# and 'Pregnancies'.
# NOTE(review): this presumes a 0 in the remaining columns stands for a
# missing measurement rather than a real reading - confirm against the
# dataset description.
for col in dd.columns.drop(['Outcome', 'Pregnancies']):
    # Average only the non-zero entries: including the placeholder zeros
    # would pull the fill value below the mean of the real measurements.
    val = dd.loc[dd[col] != 0, col].mean()
    if pd.isna(val):
        # Column has no non-zero entries, so there is nothing to impute from.
        continue
    dd[col] = dd[col].replace(0, val)

In [4]:
# Preview the first five rows after the zero replacement
dd.head(n=5)

In [5]:
# Count the missing (NaN) entries in each column
dd.isna().sum()

In [6]:
# Outcome distribution: class counts printed as a table and drawn as a pie chart
outcome_counts = dd['Outcome'].value_counts()
outcome = outcome_counts.to_frame()
print(outcome)

# Plot the counts Series directly instead of selecting outcome['Outcome']:
# the column created by value_counts().to_frame() is named 'Outcome' on
# pandas < 2.0 but 'count' from pandas 2.0 on, so the column lookup raises
# KeyError on current pandas.
ax = outcome_counts.plot(kind='pie',
                         figsize=(15, 6),
                         autopct='%1.1f%%',
                         startangle=90,
                         labels=None,         # wedge labels are shown in the legend instead
                         pctdistance=1.12,    # place the percentages just outside the wedges
                         )
# Set the y-label explicitly so it does not depend on the pandas version.
ax.set_ylabel('Outcome')

plt.title('Outcome Distribution', y=1.2)
plt.axis('equal')
plt.legend(labels=outcome.index, loc='upper left')
plt.show()

In [7]:
# Mean of every column, computed separately for each Outcome class
rows_by_outcome = dd.groupby(by='Outcome')
grouped = rows_by_outcome.mean()
grouped

In [8]:
# Plot the pairwise column correlations as an annotated heatmap
corr_matrix = dd.corr()
plt.figure(figsize=(20, 10))
sns.heatmap(corr_matrix, annot=True, cmap="YlGnBu", linewidths=2)

In [9]:
# Glucose vs. BloodPressure, coloured by Outcome; only lower axis limits are set
plt.figure(figsize=(10, 8))
sns.scatterplot(data=dd, x='Glucose', y='BloodPressure', hue='Outcome', alpha=1)
plt.ylim(bottom=40)
plt.xlim(left=60)

Findings:
1. Higher glucose levels can be a good indicator of diabetes.
2. High blood pressure together with high glucose levels can be a good indicator of diabetes.
3. A glucose level below 100, whether blood pressure is high or low, can be a good indicator of not having diabetes.

In [10]:
# BMI vs. Age, coloured by Outcome
plt.figure(figsize=(10, 8))
sns.scatterplot(data=dd, x='BMI', y='Age', hue='Outcome', alpha=1)
plt.xlim((15, 60))
plt.ylim((20, 70))

Findings: 

1. Age by itself isn't the best indicator of diabetes.
2. BMI is a stronger indicator of diabetes than age.
3. Also, someone with a BMI of 25 to 30 who is above age 40 can still develop diabetes.

In [11]:
# Glucose vs. Age, coloured by Outcome, with the Age axis limited to 20-70
plt.figure(figsize=(10, 8))
sns.scatterplot(data=dd, x='Glucose', y='Age', hue='Outcome', alpha=1)
plt.ylim((20, 70))

Finding: 
1. Higher glucose levels, regardless of age, increase the chance of developing diabetes.
2. Maintaining glucose levels below 100 can be a good preventative measure to decrease the chance of developing diabetes.

In [12]:
# Same Glucose vs. Age scatter as the previous cell, but with the Age axis
# extended to 20-80
plt.figure(figsize=(10, 8))
sns.scatterplot(data=dd, x='Glucose', y='Age', hue='Outcome', alpha=1)
plt.ylim((20, 80))

In [13]:
# Print the per-outcome mean of Pregnancies, then show its distribution per outcome
pregnancy_means = grouped['Pregnancies']
print('Mean Value for Pregnancies', pregnancy_means)
plt.figure(figsize=(10, 5))
sns.boxplot(data=dd, x='Outcome', y='Pregnancies')
plt.tight_layout()
plt.show()

Finding:

For outcome 1 (the person has diabetes), the mean number of pregnancies is 4.865 and the box in the box plot is larger, indicating that having more pregnancies can increase the chance of developing diabetes.


In [14]:
# Print the per-outcome mean of Insulin, then show its distribution per outcome
insulin_means = grouped['Insulin']
print('Mean Value for Insulin', insulin_means)
plt.figure(figsize=(10, 5))
sns.boxplot(data=dd, x='Outcome', y='Insulin')
plt.tight_layout()
plt.show()

In [15]:
# Glucose vs. Insulin, coloured by Outcome
plt.figure(figsize=(10, 8))
sns.scatterplot(data=dd, x='Glucose', y='Insulin', hue='Outcome')
plt.ylim((0, 400))
plt.xlim((50, 200))

Finding 

1. There seems to be a positive correlation between insulin and glucose.
2. When both insulin and glucose levels are high, that is a strong indication of diabetes.

In [16]:
# BMI vs. Glucose, coloured by Outcome
plt.figure(figsize=(10, 8))
sns.scatterplot(data=dd, x='BMI', y='Glucose', hue='Outcome')
plt.ylim((50, 200))
plt.xlim((15, 55))

Finding 
1. BMI does not seem to be a reliable indicator of diabetes. The reason is that body mass index (BMI) is a measurement of your overall body mass but isn't a measurement of what that mass consists of.
2. On the other hand, glucose levels do seem to be a reliable indicator of diabetes.

In [17]:
# Separate the predictors (every column but the target) from the target column
target_col = 'Outcome'
x = dd.drop(columns=[target_col])
y = dd[target_col]

In [18]:
# Train/test split followed by feature scaling.
# The split happens first and the scaler is fit on the training rows only,
# so the mean/variance of the test rows never leak into preprocessing.
# test_size and random_state are unchanged from the previous version.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.15, random_state=5)

sc = StandardScaler()
X_train = sc.fit_transform(x_train_raw)   # learn scaling statistics from train only
X_test = sc.transform(x_test_raw)         # reuse the train statistics on the test rows

# Whole feature matrix on the train-fitted scale; kept so the name `X`
# this cell has always defined remains available.
X = sc.transform(x)

In [19]:
# (rows, columns) of the held-out test matrix
np.shape(X_test)

In [20]:
# Build, compile and train the neural network; after every epoch Keras also
# evaluates it on (X_test, y_test), which is what the 'val_*' history holds.

# Seed NumPy and TensorFlow so weight initialisation and dropout are
# repeatable across runs (best effort; 5 mirrors the seed used for the split).
np.random.seed(5)
tf.random.set_seed(5)

# Take the input width from the data instead of hard-coding 8, so the model
# keeps working if the feature set changes.
n_features = X_train.shape[1]

model = tf.keras.Sequential([
  tf.keras.layers.Dense(300, input_dim=n_features, activation='relu'),
  tf.keras.layers.Dropout(.3),
  tf.keras.layers.Dense(150, activation='sigmoid'),
  tf.keras.layers.Dropout(.2),
  # Single sigmoid unit: pairs with the binary cross-entropy loss below.
  tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
histo = model.fit(X_train, y_train, epochs=50, batch_size=64, verbose = 1, validation_data=(X_test, y_test))

In [21]:
# Per-epoch accuracy recorded by model.fit: training curve first, validation second
plt.figure(figsize=(15, 5))
for history_key in ('accuracy', 'val_accuracy'):
    plt.plot(histo.history[history_key])
plt.title('Model accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Test'], loc='upper left')
plt.ylim(bottom=.6)
plt.show()

In [22]:
# Per-epoch loss recorded by model.fit: training curve first, validation second
plt.figure(figsize=(15, 5))
for history_key in ('loss', 'val_loss'):
    plt.plot(histo.history[history_key])
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [23]:
# Hyperparameter tuning for the classical classifiers.
# Each entry maps a display name to an estimator and the parameter grid that
# GridSearchCV searches for it. The dict has its own name (`model_grid`) so it
# no longer overwrites the Keras network stored in `model`.
model_grid = {
    'svm': {
        'model': SVC(),
        'params': {
            'C': [0.1, 1, 10, 100],
            'gamma': [1, 0.1, 0.01, 0.001],
            'kernel': ['rbf', 'poly', 'sigmoid'],
        },
    },
    'Randomforest': {
        # Fixed seed so repeated runs rank the candidates identically.
        'model': RandomForestClassifier(random_state=5),
        'params': {
            'n_estimators': list(range(1, 50)),
            'criterion': ['gini', 'entropy'],
        },
    },
    'Logistic': {
        'model': LogisticRegression(),
        'params': {'C': list(range(1, 10))},
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(random_state=5),
        # `n_neighbors` is a KNN parameter and is rejected by
        # DecisionTreeClassifier; tune the tree depth instead.
        'params': {
            'max_depth': list(range(1, 50)),
            'criterion': ['gini', 'entropy'],
        },
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {'n_neighbors': list(range(1, 50))},
    },
}

# Run one grid search per estimator. Fitting and recording happen inside the
# loop so every model contributes a row, not just the last one constructed.
scores = []
for model_name, mp in model_grid.items():
    clf = GridSearchCV(mp['model'], mp['params'], return_train_score=False)
    clf.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

# One row per estimator: name, winning parameters and their score.
model_pref = pd.DataFrame(scores, columns=['model', 'best_params', 'best_score'])

In [27]:
# Grid-search summary table: one row per entry appended to `scores`, with the
# columns 'model', 'best_params' and 'best_score'.
model_pref

Now let's look at and compare the models without hyperparameter tuning.

In [28]:
# Logistic regression with default settings: test accuracy in percent
reg = LogisticRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred) * 100)

In [29]:
# K-nearest neighbours with default settings: test accuracy in percent
knn = KNeighborsClassifier().fit(X_train, y_train)
y_ped2 = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_ped2) * 100)

In [30]:
# Support vector classifier with default settings: test accuracy in percent
svm = SVC().fit(X_train, y_train)
y_ped3 = svm.predict(X_test)
print(metrics.accuracy_score(y_test, y_ped3) * 100)

In [31]:
# Random forest with default hyperparameters: test accuracy in percent.
# random_state is fixed because the forest is randomised; without it the
# printed accuracy changes on every run.
rfd = RandomForestClassifier(random_state=5)
rfd.fit(X_train, y_train)
y_ped4 = rfd.predict(X_test)
print(accuracy_score(y_test,y_ped4)*100)

In [32]:
# Decision tree with default hyperparameters: test accuracy in percent.
# random_state is fixed because tie-breaking between equally good splits is
# randomised; without it the printed accuracy can change between runs.
dtc = DecisionTreeClassifier(random_state=5)
dtc.fit(X_train, y_train)
y_ped5 = dtc.predict(X_test)
print(accuracy_score(y_test,y_ped5)*100)

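After looking at the models above, you might ask yourself: why do the classifier models without hyperparameter
tuning perform better than the models with parameter tuning?

The simple answer is the parameters you give to GridSearchCV. GridSearchCV loops through the predefined parameters and fits your estimators (models) on your training set, so in the end you can only select the best parameters from the grid you defined. Note also that the two numbers are not measured the same way: GridSearchCV's `best_score_` is the mean cross-validated accuracy on the training data, whereas the untuned models above are scored on the held-out test set, so the comparison is only approximate.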