In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files are present in the input directory.
print(os.listdir(path="../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Data exploration
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



In [None]:
# Load the diabetes data set from the input directory into a DataFrame.
diabetes = pd.read_csv(filepath_or_buffer="../input/diabetes.csv")

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
diabetes.info()

In [None]:
# Preview the first five rows of the raw data.
diabetes.head(n=5)

In [None]:
# Report the (rows, columns) dimensions of the raw data.
print("Diabetes data set dimensions : {}".format((len(diabetes.index), len(diabetes.columns))))

In [None]:
# Number of rows per Outcome class.
diabetes.groupby(by="Outcome").size()

In [None]:
# Count missing values per column. `isnull` is an alias of `isna`, and only the
# last expression of a cell is displayed, so a single call is sufficient.
diabetes.isna().sum()

In [None]:
# Histogram of the BloodPressure column.
plt.hist(diabetes["BloodPressure"])
plt.xlabel("Blood Pressure Level")
plt.show()

In [None]:
# Some rows report a blood pressure of 0, which is implausible for a living
# person; count them per Outcome class.
diabetes.loc[diabetes["BloodPressure"] == 0, "Outcome"].value_counts()

In [None]:
# Histogram of the Glucose column.
plt.hist(diabetes["Glucose"])
plt.xlabel("Glucose Level")
plt.show()

In [None]:
# Zero glucose readings are likely invalid observations; count them per Outcome class.
diabetes.loc[diabetes["Glucose"] == 0, "Outcome"].value_counts()

In [None]:
# Skin fold thickness: per the author's note, values below about 10 mm
# (let alone zero) are not expected, so look at the distribution.
plt.hist(diabetes["SkinThickness"])
plt.xlabel("Skin Thickness")
plt.show()


In [None]:
# Zero skin thickness values are likely invalid observations; count them per Outcome class.
diabetes.loc[diabetes["SkinThickness"] == 0, "Outcome"].value_counts()

In [None]:
# Histogram of the BMI column, used to spot anomalies.
plt.hist(diabetes["BMI"])
plt.xlabel("BMI")
plt.show()

In [None]:
# BMI should not be 0 for a living person; count such rows per Outcome class.
diabetes.loc[diabetes["BMI"] == 0, "Outcome"].value_counts()

In [None]:
# Histogram of the Insulin column, used to spot anomalies.
plt.hist(diabetes["Insulin"])
plt.xlabel("Insulin")
plt.show()

In [None]:
# Insulin also contains zeros; count those rows per Outcome class.
diabetes.loc[diabetes["Insulin"] == 0, "Outcome"].value_counts()

**Here are several ways to handle invalid data values:**

- Ignore/remove these cases: This is often not feasible, because it would mean losing valuable information. In this case the “skin thickness” and “insulin” columns have a lot of invalid points, but removal might work for the “BMI”, “glucose” and “blood pressure” data points.
- Put average/mean values: This might work for some data sets, but in our case putting a mean value into the blood pressure column would send a wrong signal to the model.
- Avoid using features: It is possible to leave the features with a lot of invalid values out of the model. This may work for “skin thickness”, but it is hard to predict that.

In [None]:
# Keep only the rows in which BloodPressure, BMI and Glucose are all non-zero.
diabetes_mod = diabetes[(diabetes[["BloodPressure", "BMI", "Glucose"]] != 0).all(axis=1)]
print(diabetes_mod.shape)


In [None]:
# Predictor columns fed to the models; Outcome is the target.
feature_names = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
X = diabetes_mod.loc[:, feature_names]
y = diabetes_mod["Outcome"]

In [None]:
# let us start with model selection first here. Since this is a classification problem, import all those relevant models/.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Candidate classifiers as (short name, estimator) pairs, using default
# hyperparameters. The tree-based and ensemble estimators are randomized, so
# they get a fixed random_state to keep the reported scores reproducible
# across notebook runs.
models = [
    ('KNN', KNeighborsClassifier()),
    ('SVC', SVC()),
    ('LR', LogisticRegression()),
    ('DT', DecisionTreeClassifier(random_state=0)),
    ('GNB', GaussianNB()),
    ('RF', RandomForestClassifier(random_state=0)),
    ('GB', GradientBoostingClassifier(random_state=0)),
]

Avoid training and testing on the same data, as the goal of the model is to predict out-of-sample data. Hence, we follow:

1. Train/test split
2. K-fold cross-validation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [None]:
# Hold-out split, stratified on Outcome so both parts keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    stratify=diabetes_mod.Outcome,
)

In [None]:
# Class proportions of the target over the full (filtered) data set.
y.value_counts(normalize=True, sort=True)

In [None]:
# Class proportions in the training part, then in the test part.
for target_part in (y_train, y_test):
    print(target_part.value_counts(normalize=True))

**As you can observe, the proportion of 0 and 1 outcomes remains the same after the split, because we passed `stratify = diabetes_mod.Outcome` to `train_test_split`.**

In [None]:
# Fit every candidate on the training part and record its accuracy on the
# held-out test part.
names = [name for name, _ in models]
scores = []
for name, model in models:
    y_pred = model.fit(X_train, y_train).predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))
tr_split = pd.DataFrame({'Name': names, 'Score': scores})
print(tr_split)

In [None]:
# Rank the models from highest to lowest hold-out accuracy.
tr_split.sort_values(by=["Score"], ascending=[False])

In [None]:
# K-fold cross-validation: mean accuracy over 10 held-out folds for each model.
from sklearn.model_selection import KFold

# random_state only has an effect when shuffle=True; recent scikit-learn
# versions raise a ValueError if it is set while shuffle is False. The splitter
# is built once, and the fixed integer seed means every model is scored on the
# same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=10)

names = []
scores = []
for name, model in models:
    score = cross_val_score(model, X, y, cv=kfold, scoring='accuracy').mean()
    names.append(name)
    scores.append(score)
kf_cross_val = pd.DataFrame({'Name': names, 'Score': scores})
print(kf_cross_val)

In [None]:
# Rank the models from highest to lowest cross-validated accuracy.
kf_cross_val.sort_values(by=["Score"], ascending=[False])

In [None]:
# Bar chart of the mean cross-validation accuracy per model.
plt.bar(kf_cross_val.Name, kf_cross_val.Score)
plt.xlabel("Model name")
# The score belongs on the y-axis; a second xlabel call would overwrite "Model name".
plt.ylabel("cross validation accuracy score")
plt.show()

**It looks like Gradient Boosting and Logistic Regression performed better than the others.**

***Let us explore feature engineering and hyperparameter tuning to achieve better accuracy.***

# Feature selection/engineering: let us apply it to logistic regression. Methods that we can use:

1. Univariate feature selection: selecting the features that have the strongest relationship with the output variable.
2. Recursive feature elimination: works by recursively removing attributes and building a model on the attributes that remain. It uses the model accuracy to identify which attributes (and combinations of attributes) contribute the most to predicting the target attribute.
3. PCA: principal component analysis.
4. Feature importance: ensemble models like Random Forest and Extra Trees can be used to estimate the importance of features.


In [None]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
# Recursive feature elimination with 10-fold stratified cross-validation on a
# default logistic regression, dropping one feature per step.
strat_k_fold = StratifiedKFold(n_splits=10)
logreg_model = LogisticRegression()
rfecv = RFECV(
    estimator=logreg_model,
    step=1,
    cv=strat_k_fold,
    scoring="accuracy",
)
rfecv.fit(X, y)

In [None]:
# Show the selected features twice: first as integer indices, then as a boolean mask.
for as_indices in (True, False):
    print(rfecv.get_support(indices=as_indices))

In [None]:
# Mean cross-validation score for each number of features tried. `grid_scores_`
# was removed from RFECV in newer scikit-learn releases in favour of
# `cv_results_`, so prefer the new attribute and fall back on older versions.
(rfecv.cv_results_["mean_test_score"] if hasattr(rfecv, "cv_results_") else rfecv.grid_scores_)

In [None]:
# Plot the mean cross-validation score against the number of features kept.
# `grid_scores_` was removed from RFECV in newer scikit-learn releases in
# favour of `cv_results_`, so prefer the new attribute and fall back on older
# versions.
rfecv_cv_scores = (
    rfecv.cv_results_["mean_test_score"]
    if hasattr(rfecv, "cv_results_")
    else rfecv.grid_scores_
)
plt.figure()
plt.title('Logistic Regression CV score vs No of Features')
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score")
plt.plot(range(1, len(rfecv_cv_scores) + 1), rfecv_cv_scores)
plt.show()

In [None]:
# Number of features RFECV kept for the logistic regression model.
print("Number of best features: ",rfecv.n_features_)

In [None]:
# Ranking of every feature, in column order; the next cell treats rank 1 as "selected".
rfecv.ranking_

In [None]:
# Locate the positions of the top-ranked (rank 1) features, then map those
# positions to column names. These are the features RFECV judged most suitable
# for predicting the outcome variable.
best_feature_indices = np.nonzero(np.equal(rfecv.ranking_, 1))
best_feature_names = X.columns[best_feature_indices]
print("Best feature names: ", best_feature_names)

**Let us compare the model using the original features with the model using the new best features.**

In [None]:
# Compare logistic regression accuracy on all features against the
# RFECV-selected subset, using the same stratified folds for both.
X_new = diabetes_mod.loc[:, best_feature_names]

initial_score = cross_val_score(
    logreg_model, X, y, cv=strat_k_fold, scoring='accuracy'
).mean()
print("Initial accuracy : {} ".format(initial_score))

fe_score = cross_val_score(
    logreg_model, X_new, y, cv=strat_k_fold, scoring='accuracy'
).mean()
print("Accuracy after Feature Selection : {} ".format(fe_score))

## There seems to be a slight increase in accuracy after selecting the best features. All features: 0.7764; best features only: 0.78058.

In [None]:
# Apply the same recursive feature elimination to gradient boosting.
# Gradient boosting is randomized, so a fixed random_state keeps the selected
# feature subset and its scores reproducible across notebook runs.
gb_model = GradientBoostingClassifier(random_state=0)
rfecv_gb = RFECV(gb_model, step=1, cv=strat_k_fold, scoring="accuracy")
rfecv_gb.fit(X, y)

In [None]:
# Summary of the gradient boosting RFECV run. `grid_scores_` was removed from
# RFECV in newer scikit-learn releases in favour of `cv_results_`, so prefer
# the new attribute and fall back on older versions.
rfecv_gb_cv_scores = (
    rfecv_gb.cv_results_["mean_test_score"]
    if hasattr(rfecv_gb, "cv_results_")
    else rfecv_gb.grid_scores_
)
print("Number of Best selected features: ",rfecv_gb.n_features_)
print("Best Features' ranks: ",rfecv_gb.ranking_)
print("Best Features' indices: ",rfecv_gb.get_support(indices=True))
print("Accuracy scores of features selected: ",rfecv_gb_cv_scores)

It looks like RFECV selected 6 features, and the cross-validation score for those 6 features together is 0.78197606.
 

In [None]:
# Column names of the features kept by the gradient boosting RFECV run
# (selected through the boolean support mask).
gb_best_features = X.columns[rfecv_gb.get_support()]

In [None]:
# Display the feature names selected for gradient boosting.
gb_best_features

In [None]:
# Feature matrix restricted to the columns selected for gradient boosting.
X_gb = diabetes_mod.loc[:, gb_best_features]

In [None]:
# Gradient boosting: cross-validated accuracy with all features.
initial_score = cross_val_score(
    gb_model, X, y, cv=strat_k_fold, scoring='accuracy'
).mean()
print("Initial accuracy : {} ".format(initial_score))

# Gradient boosting: cross-validated accuracy with the RFECV-selected features only.
after_score = cross_val_score(
    gb_model, X_gb, y, cv=strat_k_fold, scoring='accuracy'
).mean()
print("Post accuracy : {} ".format(after_score))

## It seems that the Gradient Boosting accuracy is slightly better than that of the Logistic Regression model after recursive feature elimination, so let's go with it. Now, let us do hyperparameter tuning to optimize the model.

In [None]:
#import gridsearch model
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive search over tree depth and minimum split size for gradient
# boosting, scored by accuracy on the stratified folds.
# NOTE(review): min_samples_split values larger than the number of training
# rows leave every tree unsplit; confirm the grid fits the data set size.
params = {'max_depth': range(5, 16, 2), 'min_samples_split': range(200, 1001, 200)}
# The `iid` argument is omitted: it was removed from GridSearchCV and raises a
# TypeError on recent scikit-learn versions.
grid = GridSearchCV(estimator=gb_model, param_grid=params, scoring='accuracy', n_jobs=-1, cv=strat_k_fold)
grid.fit(X_gb, y)
grid.best_estimator_, grid.best_params_, grid.best_score_

# As we can observe, there is only a very slight increase in the accuracy score.