###  Predict the diabetes status of a female patient given (label) their health measurements

- Attribute Information:

   -  pregnant -  Number of times pregnant 
   -  glucose  -  Plasma glucose concentration a 2 hours in an oral glucose tolerance test 
   -  bp       -  Diastolic blood pressure (mm Hg) 
   -  skin     -  Triceps skin fold thickness (mm) 
   -  insulin  -  2-Hour serum insulin (mu U/ml) 
   -  bmi      -  Body mass index (weight in kg/(height in m)^2) 
   -  pedigree -  Diabetes pedigree function 
   -  age      -  Age (years) 
   -  diabetes - Class variable (0 or 1) 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics

%matplotlib inline

In [2]:
# Gradient boosting
from sklearn.ensemble import GradientBoostingClassifier


In [3]:
col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'diabetes']
pima_data = pd.read_csv('C:\\Users\\jp\\Desktop\\testData\\pima-indians-diabetes.csv', header=None, names=col_names)

In [4]:
pima_data.head()

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
pima_data.shape

(768, 9)

In [5]:
# define features x and resopnse y
X = pima_data[['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age']]
y = pima_data['diabetes']

In [6]:
from sklearn.model_selection import train_test_split

# split train and test data
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=1 )

In [7]:
gb_model = GradientBoostingClassifier()
gb_model

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [8]:
gb_model.fit(x_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [9]:
y_pred = gb_model.predict(x_test)
metrics.accuracy_score(y_pred, y_test)

0.78645833333333337

In [13]:
# SVM Model tuning Gridsearch
gb_model = GradientBoostingClassifier()

param_grid = { "n_estimators"      : [100],             
               "max_features"      : [3, 4, 5],     
               "min_samples_leaf"  : [10, 15],
               "learning_rate"     :[0.1, 0.2, 0.3]
                }

optimized_gb = GridSearchCV(gb_model, param_grid, scoring='accuracy', cv=5 )
optimized_gb.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [100], 'max_features': [3, 4, 5], 'min_samples_leaf': [10, 15], 'learning_rate': [0.1, 0.2, 0.3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [15]:
optimized_gb.best_estimator_

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=3, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=15,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [16]:
optimized_gb.best_score_

0.75520833333333337

In [17]:
gb_model_final = GradientBoostingClassifier(learning_rate=0.1, criterion='friedman_mse', n_estimators=100, min_samples_leaf=15 )
gb_model_final.fit(x_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=15,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [18]:
y_pred = gb_model_final.predict(x_test)
metrics.accuracy_score(y_pred, y_test)

0.80729166666666663