# Using XGBoost

In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
# Load the dataset; the target is the 'Outcome' column.
df = pd.read_csv('diabetes.csv')
y = df['Outcome']

# Standardise the two predictors and wrap the scaled array back into a
# DataFrame so the column names are preserved.
# NOTE(review): the scaler is fit on every row before the train/test split
# further down, so test-set statistics influence the scaling -- consider
# fitting on the training split only.
X_sc = StandardScaler()
X = pd.DataFrame(
    data=X_sc.fit_transform(df[['Glucose', 'BloodPressure']]),
    columns=['Glucose', 'BloodPressure'],
)

## Train Test Split

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

***

## Performance Function

In [47]:
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [48]:
def print_score(clf, X_train, X_test, y_train, y_test, train=True):
    """Print accuracy, classification report and confusion matrix for ``clf``.

    Parameters
    ----------
    clf : estimator
        Object exposing ``predict``; it is expected to be fitted already,
        since ``predict`` is called on it directly. When ``train`` is truthy
        it is also passed to ``cross_val_score``.
    X_train, y_train
        Training features and labels (used when ``train`` is truthy).
    X_test, y_test
        Held-out features and labels (used when ``train`` is falsy).
    train : bool, default True
        Truthy: report on the training data and additionally print the mean
        and standard deviation of 10-fold cross-validated accuracy computed
        on the training data.
        Falsy: report on the test data.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    if train:
        # Training performance.
        print('Train Result : \n')
        # Predict once and reuse the result for all three metrics.
        y_pred = clf.predict(X_train)
        print('Accuracy Score {0:.4f}\n'.format(accuracy_score(y_train, y_pred)))
        print('Classification Report : \n {} \n'.format(classification_report(y_train, y_pred)))
        print('Confusion Metrics : \n {} \n'.format(confusion_matrix(y_train, y_pred)))

        # 10-fold cross-validated accuracy on the training data.
        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print('Average Accuracy : {0:.4f}\n'.format(np.mean(res)))
        print('Accuracy SD : {0:.4f}\n'.format(np.std(res)))

    else:
        # Test performance. A plain ``else`` (rather than ``train == False``)
        # so that falsy values such as None do not silently print nothing.
        print('Test Result : \n')
        y_pred = clf.predict(X_test)
        print('Accuracy Score {0:.4f}\n'.format(accuracy_score(y_test, y_pred)))
        print('Classification Report : \n {}\n'.format(classification_report(y_test, y_pred)))
        print('Confusion Metrics : \n {} \n'.format(confusion_matrix(y_test, y_pred)))
        

## Designing the Model

In [49]:
import xgboost as xgb

In [50]:
# XGBoost classifier: 1000 trees of depth up to 5 with a learning rate of 0.3.
# NOTE(review): in the recorded run, training accuracy (0.9544) is far above
# the 10-fold CV (0.6941) and test (0.6883) accuracy, so these settings
# appear to overfit -- consider revisiting them.
clf = xgb.XGBClassifier(
    learning_rate=0.3,
    n_estimators=1000,
    max_depth=5,
)

In [51]:
# Fit the classifier on the training split; as the cell's last expression,
# the value returned by fit() is what the notebook displays.
clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.3, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [52]:
# Report metrics on the training split (includes the 10-fold CV summary).
print_score(
    clf,
    X_train,
    X_test,
    y_train,
    y_test,
    train=True,
)

Train Result : 

Accuracy Score 0.9544

Classification Report : 
              precision    recall  f1-score   support

          0       0.95      0.98      0.97       401
          1       0.96      0.91      0.93       213

avg / total       0.95      0.95      0.95       614
 

Confusion Metrics : 
 [[393   8]
 [ 20 193]] 

Average Accuracy : 0.6941

Accuracy SD : 0.0663



In [53]:
# Report metrics on the held-out test split.
print_score(
    clf,
    X_train,
    X_test,
    y_train,
    y_test,
    train=False,
)

Test Result : 

Accuracy Score 0.6883

Classification Report : 
              precision    recall  f1-score   support

          0       0.77      0.73      0.75        99
          1       0.56      0.62      0.59        55

avg / total       0.70      0.69      0.69       154


Confusion Metrics : 
 [[72 27]
 [21 34]] 

