# XGBoost Demonstration

Link: https://youtu.be/C1ahSmQalZY

## Import Libraries

In [74]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Import the models used for the comparison

In [75]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

## Data preparation

In [76]:
# Read the diabetes CSV from the local dataset/ folder into a DataFrame,
# then show its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='dataset/diabetes.csv')
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


**Inference**

- All the features except Outcome are independent variables.
- Outcome is the dependent/target variable.
- Outcome column represents whether a patient has diabetes or not.
- 1 means diabetic, 0 means non-diabetic.

<b>Split the data into independent and dependent features</b>

In [77]:
# Select features and target by column name instead of by position, so the
# selection stays correct even if the CSV's columns are reordered or extended.
TARGET_COLUMN = 'Outcome'
X = df.drop(columns=TARGET_COLUMN)  # independent variables: every column except the target
y = df[TARGET_COLUMN]  # dependent variable: 1 = diabetic, 0 = non-diabetic

In [78]:
# Preview the first rows of the feature frame; the Outcome column should not appear.
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [79]:
# Preview the first rows of the target series (the Outcome labels).
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

<b>Split data into train and test set</b>

In [80]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. random_state=0 fixes the shuffle,
# so the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [81]:
# (rows, columns) of the training features.
X_train.shape

(614, 8)

In [82]:
# Number of training labels; should match the row count of X_train.
y_train.shape

(614,)

## Running Various Models

### 1. Logistic Regression

In [83]:
from sklearn.linear_model import LogisticRegression

In [84]:
# Create a logistic-regression classifier with every hyperparameter left at its default.
log_regressor = LogisticRegression()
log_regressor

LogisticRegression()

<b>Training the model</b>

In [85]:
# Fit the logistic-regression classifier on the training split and keep the
# fitted estimator in `model`.
# NOTE(review): the saved output of this cell shows the solver stopping at its
# iteration limit; the warning itself suggests raising max_iter or scaling the
# features. The accuracy reported below comes from that unconverged fit.
model = log_regressor.fit(X_train, y_train)
model

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

<b>Prediction</b>

In [86]:
# Predict a class label (0 or 1) for every row of the held-out test features.
y_prediction = model.predict(X_test)
y_prediction

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
      dtype=int64)

<b>Model evaluation</b>

In [87]:
from sklearn.metrics import accuracy_score

In [88]:
# Share of test rows whose predicted label equals the true label,
# reported as a percentage rounded to two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_prediction)
accuracy_pct = np.round(accuracy * 100, 2)
print(f'Accuracy of Logistic regression model: {accuracy_pct}%')

Accuracy of Logistic regression model: 82.47%


### 2. K Nearest Neighbors Classifier

In [89]:
from sklearn.neighbors import KNeighborsClassifier

In [90]:
# Create a k-nearest-neighbours classifier that uses only the single closest
# training row (k = 1) to decide each prediction.
# NOTE(review): k = 1 on unscaled features is a weak baseline — consider
# trying larger k and/or feature scaling before comparing against other models.
knn=KNeighborsClassifier(n_neighbors=1)
knn

KNeighborsClassifier(n_neighbors=1)

### Model training, prediction and evaluation

In [91]:
# Fit the KNN classifier on the training split.
knn_model = knn.fit(X_train, y_train)
knn_model

KNeighborsClassifier(n_neighbors=1)

In [92]:
# Predict a class label for every row of the held-out test features with KNN.
prediction = knn_model.predict(X_test)
prediction

array([1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [93]:
from sklearn.metrics import accuracy_score, f1_score

In [94]:
# Share of test rows the KNN model labelled correctly, as a percentage
# rounded to two decimals. Overwrites the previous value of `accuracy`.
accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
accuracy_pct = np.round(accuracy * 100, 2)
print(f'Accuracy of KNN model: {accuracy_pct}%')

Accuracy of KNN model: 61.69%


### 3. SVM

In [95]:
from sklearn.svm import SVC

In [96]:
# Create a support-vector classifier with an RBF kernel; all other
# hyperparameters are left at their defaults.
classifier = SVC(kernel='rbf')
classifier

SVC()

### Model training, prediction and evaluation

In [97]:
# Fit the SVM on the training split. This rebinds `model`, which until now
# held the fitted logistic-regression estimator.
model = classifier.fit(X_train, y_train)
model

SVC()

In [98]:
# Predict test labels with the SVM. This overwrites `prediction`, which
# previously held the KNN predictions.
prediction = model.predict(X_test)
prediction

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [99]:
from sklearn.metrics import accuracy_score, f1_score

In [100]:
# Share of test rows the SVM labelled correctly, as a percentage
# rounded to two decimals. Overwrites the previous value of `accuracy`.
accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
accuracy_pct = np.round(accuracy * 100, 2)
print(f'Accuracy of SVM model: {accuracy_pct}%')

Accuracy of SVM model: 79.22%


### 4. XGBOOST CLASSIFIER

In [101]:
from xgboost import XGBClassifier

<b>Split into independent and dependent features</b>

In [102]:
# Select features and target by column name instead of by position, so the
# selection stays correct even if the CSV's columns are reordered or extended.
TARGET_COLUMN = 'Outcome'
X = df.drop(columns=TARGET_COLUMN)  # independent variables: every column except the target
y = df[TARGET_COLUMN]  # dependent variable: 1 = diabetic, 0 = non-diabetic

<b>Split data into train and test set</b>

In [103]:
from sklearn.model_selection import train_test_split

# Re-split the data 80/20 for the XGBoost experiments.
# NOTE(review): this uses seed 1, whereas the split used for the Logistic
# Regression, KNN and SVM models above used random_state=0, so the XGBoost
# models are trained and scored on different rows than those three models.
seed = 1
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
)

In [104]:
# (rows, columns) of the training features after the new split.
X_train.shape

(614, 8)

In [105]:
# (rows, columns) of the held-out test features after the new split.
X_test.shape

(154, 8)

<b>Training the XGBOOST model</b>

The XGBoost model for classification is called XGBClassifier. We can create and fit it to our training dataset. Models are fit using the scikit-learn API and the model.fit() function.

In [106]:
# Build an XGBoost classifier with default hyperparameters and fit it on the
# training split. This rebinds `model`, which previously held the fitted SVM.
model = XGBClassifier()
model.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

<b>Make predictions with the XGBOOST model</b>

In [107]:
# Predict test labels with the default XGBoost model. This overwrites
# `y_prediction`, which previously held the logistic-regression predictions.
y_prediction = model.predict(X_test)
y_prediction

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0],
      dtype=int64)

<b>Evaluate model predictions</b>

In [108]:
from sklearn.metrics import accuracy_score, f1_score

# Share of test rows the default XGBoost model labelled correctly, as a
# percentage rounded to two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_prediction)
accuracy_pct = np.round(accuracy * 100, 2)
print(f'Accuracy of XGBoost model: {accuracy_pct}%')

Accuracy of XGBoost model: 75.32%


**Inference**

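- Accuracy of the Logistic Regression model is 82.47%.
- Accuracy of the KNN model is 61.69%.
- Accuracy of the SVM model is 79.22%.
- Accuracy of the XGBoost model is 75.32%.
- Note: the first three models were evaluated on a train/test split made with `random_state=0`, whereas the XGBoost model was evaluated on a new split made with `random_state=1`, so the XGBoost figure is not directly comparable with the other three.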

### Passing parameters to the XGBOOST model

#### 1. ``eta`` parameter

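``eta`` : the learning rate (step-size shrinkage). It scales how much each newly added tree changes the current prediction: a smaller value makes smaller, more cautious updates. The default value is 0.3.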


In [109]:
# Baseline for the eta experiment: eta is set explicitly to 0.3.
xgb_params = {'eta': 0.3}
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Score the fitted model on the held-out test split.
prediction = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=prediction)





In [110]:
# Report the accuracy from the previous cell as a percentage with two decimals.
accuracy_pct = np.round(accuracy * 100, 2)
print(f'Accuracy of XGBoost model: {accuracy_pct}%')

Accuracy of XGBoost model: 75.32%


In [111]:
# Same experiment with a much smaller learning rate: eta = 0.01.
xgb_params = {'eta': 0.01}
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Score the fitted model on the held-out test split.
prediction = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=prediction)





In [112]:
# Report the accuracy from the previous cell as a percentage with two decimals.
accuracy_pct = np.round(accuracy * 100, 2)
print(f'Accuracy of XGBoost model: {accuracy_pct}%')

Accuracy of XGBoost model: 81.17%


**Inference:**

When we set the ``eta`` parameter to 0.01, the accuracy is around 81%. When we set ``eta`` to 0.3, the accuracy is around 75%.

#### 2. ``gamma`` parameter

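- ``gamma`` is 0 by default. It is the minimum loss reduction required to split a leaf node further, so its job is to control overfitting of the model.
- If we increase gamma, for instance to 50, splits that do not reduce the loss by at least that amount are not made, so the trees inside XGBoost stop growing beyond a certain point. In effect we are pruning the trees and thereby reducing overfitting, but the accuracy comes down in this case.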



In [113]:
# Keep eta at 0.01 and raise gamma to 50.
xgb_params = {'eta': 0.01, 'gamma': 50}
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Score the fitted model on the held-out test split.
prediction = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=prediction)





In [114]:
# Report the accuracy from the previous cell as a percentage with two decimals.
accuracy_pct = np.round(accuracy * 100, 2)
print(f'Accuracy of XGBoost model: {accuracy_pct}%')

Accuracy of XGBoost model: 75.97%


**Inference:**

Accuracy comes down from 81.17% to 75.97% after increasing gamma to 50.

In [115]:
# Keep eta at 0.01 and lower gamma from 50 to 10.
xgb_params = {'eta': 0.01, 'gamma': 10}
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Score the fitted model on the held-out test split.
prediction = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=prediction)





In [116]:
# Report the accuracy from the previous cell as a percentage with two decimals.
accuracy_pct = np.round(accuracy * 100, 2)
print(f'Accuracy of XGBoost model: {accuracy_pct}%')

Accuracy of XGBoost model: 78.57%


**Inference:**

* Accuracy increased from 75.97% to 78.57% after reducing the gamma value from 50 to 10.
* Thus, in this experiment, as the gamma value is reduced the accuracy increases, and vice versa.
* But when a high gamma value is used, we reduce the model's overfitting.
