# Introduction

 <div class="alert alert-warning">
<font color=black>

**What?** XGBoost hyperparameters
    
</font>
</div>

# Import python modules

In [52]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.model_selection import train_test_split

# Load the dataset

<div class="alert alert-info">
<font color=black>

- Classification problem 
- has heart disease = **1**
- does not have heart disease = **0**

</font>
</div>

In [4]:
# Read the heart-disease table from disk and preview its first three rows
csv_path = '../DATASETS/heart_disease.csv'
df = pd.read_csv(csv_path)
df.head(3)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1


In [2]:
# Print per-column dtypes and non-null counts to check for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [8]:
# The label is stored in the last column; every column before it is a feature
last_col = df.shape[1] - 1
y = df.iloc[:, last_col]
X = df.iloc[:, :last_col]

# Obtain a baseline score

<div class="alert alert-info">
<font color=black>

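- We'll use two different ways to split the data. The second strategy, i.e. a shuffled stratified k-fold, includes the
same percentage of target values in each fold.
- Note: for a classifier, passing an integer such as `cv=5` to `cross_val_score` already uses a non-shuffled `StratifiedKFold`, so the practical difference between the two strategies is that the second one shuffles the rows before splitting.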

</font>
</div>

In [None]:
# Baseline classifier: tree booster with a logistic objective and a fixed seed
model = XGBClassifier(
    booster='gbtree',
    objective='binary:logistic',
    random_state=2,
)

In [18]:
# Score the baseline model with 5-fold cross-validation
scores = cross_val_score(model, X, y, cv=5)
rounded_scores = np.round(scores, 2)
mean_accuracy = scores.mean()
# Report the per-fold accuracy followed by its mean
print('Accuracy:', rounded_scores)
print('Accuracy mean: %0.2f' % mean_accuracy)

Accuracy: [0.84 0.85 0.82 0.8  0.77]
Accuracy mean: 0.81


In [23]:
# Count rows whose label (last column) equals 0 (True) versus not (False)
df.iloc[:, -1].eq(0.0).value_counts()

False    165
True     138
Name: target, dtype: int64

<div class="alert alert-info">
<font color=black>

- The dataset is slightly imbalanced, so a stratified approach may be a better option.

</font>
</div>

In [19]:
# Shuffled, stratified 5-fold splitter with a fixed seed so folds are repeatable
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
# Score the same baseline model on these folds
scores = cross_val_score(model, X, y, cv=kfold)
rounded_scores = np.round(scores, 2)
mean_accuracy = scores.mean()
# Report the per-fold accuracy followed by its mean
print('Accuracy:', rounded_scores)
print('Accuracy mean: %0.2f' % mean_accuracy)

Accuracy: [0.72 0.82 0.75 0.8  0.82]
Accuracy mean: 0.78


<div class="alert alert-info">
<font color=black>

- **The score has gone down. What does this mean?** It's important not to become too invested in obtaining the highest possible score. In this case, we trained 
the same XGBClassifier model on different folds and obtained different scores. This shows the importance of
being consistent with test folds when training models, and why the score is not necessarily the most important
thing. 
- Although when choosing between models, obtaining the best possible score is an optimal strategy, the 
difference in scores here reveals that the model is not necessarily better. In this case, the two models have 
the same hyperparameters, and the difference in scores is attributed to the different folds.
- The point here is to use the same folds to obtain new scores when fine-tuning hyperparameters

</font>
</div>

# Perform hyperparameters tuning

<div class="alert alert-info">
<font color=black>

- **GridSearchCV** searches all possible combinations in a hyperparameter grid to find the best results. 
- **RandomizedSearchCV** selects 10 random hyperparameter combinations by default. You can change this by using the `n_iter` parameter. RandomizedSearchCV is typically used when GridSearchCV becomes unwieldy because there are too many hyperparameter combinations
to exhaustively check each one.

</font>
</div>

In [25]:
# Shuffled, stratified 5-fold splitter with a fixed seed
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=2,
)

In [49]:
def grid_search(params, random=False, n_iter=20):
    """Search XGBClassifier hyperparameters and print the best result.

    The search is fitted on the notebook-level ``X`` and ``y`` using a
    shuffled, stratified 5-fold split with a fixed seed, so every call is
    scored on the same folds.

    Parameters
    ----------
    params : dict
        Hyperparameter names mapped to the list of values to try.
    random : bool, default False
        If True, use ``RandomizedSearchCV``; otherwise use ``GridSearchCV``,
        which tries every combination in ``params``.
    n_iter : int, default 20
        Number of candidates sampled by the randomized search. Ignored when
        ``random`` is False.

    Returns
    -------
    None
        The best parameters and the best mean cross-validation score are
        printed rather than returned.
    """
    # Base estimator; the values in ``params`` are applied on top of it
    xgb = XGBClassifier(booster='gbtree', objective='binary:logistic', random_state=2)
    # Same splitter settings on every call so scores are comparable across searches
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)

    # Pick the search strategy
    if random:
        grid = RandomizedSearchCV(xgb, params, cv=kfold, n_iter=n_iter, n_jobs=-1, random_state=2)
    else:
        grid = GridSearchCV(xgb, params, cv=kfold, n_jobs=-1)

    # Fit on the full dataset; the cross-validation splitter provides the hold-out folds
    grid.fit(X, y)

    # Report the winning combination and its mean cross-validation score
    print("Best params:", grid.best_params_)
    print("Best score: {:.5f}".format(grid.best_score_))

<div class="alert alert-info">
<font color=black>

-  What follows is a **change-one-but-keep-others-constant** approach.

</font>
</div>

In [51]:
# Vary only the number of boosting rounds
grid_search(
    params={
        'n_estimators': [100, 200, 400, 800],
    },
)

Best params: {'n_estimators': 100}
Best score: 0.78235


In [28]:
# Vary only the learning rate
grid_search(
    params={
        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
    },
)

Best params: {'learning_rate': 0.05}
Best score: 0.79585


In [34]:
# Vary only the maximum tree depth
grid_search(
    params={
        'max_depth': [2, 3, 5, 6, 8],
    },
)

Best params: {'max_depth': 2}
Best score: 0.79902


In [35]:
# Vary only gamma
grid_search(
    params={
        'gamma': [0, 0.01, 0.1, 0.5, 1, 2],
    },
)

Best params: {'gamma': 0.5}
Best score: 0.79574


In [36]:
# Vary only min_child_weight
grid_search(
    params={
        'min_child_weight': [1, 2, 3, 4, 5],
    },
)

Best params: {'min_child_weight': 5}
Best score: 0.81219


In [37]:
# Vary only the row subsampling ratio
grid_search(
    params={
        'subsample': [0.5, 0.7, 0.8, 0.9, 1],
    },
)

Best params: {'subsample': 0.8}
Best score: 0.79579


In [38]:
# Vary only the per-tree column subsampling ratio
grid_search(
    params={
        'colsample_bytree': [0.5, 0.7, 0.8, 0.9, 1],
    },
)

Best params: {'colsample_bytree': 0.7}
Best score: 0.79902


# Applying early stopping

<div class="alert alert-info">
<font color=black>

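- Early stopping provides a limit to the number of rounds that iterative machine learning algorithms train on.
- Training stops once the evaluation metric (`eval_metric`), commonly 'error' for classification and 'rmse' for regression, has not improved on the evaluation set for a given number of consecutive rounds.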

</font>
</div>

In [53]:
# Hold out a test split using train_test_split's default proportions; the seed
# keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2,
)

In [55]:
# Fit while logging the validation error after every boosting round; no early
# stopping is applied in this cell.
# `eval_metric` is set on the estimator because passing it to `fit()` was
# deprecated in XGBoost 1.6 and removed in later releases.
# NOTE: the test split is also the evaluation set, so the accuracy printed
# below is measured on data that was watched during training.
eval_metric = 'error'
model = XGBClassifier(booster='gbtree', objective='binary:logistic',
                      random_state=2, eval_metric=eval_metric)
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, eval_set=eval_set)
# make predictions for test data
y_pred = model.predict(X_test)
# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

[0]	validation_0-error:0.15790
[1]	validation_0-error:0.10526
[2]	validation_0-error:0.11842
[3]	validation_0-error:0.13158
[4]	validation_0-error:0.11842
[5]	validation_0-error:0.14474
[6]	validation_0-error:0.14474
[7]	validation_0-error:0.14474
[8]	validation_0-error:0.14474
[9]	validation_0-error:0.14474
[10]	validation_0-error:0.14474
[11]	validation_0-error:0.15790
[12]	validation_0-error:0.15790
[13]	validation_0-error:0.17105
[14]	validation_0-error:0.17105
[15]	validation_0-error:0.17105
[16]	validation_0-error:0.15790
[17]	validation_0-error:0.17105
[18]	validation_0-error:0.15790
[19]	validation_0-error:0.17105
[20]	validation_0-error:0.17105
[21]	validation_0-error:0.17105
[22]	validation_0-error:0.18421
[23]	validation_0-error:0.18421
[24]	validation_0-error:0.17105
[25]	validation_0-error:0.18421
[26]	validation_0-error:0.18421
[27]	validation_0-error:0.18421
[28]	validation_0-error:0.18421
[29]	validation_0-error:0.18421
[30]	validation_0-error:0.18421
[31]	validation_0-

In [56]:
# Stop training once the validation error has not improved for 10 consecutive
# rounds.
# `eval_metric` and `early_stopping_rounds` are set on the estimator (this
# needs XGBoost >= 1.6); passing them to `fit()` was deprecated in 1.6 and
# removed in later releases.
# NOTE: the test split is also the early-stopping set, so the stopping point is
# chosen on the same data used for the accuracy below, which makes that figure
# optimistic.
eval_metric = "error"
model = XGBClassifier(
    booster='gbtree', objective='binary:logistic', random_state=2,
    eval_metric=eval_metric, early_stopping_rounds=10)
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, eval_set=eval_set, verbose=True)
# Predict on the held-out split and report accuracy as a percentage
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

[0]	validation_0-error:0.15790
Will train until validation_0-error hasn't improved in 10 rounds.
[1]	validation_0-error:0.10526
[2]	validation_0-error:0.11842
[3]	validation_0-error:0.13158
[4]	validation_0-error:0.11842
[5]	validation_0-error:0.14474
[6]	validation_0-error:0.14474
[7]	validation_0-error:0.14474
[8]	validation_0-error:0.14474
[9]	validation_0-error:0.14474
[10]	validation_0-error:0.14474
[11]	validation_0-error:0.15790
Stopping. Best iteration:
[1]	validation_0-error:0.10526

Accuracy: 89.47%


In [21]:
# Allow up to 5000 boosting rounds, but stop once the validation error has not
# improved for 100 consecutive rounds.
# `eval_metric` and `early_stopping_rounds` are set on the estimator (this
# needs XGBoost >= 1.6); passing them to `fit()` was deprecated in 1.6 and
# removed in later releases.
# NOTE: the test split is also the early-stopping set, so the accuracy below
# is optimistic.
eval_metric = "error"
model = XGBClassifier(random_state=2, n_estimators=5000,
                      eval_metric=eval_metric, early_stopping_rounds=100)
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, eval_set=eval_set)
# Predict on the held-out split and report accuracy as a percentage
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

[0]	validation_0-error:0.15790
Will train until validation_0-error hasn't improved in 100 rounds.
[1]	validation_0-error:0.10526
[2]	validation_0-error:0.11842
[3]	validation_0-error:0.13158
[4]	validation_0-error:0.11842
[5]	validation_0-error:0.14474
[6]	validation_0-error:0.14474
[7]	validation_0-error:0.14474
[8]	validation_0-error:0.14474
[9]	validation_0-error:0.14474
[10]	validation_0-error:0.14474
[11]	validation_0-error:0.15790
[12]	validation_0-error:0.15790
[13]	validation_0-error:0.17105
[14]	validation_0-error:0.17105
[15]	validation_0-error:0.17105
[16]	validation_0-error:0.15790
[17]	validation_0-error:0.17105
[18]	validation_0-error:0.15790
[19]	validation_0-error:0.17105
[20]	validation_0-error:0.17105
[21]	validation_0-error:0.17105
[22]	validation_0-error:0.18421
[23]	validation_0-error:0.18421
[24]	validation_0-error:0.17105
[25]	validation_0-error:0.18421
[26]	validation_0-error:0.18421
[27]	validation_0-error:0.18421
[28]	validation_0-error:0.18421
[29]	validation

In [22]:
# Revisit the number of boosting rounds over a smaller range
grid_search(
    params={
        'n_estimators': [2, 25, 50, 75, 100],
    },
)

Best params: {'n_estimators': 50}
Best score: 0.78907


In [23]:
# Vary the tree depth with the number of rounds pinned at 50
grid_search(
    params={
        'max_depth': [1, 2, 3, 4, 6, 7, 8],
        'n_estimators': [50],
    },
)

Best params: {'max_depth': 1, 'n_estimators': 50}
Best score: 0.83869


In [24]:
# Vary the tree depth and the number of rounds together
grid_search(
    params={
        'max_depth': [1, 2, 3, 4, 6, 7, 8],
        'n_estimators': [2, 50, 100],
    },
)

Best params: {'max_depth': 1, 'n_estimators': 50}
Best score: 0.83869


In [35]:
# Vary the learning rate with depth and number of rounds pinned
grid_search(
    params={
        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
        'max_depth': [1],
        'n_estimators': [50],
    },
)

Best params: {'learning_rate': 0.3, 'max_depth': 1, 'n_estimators': 50}
Best score: 0.83869


In [26]:
# Vary min_child_weight with depth and number of rounds pinned
grid_search(
    params={
        'min_child_weight': [1, 2, 3, 4, 5],
        'max_depth': [1],
        'n_estimators': [50],
    },
)

Best params: {'max_depth': 1, 'min_child_weight': 1, 'n_estimators': 50}
Best score: 0.83869


In [27]:
# Vary the row subsampling ratio with depth and number of rounds pinned
grid_search(
    params={
        'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
        'max_depth': [1],
        'n_estimators': [50],
    },
)

Best params: {'max_depth': 1, 'n_estimators': 50, 'subsample': 1}
Best score: 0.83869


In [28]:
# Exhaustive grid over four hyperparameters with only 2 boosting rounds
# (6 x 5 x 5 x 5 = 750 candidates)
grid_search(
    params={
        'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
        'min_child_weight': [1, 2, 3, 4, 5],
        'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5],
        'max_depth': [1, 2, 3, 4, 5],
        'n_estimators': [2],
    },
)

Best params: {'learning_rate': 0.5, 'max_depth': 2, 'min_child_weight': 4, 'n_estimators': 2, 'subsample': 0.9}
Best score: 0.81224


<div class="alert alert-info">
<font color=black>

- Up until now we have searched on a change-one-and-keep-the-others-constant basis via grid search.
- By doing this we may have missed something. Now we'll switch to random search while allowing more than one hyperparameter to change.
- This will improve our chances of finding the best option.

</font>
</div>

In [29]:
# Randomized search: samples a subset of the 6 x 5 x 5 x 6 x 5 = 4500
# combinations instead of trying them all
grid_search(
    params={
        'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
        'min_child_weight': [1, 2, 3, 4, 5],
        'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5],
        'max_depth': [1, 2, 3, 4, 5, None],
        'n_estimators': [2, 25, 50, 75, 100],
    },
    random=True,
)

Best params: {'subsample': 0.6, 'n_estimators': 25, 'min_child_weight': 4, 'max_depth': 4, 'learning_rate': 0.5}
Best score: 0.82208


In [30]:
# Vary the per-tree column subsampling with depth and number of rounds pinned
grid_search(
    params={
        'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
        'max_depth': [1],
        'n_estimators': [50],
    },
)

Best params: {'colsample_bytree': 1, 'max_depth': 1, 'n_estimators': 50}
Best score: 0.83869


In [31]:
# Vary the per-level column subsampling with depth and number of rounds pinned
grid_search(
    params={
        'colsample_bylevel': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
        'max_depth': [1],
        'n_estimators': [50],
    },
)

Best params: {'colsample_bylevel': 1, 'max_depth': 1, 'n_estimators': 50}
Best score: 0.83869


In [32]:
# Exhaustive grid over the three column-subsampling ratios
# (6 x 6 x 6 = 216 candidates) with depth and number of rounds pinned
grid_search(
    params={
        'colsample_bynode': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
        'colsample_bylevel': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
        'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
        'max_depth': [1],
        'n_estimators': [50],
    },
)

Best params: {'colsample_bylevel': 0.9, 'colsample_bynode': 0.5, 'colsample_bytree': 0.8, 'max_depth': 1, 'n_estimators': 50}
Best score: 0.84852


In [57]:
# Vary gamma with the column-subsampling ratios, depth and number of rounds
# pinned to single values
grid_search(
    params={
        'gamma': [0, 0.01, 0.05, 0.1, 0.5, 1, 2, 3],
        'colsample_bylevel': [0.9],
        'colsample_bytree': [0.8],
        'colsample_bynode': [0.5],
        'max_depth': [1],
        'n_estimators': [50],
    },
)

Best params: {'colsample_bylevel': 0.9, 'colsample_bynode': 0.5, 'colsample_bytree': 0.8, 'gamma': 0, 'max_depth': 1, 'n_estimators': 50}
Best score: 0.84852


# References

 <div class="alert alert-warning">
<font color=black>

- Reference: Corey Wade, Hands-On Gradient Boosting with XGBoost and scikit-learn
- https://github.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn
    
</font>
</div>