<a href="https://colab.research.google.com/github/KenzaxTazi/Agri-Risk/blob/master/gtc_bayesian_opt_xgboost_hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extreme Gradient Boosting with Bayesian Optimisation

This notebook does not include the data import and preparation steps; for it to run, the following variables must already exist:

*   train_set_x_prepared
*   train_set_y
*   test_set_x_prepared
*   test_set_y

In [0]:
#install required packages
!pip3 install xgboost
!pip install bayesian-optimization



In [0]:
#import packages
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform
import xgboost as xgb
from sklearn.metrics import r2_score
from bayes_opt import BayesianOptimization
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [0]:
#wrap the prepared training features and their labels in XGBoost's DMatrix container,
#which is the input type expected by the native xgb.cv call used during tuning
dtrain = xgb.DMatrix(data=train_set_x_prepared, label=train_set_y)

#define bayesian optimization function for xgboost
#specify the parameters you want to tune as keyword arguments
def bo_tune_xgb(max_depth,
                min_child_weight,
                gamma,
                subsample,
                n_estimators,
                colsample_bytree,
                ):
  """Objective for the Bayesian optimiser: negative 3-fold CV RMSE on `dtrain`.

  Every argument is supplied by the optimiser as a keyword argument.
  `max_depth`, `min_child_weight` and `n_estimators` are truncated to int
  before use; `gamma`, `subsample` and `colsample_bytree` are passed through
  unchanged.

  Returns:
    The mean test RMSE of the last boosting round, multiplied by -1 so that
    maximising the returned value minimises the RMSE.
  """

  params = {'max_depth': int(max_depth),
            'min_child_weight': int(min_child_weight),
            'gamma': gamma,
            'colsample_bytree': colsample_bytree,
            #'learning_rate': learning_rate,
            'subsample': subsample, # 0.8
            'eta': 0.5,
            'eval_metric': 'rmse',
            'tree_method': 'hist',
            }

  #`n_estimators` is an argument of the scikit-learn wrapper only; the native
  #xgb.cv API takes the number of trees through `num_boost_round`. Passing it
  #here (instead of a fixed 3 rounds) makes the tuned value actually influence
  #the score that the optimiser sees.
  num_rounds = int(n_estimators)

  #cross validating with the specified parameters
  cv_result = xgb.cv(params, dtrain, num_boost_round=num_rounds, nfold=3, shuffle=False) #, verbose_eval=3)

  #return the negative RMSE
  return  -1.0 * cv_result['test-rmse-mean'].iloc[-1]

#Invoking the Bayesian Optimiser with the specified parameters to tune
#each key is a keyword argument of bo_tune_xgb and each value its (lower, upper) search bounds
#random_state seeds the optimiser so the sequence of proposed points is repeatable between runs
xgb_bo = BayesianOptimization(bo_tune_xgb,
                              {'max_depth': (10, 30), # 20 to 30
                               'min_child_weight': (10, 30),
                               'gamma': (0.5, 0.8), #0.5 to 0.8
                               'colsample_bytree': (0.7, 0.7), #lower == upper, so held fixed at 0.7 (noted range: 0.7 to 1)
                               #'learning_rate': (0, 1),
                               'subsample': (0.8, 0.8), #lower == upper, so held fixed at 0.8 (noted range: 0.7 to 0.9)
                               'n_estimators': (100, 140) #100 to 140
                              },
                              random_state=42)

#Performing Bayesian optimization for N iterations ('n_iter') with M steps of random exploration ('init_points')
#with an acquisition function of expected improvement
#NOTE(review): newer bayesian-optimization releases appear to configure the acquisition
#function on the constructor rather than via `maximize(acq=...)` - confirm the installed version accepts `acq`
xgb_bo.maximize(n_iter=5, init_points=5, acq='ei')

In [0]:
#extracting the best parameters
#copy the dict so the casts below do not modify the result stored on the optimiser
params = dict(xgb_bo.max['params'])

#Converting the integer-valued parameters (e.g. max_depth and n_estimator) from float to int
#(the same truncation that bo_tune_xgb applied when scoring each candidate)
params['max_depth']= int(params['max_depth'])
params['n_estimators']= int(params['n_estimators'])
params['min_child_weight']= int(params['min_child_weight'])
print(params)

#Initialize an XGBRegressor with the tuned parameters and fit the training data
#learning_rate=0.5 matches the fixed 'eta': 0.5 that bo_tune_xgb used for every candidate;
#without it the final model would silently train with the library default instead
from xgboost import XGBRegressor
xgb_regressor = xgb.XGBRegressor(**params,
                                 learning_rate=0.5,
                                 tree_method='hist',
                                 objective="reg:squarederror").fit(train_set_x_prepared,train_set_y)

#predict for training set
train_p = xgb_regressor.predict(train_set_x_prepared)

#predict for test set
test_p = xgb_regressor.predict(test_set_x_prepared)

#r2_score takes (y_true, y_pred); the score is not symmetric, so the ground truth must come first
print("r2 score for train set: ", r2_score(train_set_y, train_p))
print("r2 score for test set: ", r2_score(test_set_y, test_p))