In [11]:
import pandas as pd
import numpy as np
import xgboost as xgb

from matplotlib import pyplot

from xgboost import XGBClassifier
from xgboost import plot_tree
from bayes_opt import BayesianOptimization
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error

In [12]:
data_frame=pd.read_csv('D:/xgboost_cancer_classifier/data.csv',sep=',',index_col=False,header=0)
data_frame=data_frame.replace({'diagnosis':{'M':1,'B':0}})
data_frame["diagnosis"]=data_frame['diagnosis'].astype('int32')

In [13]:
features_DF=pd.DataFrame(data_frame,columns=['radius_mean',
                                             'texture_mean','perimeter_mean',
                                             'area_mean','smoothness_mean',
                                             'compactness_mean','concavity_mean',
                                             'concave points_mean','symmetry_mean',
                                             'fractal_dimension_mean','radius_se',
                                             'texture_se','perimeter_se','area_se',
                                             'smoothness_se','compactness_se',
                                             'concavity_se','concave points_se',
                                             'symmetry_se','fractal_dimension_se',
                                             'radius_worst','texture_worst',
                                             'perimeter_worst','area_worst',
                                             'smoothness_worst','compactness_worst',
                                             'concavity_worst','concave points_worst',
                                             'symmetry_worst','fractal_dimension_worst'])

label_DF=pd.DataFrame(data_frame,columns=['diagnosis'])

features_DF=features_DF.to_numpy()
label_DF=label_DF.to_numpy()

In [14]:
(X_train,X_test,Y_train,Y_test) = train_test_split(features_DF,label_DF, test_size=.10,random_state=7)

Training the model with default hyper parameters:

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [15]:
#Initialize an XGBClassifier with the tuned parameters and fit the training data
#predicting for training set

model=XGBClassifier()
model.fit(X_train,Y_train)
predictions=model.predict(X_test)
accuracy=accuracy_score(Y_test,predictions)
print(model)
print("Accuracy: %.2f%%" % (accuracy*100))



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
Accuracy: 98.25%


  return f(*args, **kwargs)


now with bayesian optimization methods to find best model hyperparameters

In [16]:
d_matrix = xgb.DMatrix(features_DF, label_DF)


According to documentation, there is a tradeoff between number of learners-
and learning rate. Large numbers of trees + small learning rate does not-
provide optimal results. Learning rate between 0.1-0.125 and between 100 - 500-
number of trees.

When finding optimal hyperparameters using Bayesian optimization, is important-
to know the boundaries of the hyperparameters to enclose our function to optimize.


Bibliography on hyperparameter tuning:

https://www.youtube.com/watch?v=wPqtzj5VZus

www.saedsayad.com/docs/gbm2.pdf



In [17]:
def xgboost_optimized(max_depth,gamma,learning_rate,n_estimators,subsample,colsample_bytree):

    params={'max_depth':int(max_depth),'gamma':gamma,
            'n_estimators':int(n_estimators),
            'learning_rate':learning_rate,
            'subsample':subsample,'colsample_bytree':colsample_bytree,
            'eval_metric':'rmse'}

    cv_result=xgb.cv(params,d_matrix,num_boost_round=700,nfold=5)

    return -1.0 * cv_result['test-rmse-mean'].iloc[-1]

In [18]:
xgb_bo = BayesianOptimization(xgboost_optimized, {'max_depth': (3,8),
                                             'gamma': (0,1),
                                             'learning_rate':(0.095,0.125),
                                             'n_estimators':(100,500),
                                             'subsample':(0.4,0.6),
                                             'colsample_bytree':(0.4,0.6),
                                            })

In [19]:
xgb.set_config(verbosity=0)
optimal_network=xgb_bo.maximize(n_iter=6, init_points=8, acq='ei')


|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.1763  [0m | [0m 0.5106  [0m | [0m 0.2006  [0m | [0m 0.0967  [0m | [0m 4.073   [0m | [0m 362.1   [0m | [0m 0.4019  [0m |
| [0m 2       [0m | [0m-0.1915  [0m | [0m 0.403   [0m | [0m 0.8207  [0m | [0m 0.1241  [0m | [0m 5.204   [0m | [0m 203.9   [0m | [0m 0.5519  [0m |
| [0m 3       [0m | [0m-0.1839  [0m | [0m 0.4349  [0m | [0m 0.4258  [0m | [0m 0.1071  [0m | [0m 3.489   [0m | [0m 428.5   [0m | [0m 0.5342  [0m |
| [95m 4       [0m | [95m-0.1713  [0m | [95m 0.4623  [0m | [95m 0.1426  [0m | [95m 0.1196  [0m | [95m 7.567   [0m | [95m 410.6   [0m | [95m 0.548   [0m |
| [0m 5       [0m | [0m-0.1905  [0m | [0m 0.5561  [0m | [0m 0.7953  [0m | [0m 0.1162  [0m | [0m 7.782   [0m | [0m 442.5   [0m | [0m 0

In [20]:
#Extracting the best parameters
params = xgb_bo.max['params']
print(params)

#Converting the max_depth and n_estimator values from float to int
params['max_depth']= int(params['max_depth'])
params['n_estimators']= int(params['n_estimators'])

#Initialize an XGBClassifier with the tuned parameters and fit the training data

model_2=XGBClassifier(**params)
model_2.fit(X_train,Y_train)

#predicting for training set
predictions=model.predict(X_test)
accuracy_2=accuracy_score(Y_test,predictions)
print(model_2)
print("Accuracy: %.2f%%" % (accuracy_2*100))


{'colsample_bytree': 0.5100677145591068, 'gamma': 0.040679526310111735, 'learning_rate': 0.119577660717184, 'max_depth': 5.62551361134971, 'n_estimators': 146.5094981490878, 'subsample': 0.5678950830021507}
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5100677145591068,
              gamma=0.040679526310111735, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.119577660717184,
              max_delta_step=0, max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=146, n_jobs=12,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=0.5678950830021507,
              tree_method='exact', validate_parameters=1, verbosity=None)
Accuracy: 98.25%


  return f(*args, **kwargs)


Dataset by:
 https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29