# Example of using XGBoost
* Without CV
* With CV
* With param tuning
* Accuracy is not the focus in this example 

In [2]:
import xgboost as xgb

import pandas as pd
import numpy as np

from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

import optuna

In [3]:
df = pd.read_csv('../data/stackoverflow.csv', nrows=1000)
df.drop(['ID', 'Tag', 'Username'], axis=1, inplace=True)
df.head()

Unnamed: 0,Reputation,Answers,Views,Upvotes
0,3942.0,2.0,7855.0,42.0
1,26046.0,12.0,55801.0,1175.0
2,1358.0,4.0,8067.0,60.0
3,264.0,3.0,27064.0,9.0
4,4271.0,4.0,13986.0,83.0


In [4]:
df.dtypes

Reputation    float64
Answers       float64
Views         float64
Upvotes       float64
dtype: object

In [5]:
y= df['Upvotes']
X = df.drop('Upvotes', axis=1)

print(X.shape, y.shape)
X.head()

(1000, 3) (1000,)


Unnamed: 0,Reputation,Answers,Views
0,3942.0,2.0,7855.0
1,26046.0,12.0,55801.0
2,1358.0,4.0,8067.0
3,264.0,3.0,27064.0
4,4271.0,4.0,13986.0


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(700, 3) (300, 3) (700,) (300,)


### Xgboost without CV

* list of xgboost objective functions: https://xgboost.readthedocs.io/en/latest/parameter.html#parameters-for-tweedie-regression-objective-reg-tweedie
  * Not all the "reg" objective function can be used in XGBRegressor...

In [48]:
model = XGBRegressor(max_depth=7, min_child_weight=1,
                  silent=1, colsample_bytree=0.5,
                  subsample=0.5,
                  objective='reg:squarederror',
                  eval_metric='rmse', learning_rate=1,
                  n_jobs=2)

In [49]:
model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, eval_metric='rmse',
             gamma=0, importance_type='gain', learning_rate=1, max_delta_step=0,
             max_depth=7, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=2, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=1, subsample=0.5, verbosity=1)

In [51]:
y_pred = model.predict(X_test)

y_pred[:10]

array([ -15.621664,  109.5023  ,  -33.97372 ,  314.199   ,  -78.72447 ,
        234.34503 , -117.758415,  -18.977987,   35.213024,  798.4864  ],
      dtype=float32)

In [52]:
r2 = r2_score(y_true=y_test.values, y_pred=y_pred)
r2

-0.5791420477951836

### Xgboost with CV

* StratifiedKFold may not work well for continuous labels when the dataset is small, because some fold might not get enough records
* The CV used here doesn't help find optimal n_estimators like `xgb.cv` does, but returns the evaluation result in each fold. To find optimal param set, need to use param tuning seperately

In [57]:
model = XGBRegressor(max_depth=7, min_child_weight=1,
                  silent=1, colsample_bytree=0.5,
                  subsample=0.5,
                  objective='reg:squarederror',
                  eval_metric='rmse', learning_rate=1)

In [67]:
skfold = KFold(n_splits=10, random_state=10, shuffle=True)
results = cross_val_score(model, X_train, y_train, cv=skfold)

In [68]:
print(results)
print(np.mean(results))

[-1.62993391e-02 -4.28167717e-01  2.19265675e-01 -2.57747309e+01
 -9.29490899e+01 -1.24038048e-01 -3.46751945e+00 -1.28927888e+00
  4.29354009e-02 -1.07404742e+01]
-13.452739742648896


### Param tuning with Optuna

In [9]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 2, 20)
    max_depth = int(trial.suggest_float('max_depth', 1, 10, log=True))
    
    model = XGBRegressor(max_depth=max_depth, min_child_weight=1,
                  silent=1, colsample_bytree=0.5,
                  subsample=0.5, n_estimators=n_estimators,
                  objective='reg:squarederror',
                  eval_metric='rmse', learning_rate=1)
    skfold = KFold(n_splits=10, random_state=10, shuffle=True)
    
    return cross_val_score(model, X_train, y_train, cv=skfold, n_jobs=-1).mean()

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

trial = study.best_trial

print('RMSE: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[32m[I 2020-12-07 22:22:10,197][0m A new study created in memory with name: no-name-ed81160e-34c9-4397-8f90-ad7d6f892242[0m
[32m[I 2020-12-07 22:22:10,289][0m Trial 0 finished with value: -4.8480136083434076 and parameters: {'n_estimators': 10, 'max_depth': 1.4540943542396396}. Best is trial 0 with value: -4.8480136083434076.[0m
[32m[I 2020-12-07 22:22:10,355][0m Trial 1 finished with value: -12.959477556151347 and parameters: {'n_estimators': 4, 'max_depth': 6.432901531352945}. Best is trial 1 with value: -12.959477556151347.[0m
[32m[I 2020-12-07 22:22:10,404][0m Trial 2 finished with value: -14.326872379952599 and parameters: {'n_estimators': 9, 'max_depth': 7.389065228912997}. Best is trial 2 with value: -14.326872379952599.[0m
[32m[I 2020-12-07 22:22:10,446][0m Trial 3 finished with value: -12.768200714525436 and parameters: {'n_estimators': 4, 'max_depth': 9.573743715135896}. Best is trial 2 with value: -14.326872379952599.[0m
[32m[I 2020-12-07 22:22:10,488][0m Tr

RMSE: -14.37613911955653
Best hyperparameters: {'n_estimators': 9, 'max_depth': 6.103028626842698}
