# XGBoost - Regression (Bike Sharing)
[ch1-gradient-boosting.ipynb](https://github.com/kyopark2014/ML-Algorithms/blob/main/xgboost/src/ch1-gradient-boosting.ipynb)

In [1]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install xgboost

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
import time

In [4]:
import xgboost as xgb
xgb.set_config(verbosity=0)

In [5]:
df_bikes = pd.read_csv('bike_rentals_cleaned.csv')

In [6]:
df_bikes.head()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,1.0,0.0,1,0.0,6.0,0.0,2,0.344167,0.363625,0.805833,0.160446,985
1,2,1.0,0.0,1,0.0,0.0,0.0,2,0.363478,0.353739,0.696087,0.248539,801
2,3,1.0,0.0,1,0.0,1.0,1.0,1,0.196364,0.189405,0.437273,0.248309,1349
3,4,1.0,0.0,1,0.0,2.0,1.0,1,0.2,0.212122,0.590435,0.160296,1562
4,5,1.0,0.0,1,0.0,3.0,1.0,1,0.226957,0.22927,0.436957,0.1869,1600


In [7]:
df_bikes.describe()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
count,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0
mean,366.0,2.49658,0.500684,6.519836,0.028728,2.997264,0.682627,1.395349,0.495423,0.474391,0.627908,0.190411,4504.348837
std,211.165812,1.110807,0.500342,3.451913,0.167155,2.004787,0.465773,0.544894,0.183023,0.162938,0.142074,0.077462,1937.211452
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.05913,0.07907,0.0,0.022392,22.0
25%,183.5,2.0,0.0,4.0,0.0,1.0,0.0,1.0,0.337083,0.337842,0.522291,0.13495,3152.0
50%,366.0,3.0,1.0,7.0,0.0,3.0,1.0,1.0,0.498333,0.486733,0.6275,0.180971,4548.0
75%,548.5,3.0,1.0,10.0,0.0,5.0,1.0,2.0,0.655417,0.608602,0.729791,0.233206,5956.0
max,731.0,4.0,1.0,12.0,1.0,6.0,1.0,3.0,0.861667,0.840896,0.9725,0.507463,8714.0


In [8]:
df_bikes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   season      731 non-null    float64
 2   yr          731 non-null    float64
 3   mnth        731 non-null    int64  
 4   holiday     731 non-null    float64
 5   weekday     731 non-null    float64
 6   workingday  731 non-null    float64
 7   weathersit  731 non-null    int64  
 8   temp        731 non-null    float64
 9   atemp       731 non-null    float64
 10  hum         731 non-null    float64
 11  windspeed   731 non-null    float64
 12  cnt         731 non-null    int64  
dtypes: float64(9), int64(4)
memory usage: 74.4 KB


### 누락된 값이 있는지 확인 (check for missing values)

In [9]:
df_bikes.isna().sum().sum()

0

## Define Feature and Target

In [10]:
# Features are every column except the last; the target is the last column (cnt).
X, y = df_bikes.iloc[:, :-1], df_bikes.iloc[:, -1]

In [11]:
pd.DataFrame(y).head()

Unnamed: 0,cnt
0,985
1,801
2,1349
3,1562
4,1600


## Regression Model Selection

In [12]:
from xgboost import XGBRegressor
from sklearn.model_selection import StratifiedKFold, cross_val_score

# kfold = KFold(n_splits=5, shuffle=True, random_state=2)
# NOTE(review): `kfold` is only mentioned in commented-out lines of this notebook;
# cross_validation() below runs with cv=10 and never reads it.
# StratifiedKFold stratifies on class labels, so it is presumably not a good fit
# for the numeric `cnt` target — confirm before wiring it in.
kfold = StratifiedKFold(n_splits=2)

def cross_validation(model, cv=10):
    """Cross-validate ``model`` on the notebook-level ``X``/``y`` and print per-fold RMSE.

    Parameters
    ----------
    model : estimator
        Regressor handed to ``cross_val_score``.
    cv : int or cross-validation splitter, default 10
        Forwarded unchanged as the ``cv`` argument of ``cross_val_score``.
        The default reproduces the previously hard-coded 10 folds.

    Prints the wall-clock time, the RMSE of every fold (rounded to 3 decimals)
    and the mean RMSE. Returns ``None``, so calling it as the last line of a
    cell does not produce an ``Out[...]`` value.
    """
    start = time.time()

    # The scorer yields negated MSE values, so flip the sign before the square root.
    scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv)
    rmse = np.sqrt(-scores)

    print('Cross Validation:')
    print('Elased time: %0.2fs' % (time.time()-start))
    print('RMSE:', np.round(rmse, 3))
    print('Avg. RMSE: %0.3f' % (rmse.mean()))

In [13]:
cross_validation(XGBRegressor(booster='gbtree'))

Cross Validation:
Elased time: 3.28s
RMSE: [ 717.646  692.804  520.7    737.676  835.961 1006.237  991.342  747.612
  891.994 1731.128]
Avg. RMSE: 887.310


In [14]:
cross_validation(XGBRegressor(booster='gblinear'))

Cross Validation:
Elased time: 2.48s
RMSE: [5.82131384e+29 1.59106597e+23 2.14596023e+30 1.44551212e+30
 3.04276282e+30 1.04623890e+30 3.96962689e+29 1.87064239e+29
 9.19169832e+28 7.08440508e+29]
Avg. RMSE: 964699002954221618415123038208.000


In [15]:
cross_validation(XGBRegressor(booster='dart', one_drop=1))

Cross Validation:
Elased time: 6.15s
RMSE: [ 574.362  589.35   523.466  718.081  833.296 1075.336  963.095  754.554
  843.255 1762.242]
Avg. RMSE: 863.704


In [16]:
cross_validation(XGBRegressor(booster='dart', rate_drop=0.5))

Cross Validation:
Elased time: 4.41s
RMSE: [ 691.328  879.621  602.354  827.953  852.068 1251.988 1009.174  828.154
 1305.736 1620.033]
Avg. RMSE: 986.841


In [17]:
cross_validation(XGBRegressor(booster='dart', rate_drop=0.5, sample_type='weighted'))

Cross Validation:
Elased time: 4.49s
RMSE: [ 712.683  911.26   596.958  880.983  909.862 1305.965  986.295  821.951
 1317.766 1780.286]
Avg. RMSE: 1022.401


In [18]:
cross_validation(XGBRegressor(booster='dart', rate_drop=0.5, normalize_type='forest'))

Cross Validation:
Elased time: 4.90s
RMSE: [ 690.425  820.184  560.393  863.211  882.996 1244.912 1086.051  871.756
 1178.115 1799.3  ]
Avg. RMSE: 999.734


In [19]:
from sklearn.linear_model import LinearRegression, LogisticRegression

cross_validation(LinearRegression())

Cross Validation:
Elased time: 0.04s
RMSE: [ 504.007  840.55  1140.882  728.392  640.197  969.949 1133.448 1252.847
 1084.636 1425.326]
Avg. RMSE: 972.023


In [20]:
from sklearn.linear_model import Lasso

cross_validation(Lasso())

Cross Validation:
Elased time: 0.09s
RMSE: [ 491.917  830.768 1133.671  731.759  632.751  963.02  1139.191 1244.107
 1065.067 1463.133]
Avg. RMSE: 969.538


In [21]:
from sklearn.linear_model import Ridge

cross_validation(Ridge())

Cross Validation:
Elased time: 0.08s
RMSE: [ 487.127  824.751 1098.441  735.696  632.238  969.021 1146.207 1226.545
 1036.396 1491.762]
Avg. RMSE: 964.818


In [22]:
from sklearn.ensemble import RandomForestRegressor

cross_validation(RandomForestRegressor())

Cross Validation:
Elased time: 2.97s
RMSE: [ 793.212  519.266  532.952  801.346  852.819  809.644  872.602  793.877
  767.506 2282.163]
Avg. RMSE: 902.539


## HPO

### Default

In [23]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Hold out a test split (default split proportions) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

# Baseline gbtree regressor; hyper-parameters not listed here keep their defaults.
model = XGBRegressor(
    booster='gbtree',
    random_state=2,
    verbosity=0,
    use_label_encoder=False,
    n_jobs=-1,
)

# default model accuracy (reported as RMSE on the test split)
def getAccuracy(model):
    """Fit ``model`` on the training split and print its RMSE on the test split.

    Uses the notebook-level ``X_train``/``y_train`` for fitting and
    ``X_test``/``y_test`` for scoring. Prints the elapsed wall-clock time
    (fit + predict + scoring) followed by the RMSE; returns ``None``.
    """
    t0 = time.time()

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # RMSE is the square root of the mean squared error.
    test_rmse = np.sqrt(mean_squared_error(y_test, predictions))

    elapsed = time.time() - t0
    print('Elapsed time: %0.2fs' % elapsed)
    print('RMSE: %0.3f' % test_rmse)

getAccuracy(model)

Elapsed time: 0.28s
RMSE: 705.114


In [None]:
# Sweep one hyper-parameter over a fixed list of candidates and keep the
# candidate with the lowest RMSE on the test split.
param = 'n_estimators'
values = [50, 100, 200, 400, 800]
best_param = 0      # index into `values` of the best candidate seen so far
best_score = 1e9    # lowest RMSE seen so far
total_start = time.time()   # separate from the per-candidate timer below
# Show the candidates being swept (there is no notebook-level `params` dict to look them up in).
print(param, '=', values)

for idx, value in enumerate(values):
    start = time.time()

    # Bound to `reg` so the `xgb` module alias from `import xgboost as xgb` stays intact.
    reg = XGBRegressor(booster='gbtree', random_state=2, verbosity=3,
                       use_label_encoder=False, n_jobs=-1, **{param: value})

    reg.fit(X_train, y_train)

    y_pred = reg.predict(X_test)

    reg_mse = mean_squared_error(y_test, y_pred)
    reg_rmse = np.sqrt(reg_mse)

    if best_score > reg_rmse:
        best_score = reg_rmse
        best_param = idx

    print('Elapsed time: %0.2fs' % (time.time()-start))
    print('RMSE: %0.3f' % (reg_rmse))

# Total time for the whole sweep, not just the last candidate.
print('\nElased time: %0.2fs' % (time.time()-total_start))
print('best score: %0.2f' % (best_score))
print('best param: ', values[best_param])

## HPO: Randomized Search

In [24]:
from sklearn.model_selection import RandomizedSearchCV

def randomized_search(params, runs=20):
    """Run a randomized hyper-parameter search for a gbtree ``XGBRegressor``.

    Parameters
    ----------
    params : dict
        Candidate values per hyper-parameter, handed to ``RandomizedSearchCV``.
    runs : int, default 20
        Number of sampled settings (passed as ``n_iter``).

    Returns
    -------
    The ``best_estimator_`` of the fitted search.

    The search is fitted on the notebook-level ``X_train``/``y_train`` with
    ``cv=10`` and ``scoring='neg_mean_squared_error'``. The best parameters and
    the best score are printed, the latter both as returned by the search and
    converted to RMSE.
    """
    # Local name differs from the `xgb` module alias (`import xgboost as xgb`).
    reg = XGBRegressor(booster='gbtree', random_state=2, verbosity=3, use_label_encoder=False, n_jobs=-1)

    rand_reg = RandomizedSearchCV(reg, params, cv=10, n_iter=runs, n_jobs=-1, random_state=2, scoring='neg_mean_squared_error')

    rand_reg.fit(X_train, y_train)

    best_model = rand_reg.best_estimator_

    best_params = rand_reg.best_params_
    print("best parameter:", best_params)

    # With the neg_mean_squared_error scorer this value is a negated MSE.
    best_score = rand_reg.best_score_
    print("best score: {:.3f}".format(best_score))
    # Same result on the RMSE scale that the other cells of this notebook report.
    print("best RMSE: {:.3f}".format(np.sqrt(-best_score)))

    return best_model

In [None]:
#        'n_estimators':[50, 100, 200],
#        'learning_rate':[0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
#        'max_depth':[1, 2, 3, 5, 6, 8],
#        'gamma':[0, 0.01, 0.1, 0.5, 1, 2],
#        'min_child_weight':[1, 2, 3, 4, 5],
#        'subsample':[0.5, 0.7, 0.8, 0.9, 1],
#        'colsample_bytree':[0.5, 0.7, 0.8, 0.9, 1],  

# Time the complete randomized search.
start = time.time()

best_model = randomized_search(
    runs=20,
    params={
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.3, 0.5],
        'max_depth': [1, 2, 3, 5, 7, 9],
        'subsample': [0.5, 0.7, 0.8, 0.9, 1],
    },
)

print('\nElapsed time: %0.2fs' % (time.time() - start))

## Evaluation

In [None]:
model = best_model

def model_estimation(model):
    """Print the RMSE of ``model`` on the notebook-level test split.

    The model is only asked to predict ``X_test``; it is not fitted here.
    Prints the elapsed wall-clock time (predict + scoring) followed by the
    RMSE against ``y_test``; returns ``None``.
    """
    t0 = time.time()

    predictions = model.predict(X_test)

    # RMSE is the square root of the mean squared error.
    test_rmse = np.sqrt(mean_squared_error(y_test, predictions))

    elapsed = time.time() - t0
    print('Elapsed time: %0.2fs' % elapsed)
    print('RMSE: %0.3f' % test_rmse)
    
cross_validation(model)   
model_estimation(model)    

### HPO

In [None]:
from xgboost import XGBRegressor

# Hand-picked configuration, fitted on the training split.
xg_reg = XGBRegressor(
    max_depth=3,
    n_estimators=1600,
    eta=0.02,
    subsample=0.75,
    random_state=2,
)
xg_reg.fit(X_train, y_train)

# RMSE on the test split (square root of the mean squared error).
y_pred = xg_reg.predict(X_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print("RMSE: %0.2f" % rmse)