# XGBoost - Regression (Bike Sharing)
[ch1-gradient-boosting.ipynb](https://github.com/kyopark2014/ML-Algorithms/blob/main/xgboost/src/ch1-gradient-boosting.ipynb)

In [1]:
import pandas as pd

In [2]:
# Load the cleaned bike-rental dataset; the path is relative, so the CSV must
# sit in the notebook's working directory.
df_bikes = pd.read_csv('bike_rentals_cleaned.csv')

In [3]:
# Preview the first rows to sanity-check the loaded columns and values.
df_bikes.head()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,1.0,0.0,1,0.0,6.0,0.0,2,0.344167,0.363625,0.805833,0.160446,985
1,2,1.0,0.0,1,0.0,0.0,0.0,2,0.363478,0.353739,0.696087,0.248539,801
2,3,1.0,0.0,1,0.0,1.0,1.0,1,0.196364,0.189405,0.437273,0.248309,1349
3,4,1.0,0.0,1,0.0,2.0,1.0,1,0.2,0.212122,0.590435,0.160296,1562
4,5,1.0,0.0,1,0.0,3.0,1.0,1,0.226957,0.22927,0.436957,0.1869,1600


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
df_bikes.describe()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
count,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0
mean,366.0,2.49658,0.500684,6.519836,0.028728,2.997264,0.682627,1.395349,0.495423,0.474391,0.627908,0.190411,4504.348837
std,211.165812,1.110807,0.500342,3.451913,0.167155,2.004787,0.465773,0.544894,0.183023,0.162938,0.142074,0.077462,1937.211452
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.05913,0.07907,0.0,0.022392,22.0
25%,183.5,2.0,0.0,4.0,0.0,1.0,0.0,1.0,0.337083,0.337842,0.522291,0.13495,3152.0
50%,366.0,3.0,1.0,7.0,0.0,3.0,1.0,1.0,0.498333,0.486733,0.6275,0.180971,4548.0
75%,548.5,3.0,1.0,10.0,0.0,5.0,1.0,2.0,0.655417,0.608602,0.729791,0.233206,5956.0
max,731.0,4.0,1.0,12.0,1.0,6.0,1.0,3.0,0.861667,0.840896,0.9725,0.507463,8714.0


In [5]:
# Print each column's dtype and non-null count, plus overall memory usage.
df_bikes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   season      731 non-null    float64
 2   yr          731 non-null    float64
 3   mnth        731 non-null    int64  
 4   holiday     731 non-null    float64
 5   weekday     731 non-null    float64
 6   workingday  731 non-null    float64
 7   weathersit  731 non-null    int64  
 8   temp        731 non-null    float64
 9   atemp       731 non-null    float64
 10  hum         731 non-null    float64
 11  windspeed   731 non-null    float64
 12  cnt         731 non-null    int64  
dtypes: float64(9), int64(4)
memory usage: 74.4 KB


### Check for missing values (누락된 값이 있는지 확인)

In [6]:
# Total number of missing cells: per-column NaN counts, summed over all columns.
df_bikes.isna().sum().sum()

0

### Define Features and Target

In [7]:
# Positional split: every column except the last is a feature; the last
# column is the prediction target.
# NOTE(review): the feature matrix keeps `instant`, which looks like a running
# record number in the preview above — confirm it is meant to be a feature.
X, y = df_bikes.iloc[:, :-1], df_bikes.iloc[:, -1]

In [8]:
# Peek at the first five target values to confirm the right column was selected.
y[:5]

0     985
1     801
2    1349
3    1562
4    1600
Name: cnt, dtype: int64

### Split Train/Test dataset

In [9]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation. No test_size is passed, so
# scikit-learn's default split ratio applies; random_state pins the shuffle
# so the same rows land in each partition on every run.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=2)

### XGBoost Regression

In [10]:
from xgboost import XGBRegressor

# Build the regressor with library defaults (no hyperparameters are passed).
xg_reg = XGBRegressor()

### Training

In [11]:
# Fit on the training split only; as the cell's last expression, the value
# returned by fit() is displayed below.
xg_reg.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=16,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

### Evaluation

In [12]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Score the fitted model on the held-out test split.
y_pred = xg_reg.predict(X_test)

# Mean squared error first, then its square root (RMSE), which is expressed
# in the same units as the target.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("RMSE: %0.2f" % rmse)

RMSE: 705.11


## Cross Validation

In [17]:
from sklearn.model_selection import cross_val_score

# Cross-validate a fresh, default-configured regressor on the full dataset.
# The scorer is the *negated* MSE, so every returned value is <= 0 and must
# have its sign flipped before taking a square root.
# NOTE(review): 80 folds over 731 rows leaves only ~9 rows per validation
# fold, which makes per-fold scores noisy — confirm this fold count is intended.
model = XGBRegressor()

scores = cross_val_score(
    model,
    X,
    y,
    scoring='neg_mean_squared_error',
    cv=80,
)

scores

array([ -191664.33447728,   -84140.03058168,  -372711.54782705,
         -97126.78545208,   -99000.89227728,  -147258.04833604,
        -226738.29671505,  -169838.3467478 ,  -144021.04711191,
        -452346.6366126 ,  -465389.57086017,  -514448.95312456,
        -193362.84309763,  -164215.14824568,  -133099.46089379,
        -176913.64874747,  -100323.76751977,  -192440.91279976,
        -115741.49679338,  -361721.17236543,  -154535.04504625,
        -270504.21367731,  -184064.1780902 ,  -168364.96403725,
        -199572.66999494,  -819716.24425051,  -527066.24039023,
         -83480.1914151 ,  -684856.38360428,  -167908.70196648,
        -461688.22082591,  -189963.0282499 ,  -237438.02020257,
        -117862.79772054,  -216790.00621115, -1130006.89760311,
        -479252.23086689,  -120247.83582857, -1811329.05400238,
        -266310.40541816,  -247869.87651676,  -106524.68468097,
         -93602.21073276,  -359281.33822293,  -360111.69526772,
        -373550.59953799,  -628481.05834

In [18]:
# Flip the sign of the negated-MSE scores, then take the square root to get
# one RMSE value per fold.
rmse = np.sqrt(np.negative(scores))

# Per-fold RMSE, rounded for readability.
print('RMSE:', rmse.round(2))

# Mean RMSE across all folds.
print('Avg RMSE: %0.2f' % rmse.mean())

RMSE: [ 437.79  290.07  610.5   311.65  314.64  383.74  476.17  412.11  379.5
  672.57  682.19  717.25  439.73  405.23  364.83  420.61  316.74  438.68
  340.21  601.43  393.11  520.1   429.03  410.32  446.74  905.38  725.99
  288.93  827.56  409.77  679.48  435.85  487.28  343.31  465.61 1063.02
  692.28  346.77 1345.86  516.05  497.87  326.38  305.94  599.4   600.09
  611.19  792.77  827.61  936.43  767.34  669.97  788.56  561.67  513.93
 1107.87  697.32  823.68  442.39  335.38  787.83  580.61  797.2   441.97
  750.92  851.89  860.48  547.42  461.44 1085.84  298.38 1256.89  694.76
 1672.97 1252.5   327.35 1499.84 1356.07 1361.26 1063.34 1653.13]
Avg RMSE: 656.95
