In [4]:
import pandas as pd
import numpy as np
import time

from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV as HRSCV
from xgboost import XGBRegressor

from scipy.stats import norm

from project_module import regression_report
from project_module.feature_selection import SelectKBestByCoefficient

In [5]:
# load data
x_train = np.load('x_train.npy')
y_train = np.load('y_train.npy')
x_test = np.load('x_test.npy')
y_test = np.load('y_test.npy')

print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(1095, 89) (1095,) (365, 89) (365,)


## 解題步驟：

1. 讀取 x_train.npy, y_train.npy, x_test.npy, y_test.npy
2. 先以上課的知識或 default hyperparameter 調整出一個不會 over-fitting 太多的 XGBoost 模型
3. 以該組超參數為基準，搜尋附近的參數(可以用自己偏好的搜尋策略)
4. 將最終調整結果與一開始的模型做比較，誤差是否有降低
5. 請比較 Random Forest, XGBoost(有時間的同學可以增加 GBDT, Adaboost) 的超參數搜尋時間與誤差(記得要控制 n_estimators 等會影響到時間的參數，使其叫)

In [12]:
def get_XGBR(params: dict) -> XGBRegressor:
    XGBR = XGBRegressor(**params)
    return XGBR

best_params = {
    'eta':0.3, 'gamma':0, 'max_depth':6, 'ambda':1, 'alpha':0,
    'max_depth':6, 'min_child_weight':4, 'subsample':1,
    'objective':'reg:squarederror', 'scale_pos_weight':1,
    'colsample_bytree':1, 'colsample_bylevel':1, 'colsample_bynode':1,
}

XGBR = get_XGBR(best_params)
XGBR.fit(x_train, y_train)

pred = XGBR.predict(x_test)
regression_report(y_test, pred)

Parameters: { "ambda" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


mse = 1031206072.6127
mae = 17231.6043
rmse = 32112.3975
mape = 0.0954


## [XGBoost 官方文檔](https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn)

## 計算 XGBoost 超參數搜尋時間

In [5]:
""" Your code here: 搜尋超參數，並計算搜尋時間 """

HRSCV ended in 122.0842s

Best score is 0.1006
Best params is {'subsample': 0.76, 'objective': 'reg:squarederror', 'n_estimators': 192, 'min_child_weight': 0.6370665120808605, 'max_depth': 5, 'lambda': 0.56, 'eta': 0.02573989947635369, 'alpha': 0.41000000000000003}


## 計算 XGBoost 單輪訓練時間

In [6]:
""" Your code here: 使用搜尋到的參數分析在 testing data 上的誤差表現，並計算時間 """

XGBoost was trained in 0.2171s

mse = 1010869446.2126
mae = 15842.4891
rmse = 31794.1731
mape = 0.0904


## 計算 Random Forest 超參數搜尋時間

In [None]:
""" Your code here: 搜尋 Random Forest 超參數，並計算搜尋時間 """

## 計算 Random Forest 單輪訓練時間

In [8]:
""" Your code here: 使用搜尋到的參數分析在 testing data 上的誤差表現，並計算時間 """

Random Forest was trained in 0.1777s

mse = 1192438196.9586
mae = 18699.1861
rmse = 34531.6984
mape = 0.1060
