## **Stacking 조합을 위한 개별 알고리즘 모델링**

### 1) Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lr=LinearRegression()
lr.fit(X,y)
y_pred_log=lr.predict(test)
y_pred=np.expm1(y_pred_log)

In [None]:
score = cv_rmse(lr, X, y, kfolds)
print("Linear Regression: {:.4f} ({:.4f})".format(score.mean(), score.std()))

Linear Regression: 2.1233 (0.0849)


### 2) Ridge

In [None]:
from sklearn.linear_model import RidgeCV

In [None]:
alphas_ridge = [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 20, 50, 100, 200, 500, 1000]

ridge=RidgeCV(alphas=alphas_ridge, cv=kfolds)
ridge.fit(X, y)

y_pred_log = ridge.predict(test)
y_pred=np.expm1(y_pred_log)

In [None]:
score = cv_rmse(ridge, X, y, kfolds)
print("Ridge: {:.4f} ({:.4f})".format(score.mean(), score.std()))

Ridge: 2.1066 (0.0808)


### 3) Lasso

In [None]:
from sklearn.linear_model import LassoCV

In [None]:
alphas_lasso = [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1]

lasso = LassoCV(alphas=alphas_lasso, cv=kfolds, random_state=156)
lasso.fit(X, y)

y_pred_log = lasso.predict(test)
y_pred=np.expm1(y_pred_log)

In [None]:
score = cv_rmse(lasso, X, y, kfolds)
print("Lasso: {:.4f} ({:.4f})".format(score.mean(), score.std()))

Lasso: 2.1160 (0.0679)


### 4) ElasticNet

In [None]:
from sklearn.linear_model import ElasticNetCV

In [None]:
alphas_elasticnet = [0.0001, 0.001, 0.01, 0.1, 1.0]
l1_ratios = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]  # 1.0은 Lasso, 0.0은 Ridge

elasticnet = ElasticNetCV(alphas=alphas_elasticnet,
                          l1_ratio=l1_ratios,
                          cv=kfolds,
                          max_iter=10000,
                          random_state=156)
elasticnet.fit(X, y)

y_pred_log = elasticnet.predict(test)
y_pred=np.expm1(y_pred_log)

In [None]:
score = cv_rmse(elasticnet, X, y, kfolds)
print("elasticnet: {:.4f} ({:.4f})".format(score.mean(), score.std()))

elasticnet: 2.1097 (0.0795)


### 5) XGBoost

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
xgb = XGBRegressor(random_state=156)

param_grid_xgb = {
     'n_estimators': [100, 300, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.005, 0.01, 0.05, 0.1],
    'num_leaves': [20, 30, 40, 50, 60],
    'subsample': [0.5, 0.7, 0.9, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0]
}

grid_search_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid_xgb,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1
)

grid_search_xgb.fit(X, y)

xgb_best_params = grid_search_xgb.best_params_

print("XGBoost 최적 하이퍼파라미터:", xgb_best_params)

Fitting 5 folds for each of 2160 candidates, totalling 10800 fits
XGBoost 최적 하이퍼파라미터: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 500, 'num_leaves': 20, 'subsample': 0.5}


In [None]:
xgb = XGBRegressor(**xgb_best_params, random_state=156)
xgb.fit(X, y)

y_pred_log = xgb.predict(test)
y_pred=np.expm1(y_pred_log)

In [None]:
score = cv_rmse(xgb, X, y, kfolds)
print("XGBoost: {:.4f} ({:.4f})".format(score.mean(), score.std()))

XGBoost: 1.8188 (0.0725)


### 6) LightGBM

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid_lgbm = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.005, 0.01, 0.05, 0.1],
    'num_leaves': [20, 30, 40, 50, 60],
    'subsample': [0.5, 0.7, 0.9, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0]
}

grid_search_lgbm = GridSearchCV(
    estimator=LGBMRegressor(random_state=156),
    param_grid=param_grid_lgbm,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1
)

grid_search_lgbm.fit(X, y)

lgbm_best_params = grid_search_lgbm.best_params_

print("LightGBM 최적 하이퍼파라미터:", lgbm_best_params)

Fitting 5 folds for each of 2160 candidates, totalling 10800 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000163 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 332
[LightGBM] [Info] Number of data points in the train set: 600, number of used features: 36
[LightGBM] [Info] Start training from score 9.951971
LightGBM 최적 하이퍼파라미터: {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'num_leaves': 20, 'subsample': 0.5}


In [None]:
lgbm = LGBMRegressor(**lgbm_best_params, random_state=156)
lgbm.fit(X, y)

y_pred_log = lgbm.predict(test)
y_pred=np.expm1(y_pred_log)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000108 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 332
[LightGBM] [Info] Number of data points in the train set: 600, number of used features: 36
[LightGBM] [Info] Start training from score 9.951971


In [None]:
score = cv_rmse(lgbm, X, y, kfolds)
print("LightGBM: {:.4f} ({:.4f})".format(score.mean(), score.std()))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 299
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 35
[LightGBM] [Info] Start training from score 9.941734
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 294
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 35
[LightGBM] [Info] Start training from score 9.937776
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000184 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 289
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 35
[LightGBM] [Info] Start training fro

### 7) Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid_rf = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

grid_search_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=156),
    param_grid=param_grid_rf,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1
)

grid_search_rf.fit(X, y)

rf_best_params = grid_search_rf.best_params_

print("Random Forest 최적 하이퍼파라미터:", rf_best_params)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Random Forest 최적 하이퍼파라미터: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}


In [None]:
rf = RandomForestRegressor(**rf_best_params, random_state=156)
rf.fit(X, y)

y_pred_log = rf.predict(test)
y_pred=np.expm1(y_pred_log)

In [None]:
score = cv_rmse(rf, X, y, kfolds)
print("Random Forest: {:.4f} ({:.4f})".format(score.mean(), score.std()))

Random Forest: 1.8249 (0.0937)


## **Stacking**

### **선형 모델과 비선형 모델의 조합 스태킹 앙상블**

추가 스태킹 모델링에서는 선형 모델 1개와 비선형 모델 1개 이상의 조합을 고려하였다.

- 선형 모델
  - 단일 모델로 진행한 예측 결과를 바탕으로, 선형 모델인 Linear Regression, Ridge, Lasso, Elastic Net 중 RMSE가 가장 낮게 나타난 Ridge와, 그 다음으로 낮았던 Elastic Net을 선택하였다.

- 비선형 모델
  - XGBoost, LightGBM, Random Forest 중 RMSE가 가장 낮은 XGBoost와 두 번째로 낮은 Random Forest를 비선형 base 모델로 선정하였다.

최종 메타 모델로는 가장 낮은 RMSE를 기록한 XGBoost를 선택하였으며, 동일한 base 모델 조합을 사용한 상태에서 메타 모델만 Random Forest로 변경한 구성도 추가로 실험하였다.

 [ 모델링 구성 ]

1. Ridge + XGBoost + Random Forest → 메타 모델: XGBoost

2. Elastic Net + XGBoost + Random Forest → 메타 모델: XGBoost

3. Ridge + XGBoost + Random Forest → 메타 모델: Random Forest
  - 1번과 base 모델 구성은 동일하며, 메타 모델만 Random Forest로 변경

##### Ridge + XGBoost + Random Forest -> 최종 메타 모델: XGBoost

In [None]:
def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds):
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    train_fold_pred = np.zeros((X_train_n.shape[0], 1))
    test_pred = np.zeros((X_test_n.shape[0], n_folds))
    print(model.__class__.__name__, 'model 시작')

    for folder_counter, (train_index, valid_index) in enumerate(kf.split(X_train_n)):
        print(f'\t폴드 세트: {folder_counter} 시작')
        X_tr = X_train_n[train_index]
        y_tr = y_train_n[train_index]
        X_val = X_train_n[valid_index]

        model.fit(X_tr, y_tr)
        train_fold_pred[valid_index, :] = model.predict(X_val).reshape(-1, 1)
        test_pred[:, folder_counter] = model.predict(X_test_n)

    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)
    return train_fold_pred, test_pred_mean

최종 메타 모델 XGBoost 하이퍼파라미터

In [None]:
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
from hyperopt.pyll.base import scope
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import numpy as np

In [None]:
search_space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 1000, 50)),
    'max_depth': scope.int(hp.quniform('max_depth', 3, 10, 1)),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0)
}


def objective(params):
    params['n_estimators'] = int(params['n_estimators'])
    params['max_depth'] = int(params['max_depth'])

    model = XGBRegressor(
        **params,
        random_state=156,
        n_jobs=-1
    )

    score = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5).mean()
    return {'loss': -score, 'status': STATUS_OK}

trials = Trials()

best = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(156)
)

print("Best Hyperparameters:", best)

100%|██████████| 50/50 [02:15<00:00,  2.70s/trial, best loss: 3.15150139605864]
Best Hyperparameters: {'colsample_bytree': np.float64(0.697894851629624), 'learning_rate': np.float64(0.018022488632362756), 'max_depth': np.float64(4.0), 'n_estimators': np.float64(450.0), 'subsample': np.float64(0.8888649023879256)}


In [None]:
# Base models
ridge=RidgeCV(alphas=alphas_ridge, cv=kfolds)
xgb = XGBRegressor(**xgb_best_params)
rf = RandomForestRegressor(**rf_best_params)

base_models = [ridge, xgb, rf]

X_np = X.values
y_np = y.values
test_np = test.values

ridge_train,ridge_test=get_stacking_base_datasets(ridge, X_np, y_np, test_np, 5)
xgb_train,xgb_test=get_stacking_base_datasets(xgb, X_np, y_np, test_np, 5)
rf_train,rf_test=get_stacking_base_datasets(rf, X_np, y_np, test_np, 5)

Stack_final_X_train=np.concatenate((ridge_train, xgb_train, rf_train),axis=1)
Stack_final_X_test=np.concatenate((ridge_test, xgb_test, rf_test),axis=1)

meta_model=XGBRegressor(
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    learning_rate=best['learning_rate'],
    subsample=best['subsample'],
    colsample_bytree=best['colsample_bytree'],
    random_state=156)

meta_model.fit(Stack_final_X_train,y_np)
Stack_final_log=meta_model.predict(Stack_final_X_test)
Stack_final=np.expm1(Stack_final_log)

RidgeCV model 시작
	폴드 세트: 0 시작
	폴드 세트: 1 시작
	폴드 세트: 2 시작
	폴드 세트: 3 시작
	폴드 세트: 4 시작
XGBRegressor model 시작
	폴드 세트: 0 시작
	폴드 세트: 1 시작
	폴드 세트: 2 시작
	폴드 세트: 3 시작
	폴드 세트: 4 시작
RandomForestRegressor model 시작
	폴드 세트: 0 시작
	폴드 세트: 1 시작
	폴드 세트: 2 시작
	폴드 세트: 3 시작
	폴드 세트: 4 시작


In [None]:
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y, meta_model.predict(Stack_final_X_train)))
print(f'OOF RMSE: {rmse:.4f}')

OOF RMSE: 1.2564


In [None]:
submission_ = submission.copy()
submission_['box_off_num'] = Stack_final
submission_.to_csv("stacking.csv", index=False)

점수: 1551493.0510696012

##### ElasticNet + XGBoost + RandomForest -> 최종 메타 모델: XGBoost

In [None]:
# Base models
elasticnet = ElasticNetCV(alphas=alphas_elasticnet,
                          l1_ratio=l1_ratios,
                          cv=kfolds,
                          max_iter=10000,
                          random_state=156)
xgb = XGBRegressor(**xgb_best_params)
rf = RandomForestRegressor(**rf_best_params)

base_models = [elasticnet, xgb, rf]

X_np = X.values
y_np = y.values
test_np = test.values

elasticnet_train,elasticnet_test=get_stacking_base_datasets(elasticnet, X_np, y_np, test_np, 5)
xgb_train,xgb_test=get_stacking_base_datasets(xgb, X_np, y_np, test_np, 5)
rf_train,rf_test=get_stacking_base_datasets(rf, X_np, y_np, test_np, 5)

Stack_final_X_train=np.concatenate((elasticnet_train, xgb_train, rf_train),axis=1)
Stack_final_X_test=np.concatenate((elasticnet_test, xgb_test, rf_test),axis=1)

meta_model=XGBRegressor(
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    learning_rate=best['learning_rate'],
    subsample=best['subsample'],
    colsample_bytree=best['colsample_bytree'],
    random_state=156)

meta_model.fit(Stack_final_X_train,y_np)
Stack_final_log=meta_model.predict(Stack_final_X_test)
Stack_final=np.expm1(Stack_final_log)

ElasticNetCV model 시작
	폴드 세트: 0 시작
	폴드 세트: 1 시작
	폴드 세트: 2 시작
	폴드 세트: 3 시작
	폴드 세트: 4 시작
XGBRegressor model 시작
	폴드 세트: 0 시작
	폴드 세트: 1 시작
	폴드 세트: 2 시작
	폴드 세트: 3 시작
	폴드 세트: 4 시작
RandomForestRegressor model 시작
	폴드 세트: 0 시작
	폴드 세트: 1 시작
	폴드 세트: 2 시작
	폴드 세트: 3 시작
	폴드 세트: 4 시작


In [None]:
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y, meta_model.predict(Stack_final_X_train)))
print(f'OOF RMSE: {rmse:.4f}')

OOF RMSE: 1.2579


In [None]:
submission_ = submission.copy()
submission_['box_off_num'] = Stack_final
submission_.to_csv("stacking1.csv", index=False)

리더보드 점수: 1608933.116872492

선형 모델로 Ridge를 추가한 스태킹 모델보다 ElasticNet을 추가한 스태킹 모델의 점수가 더 높게 나왔다.

추가적으로 선형 모델로 Ridge를 이용한 모델에서 최종 메타 모델만 RMSE가 2번째로 낮은 값을 보인 Random Forest로 진행

##### Ridge + XGBoost + Random Forest -> 최종 메타 모델: Random Forest

최종 메타 모델 RandomForest 하이퍼파라미터 튜닝

In [None]:
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
from hyperopt.pyll.base import scope
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import numpy as np

In [None]:
search_space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 1000, 50)),
    'max_depth': scope.int(hp.quniform('max_depth', 5, 30, 1)),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 20, 1)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 20, 1))
}


def objective(space):
    rf_clf = RandomForestRegressor(
        n_estimators=int(space['n_estimators']),
        max_depth=int(space['max_depth']),
        min_samples_split=int(space['min_samples_split']),
        random_state=156
    )
    mse = cross_val_score(rf_clf, X, y, cv=3, scoring="neg_mean_squared_error")
    return {'loss': -1 * np.mean(mse), 'status': STATUS_OK}

trials = Trials()

best = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(156)
)

print("Best Hyperparameters:", best)

100%|██████████| 50/50 [05:30<00:00,  6.61s/trial, best loss: 3.3772198865067167]
Best Hyperparameters: {'max_depth': np.float64(29.0), 'min_samples_leaf': np.float64(7.0), 'min_samples_split': np.float64(2.0), 'n_estimators': np.float64(1000.0)}


In [None]:
# Base models
ridge=RidgeCV(alphas=alphas_ridge, cv=kfolds)
xgb = XGBRegressor(**xgb_best_params, random_state=156)
rf = RandomForestRegressor(**rf_best_params, random_state=156)

base_models = [ridge, xgb, rf]

X_np = X.values
y_np = y.values
test_np = test.values

ridge_train,ridge_test=get_stacking_base_datasets(ridge, X_np, y_np, test_np, 5)
xgb_train,xgb_test=get_stacking_base_datasets(xgb, X_np, y_np, test_np, 5)
rf_train,rf_test=get_stacking_base_datasets(rf, X_np, y_np, test_np, 5)

Stack_final_X_train=np.concatenate((ridge_train, xgb_train, rf_train),axis=1)
Stack_final_X_test=np.concatenate((ridge_test, xgb_test, rf_test),axis=1)

meta_model=RandomForestRegressor(
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    min_samples_split=int(best['min_samples_split']),
    random_state=156
)
meta_model.fit(Stack_final_X_train,y_np)
Stack_final_log=meta_model.predict(Stack_final_X_test)
Stack_final=np.expm1(Stack_final_log)

RidgeCV model 시작
	폴드 세트: 0 시작
	폴드 세트: 1 시작
	폴드 세트: 2 시작
	폴드 세트: 3 시작
	폴드 세트: 4 시작
XGBRegressor model 시작
	폴드 세트: 0 시작
	폴드 세트: 1 시작
	폴드 세트: 2 시작
	폴드 세트: 3 시작
	폴드 세트: 4 시작
RandomForestRegressor model 시작
	폴드 세트: 0 시작
	폴드 세트: 1 시작
	폴드 세트: 2 시작
	폴드 세트: 3 시작
	폴드 세트: 4 시작


In [None]:
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y, meta_model.predict(Stack_final_X_train)))
print(f'OOF RMSE: {rmse:.4f}')

OOF RMSE: 0.7365


In [None]:
submission_ = submission.copy()
submission_['box_off_num'] = Stack_final
submission_.to_csv("stacking2.csv", index=False)

리더보드 점수: 1558407.1664824472

최종 메타 모델이 XGBoost일 때보다 점수가 더 높게 나옴을 알 수 있다.

**Stacking 최종 리더보드 점수**: 1551493.0510696012
- Ridge + XGBoost + Random Forest -> XGBoost로 예측한 모델