<a href="https://colab.research.google.com/github/jwc22-11/lgdacon/blob/main/HM/re_LGBMRegressor_RGS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from lightgbm import LGBMRegressor

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

import sklearn.metrics as metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error

# hyperparameter tuning
from scipy.stats import randint
from sklearn.utils.fixes import loguniform
from sklearn.model_selection import RandomizedSearchCV

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

In [4]:
def lg_nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(14): # ignore 'ID'
        rmse = metrics.mean_squared_error(gt[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:14])
    return score

In [5]:
train_df = pd.read_csv('train.csv').drop(columns=['ID'])
test_df = pd.read_csv('test.csv').drop(columns=['ID'])

In [6]:
train_x = train_df.filter(regex='X') # Input : X Featrue
train_y = train_df.filter(regex='Y')

In [7]:
ss = StandardScaler()
ss.fit(train_x)
train_scaled = ss.transform(train_x)
test_scaled = ss.transform(test_df)

In [8]:
x_train, x_test, y_train, y_test = train_test_split(
    train_scaled, train_y, test_size=0.1, random_state=42)

In [9]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(35646, 56)
(3961, 56)
(35646, 14)
(3961, 14)


In [17]:
from sklearn.pipeline import Pipeline

regressor = MultiOutputRegressor(LGBMRegressor(random_state = 42))
model = Pipeline([('regression', regressor)])

# hyperparameter tuning
param_grid = {
    'regression__estimator__num_leaves' : randint(10, 100),
    'regression__estimator__max_depth' : randint(1, 10),
    'regression__estimator__min_child_samples' : randint(10, 40),
    'regression__estimator__n_estimators' : randint(50, 300),
    'regression__estimator__learning_rate' : loguniform(1e-3, 0.1),
    'regression__estimator__subsample' : loguniform(0.6, 1.0),
    'regression__estimator__subsample_freq' : randint(1, 5)
}

In [18]:
gs_lgbm = RandomizedSearchCV(model, param_grid, random_state=42, cv=3, n_iter=25)

In [19]:
gs_lgbm = gs_lgbm.fit(x_train, y_train)

In [20]:
gs_lgbm.best_estimator_

Pipeline(steps=[('regression',
                 MultiOutputRegressor(estimator=LGBMRegressor(learning_rate=0.02878805718308923,
                                                              max_depth=8,
                                                              min_child_samples=31,
                                                              n_estimators=268,
                                                              num_leaves=44,
                                                              random_state=42,
                                                              subsample=0.7636802181498231,
                                                              subsample_freq=1)))])

In [21]:
# model save & load
import joblib

lgbr = gs_lgbm.best_estimator_

joblib.dump(lgbr, 'lgbm_model.joblib')
lgbr = joblib.load('lgbm_model.joblib')

In [22]:
y_pred = lgbr.predict(x_test)

In [27]:
y_test = y_test.to_numpy()

In [28]:
lg_nrmse(y_test,y_pred)

1.932152280373224