<a href="https://colab.research.google.com/github/jwc22-11/lgdacon/blob/main/HM/LGBMRegressor_RGS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
import sklearn.metrics as metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

In [3]:
%ls

[0m[01;34msample_data[0m/  sample_submission.csv  test.csv  train.csv


In [4]:
train_df = pd.read_csv('train.csv')

In [5]:
train_df

Unnamed: 0,ID,X_01,X_02,X_03,X_04,X_05,X_06,X_07,X_08,X_09,...,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
0,TRAIN_00001,70.544,103.320,67.47,1,101.892,74.983,29.45,62.38,245.71,...,29.632,16.083,4.276,-25.381,-25.529,-22.769,23.792,-25.470,-25.409,-25.304
1,TRAIN_00002,69.524,103.321,65.17,1,101.944,72.943,28.73,61.23,233.61,...,33.179,16.736,3.229,-26.619,-26.523,-22.574,24.691,-26.253,-26.497,-26.438
2,TRAIN_00003,72.583,103.320,64.07,1,103.153,72.943,28.81,105.77,272.20,...,31.801,17.080,2.839,-26.238,-26.216,-22.169,24.649,-26.285,-26.215,-26.370
3,TRAIN_00004,71.563,103.320,67.57,1,101.971,77.022,28.92,115.21,255.36,...,34.503,17.143,3.144,-25.426,-25.079,-21.765,24.913,-25.254,-25.021,-25.345
4,TRAIN_00005,69.524,103.320,63.57,1,101.981,70.904,29.68,103.38,241.46,...,32.602,17.569,3.138,-25.376,-25.242,-21.072,25.299,-25.072,-25.195,-24.974
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39602,TRAIN_39603,66.465,103.320,62.27,1,103.150,66.825,30.20,77.83,298.05,...,29.194,16.582,3.410,-26.486,-26.581,-22.772,24.261,-26.491,-26.584,-26.580
39603,TRAIN_39604,66.465,103.321,62.77,1,102.021,66.825,29.21,102.25,270.67,...,29.859,15.659,3.406,-27.308,-27.203,-24.674,23.427,-27.250,-27.334,-27.325
39604,TRAIN_39605,68.504,103.320,64.67,1,103.144,68.864,29.96,102.61,198.07,...,24.720,16.823,3.215,-26.502,-26.687,-22.577,24.301,-26.388,-26.425,-26.601
39605,TRAIN_39606,66.465,103.320,63.67,1,102.025,67.845,30.30,112.60,275.52,...,26.412,15.757,4.216,-26.760,-26.634,-24.066,23.305,-26.536,-26.751,-26.635


In [6]:
train_x = train_df.filter(regex='X') # Input : X Featrue
train_y = train_df.filter(regex='Y') # Output : Y Feature

In [7]:
# dataframe -> numpy array
train_x = train_x.to_numpy()
train_y = train_y.to_numpy()

In [8]:
def lg_nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(14): # ignore 'ID'
        rmse = metrics.mean_squared_error(gt[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:14])
    return score

In [9]:
train_input, val_input, train_target, val_target = train_test_split(
    train_x, train_y, test_size=0.1, random_state=42)

In [10]:
print(train_input.shape)
print(val_input.shape)
print(train_target.shape)
print(val_target.shape)

(35646, 56)
(3961, 56)
(35646, 14)
(3961, 14)


In [11]:
# RandomizedSearchCV
params = {'estimator__n_estimators': [100, 200, 500, 1000, 2000],
            'estimator__learning_rate': [0.1, 0.08, 0.05, 0.01],
            'estimator__max_depth': [6, 7, 8],
            'estimator__colsample_bytree': [0.8, 0.9, 1.0],
            'estimator__subsample': [0.75, 0.8, 0.9, 1.0]}

lg = LGBMRegressor()
lgbm_cv = RandomizedSearchCV(MultiOutputRegressor(lg), params, random_state=42, cv=3, n_iter=25)

In [12]:
lgbm_cv.fit(train_input, train_target)

RandomizedSearchCV(cv=3,
                   estimator=MultiOutputRegressor(estimator=LGBMRegressor()),
                   n_iter=25,
                   param_distributions={'estimator__colsample_bytree': [0.8,
                                                                        0.9,
                                                                        1.0],
                                        'estimator__learning_rate': [0.1, 0.08,
                                                                     0.05,
                                                                     0.01],
                                        'estimator__max_depth': [6, 7, 8],
                                        'estimator__n_estimators': [100, 200,
                                                                    500, 1000,
                                                                    2000],
                                        'estimator__subsample': [0.75, 0.8, 0.9,
                

In [13]:
print(lgbm_cv.best_params_)

{'estimator__subsample': 0.8, 'estimator__n_estimators': 1000, 'estimator__max_depth': 7, 'estimator__learning_rate': 0.01, 'estimator__colsample_bytree': 0.8}


In [14]:
#Regression Model Fit
lgbm = MultiOutputRegressor(LGBMRegressor(n_estimators=1000,
                                            learning_rate=0.01, 
                                            subsample=0.8, 
                                            colsample_bytree = 0.8, 
                                            max_depth=7)).fit(train_input, train_target)
print('Done.')

Done.


In [15]:
pred = lgbm.predict(val_input)
print('Done.')

Done.


In [16]:
lg_nrmse(val_target, pred)

1.9318340731739099

In [None]:
test_x = pd.read_csv('test.csv').drop(columns=['ID'])

In [None]:
preds = lgbm.predict(test_x)

In [None]:
submit = pd.read_csv('sample_submission.csv')

In [None]:
for idx, col in enumerate(submit.columns):
    if col=='ID':
        continue
    submit[col] = preds[:,idx-1]
print('Done.')

Done.


In [None]:
submit.to_csv('submit_lgbm_cv.csv', index=False)