<a href="https://colab.research.google.com/github/jwc22-11/lgdacon/blob/main/HM/LGBMRegressor(scaled).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
import sklearn.metrics as metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import PolynomialFeatures

In [2]:
def seed_everything(seed):
    """Seed the RNG sources this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's global generator.

    Parameters
    ----------
    seed : int
        Value handed to every seeding call.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed for the whole notebook

In [3]:
def lg_nrmse(gt, preds):
    """Weighted sum of per-target NRMSE over the first 14 target columns.

    For every target column the RMSE is divided by the mean absolute value of
    the ground truth for that column. The first 8 columns (Y_01 .. Y_08) are
    weighted by 1.2 (i.e. a 20% bonus weight), the remaining 6 by 1.0.

    The RMSE is computed directly with NumPy so the function does not depend
    on the ``squared=`` keyword of ``sklearn.metrics.mean_squared_error``.

    Parameters
    ----------
    gt : array-like of shape (n_samples, >=14)
        Ground-truth targets. Anything ``np.asarray`` can convert is accepted
        (ndarray, DataFrame, nested lists). Columns past the 14th are ignored.
    preds : array-like of shape (n_samples, >=14)
        Predicted targets, row- and column-aligned with ``gt``.

    Returns
    -------
    numpy.float64
        ``1.2 * sum(nrmse[:8]) + 1.0 * sum(nrmse[8:14])``.

    Raises
    ------
    IndexError
        If either input is not 2-D or has fewer than 14 columns.
    ValueError
        If the two inputs have a different number of rows.
    """
    n_targets = 14   # Y_01 .. Y_14 (the 'ID' column must already be dropped)
    n_weighted = 8   # Y_01 .. Y_08 receive the 1.2 weight

    gt = np.asarray(gt, dtype=float)
    preds = np.asarray(preds, dtype=float)

    for label, arr in (('gt', gt), ('preds', preds)):
        if arr.ndim != 2 or arr.shape[1] < n_targets:
            raise IndexError(
                f'{label} must be 2-D with at least {n_targets} columns, '
                f'got shape {arr.shape}')
    if gt.shape[0] != preds.shape[0]:
        raise ValueError(
            f'gt and preds have different numbers of rows: '
            f'{gt.shape[0]} != {preds.shape[0]}')

    gt = gt[:, :n_targets]
    preds = preds[:, :n_targets]

    # Column-wise RMSE, normalised by the column-wise mean |ground truth|.
    rmse = np.sqrt(np.mean((gt - preds) ** 2, axis=0))
    all_nrmse = rmse / np.mean(np.abs(gt), axis=0)

    score = 1.2 * np.sum(all_nrmse[:n_weighted]) + 1.0 * np.sum(all_nrmse[n_weighted:n_targets])
    return score

In [4]:
%ls

[0m[01;34msample_data[0m/


In [7]:
# Load the train and test tables from the working directory and drop the 'ID'
# column from each so that only feature/target columns remain.
train_df, test_df = (
    pd.read_csv(csv_path).drop(columns=['ID'])
    for csv_path in ('train.csv', 'test.csv')
)

In [8]:
train_df

Unnamed: 0,X_01,X_02,X_03,X_04,X_05,X_06,X_07,X_08,X_09,X_10,...,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
0,70.544,103.320,67.47,1,101.892,74.983,29.45,62.38,245.71,0.0,...,29.632,16.083,4.276,-25.381,-25.529,-22.769,23.792,-25.470,-25.409,-25.304
1,69.524,103.321,65.17,1,101.944,72.943,28.73,61.23,233.61,0.0,...,33.179,16.736,3.229,-26.619,-26.523,-22.574,24.691,-26.253,-26.497,-26.438
2,72.583,103.320,64.07,1,103.153,72.943,28.81,105.77,272.20,0.0,...,31.801,17.080,2.839,-26.238,-26.216,-22.169,24.649,-26.285,-26.215,-26.370
3,71.563,103.320,67.57,1,101.971,77.022,28.92,115.21,255.36,0.0,...,34.503,17.143,3.144,-25.426,-25.079,-21.765,24.913,-25.254,-25.021,-25.345
4,69.524,103.320,63.57,1,101.981,70.904,29.68,103.38,241.46,0.0,...,32.602,17.569,3.138,-25.376,-25.242,-21.072,25.299,-25.072,-25.195,-24.974
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39602,66.465,103.320,62.27,1,103.150,66.825,30.20,77.83,298.05,0.0,...,29.194,16.582,3.410,-26.486,-26.581,-22.772,24.261,-26.491,-26.584,-26.580
39603,66.465,103.321,62.77,1,102.021,66.825,29.21,102.25,270.67,0.0,...,29.859,15.659,3.406,-27.308,-27.203,-24.674,23.427,-27.250,-27.334,-27.325
39604,68.504,103.320,64.67,1,103.144,68.864,29.96,102.61,198.07,0.0,...,24.720,16.823,3.215,-26.502,-26.687,-22.577,24.301,-26.388,-26.425,-26.601
39605,66.465,103.320,63.67,1,102.025,67.845,30.30,112.60,275.52,0.0,...,26.412,15.757,4.216,-26.760,-26.634,-24.066,23.305,-26.536,-26.751,-26.635


In [9]:
test_df

Unnamed: 0,X_01,X_02,X_03,X_04,X_05,X_06,X_07,X_08,X_09,X_10,...,X_47,X_48,X_49,X_50,X_51,X_52,X_53,X_54,X_55,X_56
0,68.504,103.321,76.67,1,101.867,73.963,30.51,63.57,239.80,0.0,...,1,1,17227.63,138.130429,129.460682,141.506570,133.427229,129.711498,133.138096,121.859684
1,67.485,103.320,69.37,1,101.992,67.845,28.03,116.99,189.23,0.0,...,1,1,17134.53,136.148839,128.266277,145.911745,131.196417,132.411480,133.629025,124.178623
2,69.524,103.320,68.97,1,101.884,77.022,29.65,205.68,214.93,0.0,...,1,1,14860.83,120.447446,119.988804,132.099908,120.450155,130.051708,128.252972,114.475628
3,69.524,103.320,65.87,1,101.866,73.963,28.15,103.38,180.80,0.0,...,1,1,15252.53,133.994695,125.069180,147.507669,123.142653,125.963665,139.666592,126.589253
4,73.603,103.321,66.67,1,101.891,74.983,29.92,71.20,231.93,0.0,...,1,1,10752.23,137.918202,135.116192,138.600473,127.173033,137.252712,134.411335,124.020016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39603,68.504,103.320,63.97,1,103.157,68.864,29.49,116.35,284.16,0.0,...,1,1,62123.53,127.741246,126.494312,139.119905,125.271109,128.284572,140.176945,128.292843
39604,68.504,103.320,61.37,1,103.137,68.864,32.29,116.28,272.41,0.0,...,1,1,61844.13,127.767377,124.062809,138.238664,119.879393,127.322529,137.312047,131.570614
39605,69.524,103.320,63.67,1,103.149,69.884,30.00,113.05,295.54,0.0,...,1,1,60277.53,128.593640,124.774037,138.659624,123.999571,126.075542,135.656132,127.671108
39606,67.485,103.321,61.77,1,103.148,67.845,32.05,115.05,267.26,0.0,...,1,1,60236.73,121.110646,125.471699,134.989984,120.889578,129.296909,132.673977,131.882893


In [10]:
# Split the training frame by column name: columns whose name contains 'X'
# are model inputs, columns whose name contains 'Y' are the regression targets.
train_x = train_df.filter(regex='X') # Input: X features
train_y = train_df.filter(regex='Y') # Output: Y features

In [11]:
# Polynomial expansion of the input features without the constant bias column.
# With the default settings the generated names include squares and pairwise
# products of the original columns (e.g. 'X_55^2', 'X_55 X_56').
poly = PolynomialFeatures(include_bias=False).fit(train_x)
train_poly = poly.transform(train_x)

In [12]:
train_poly.shape

(39607, 1652)

In [13]:
# Apply the polynomial expansion that was fitted on train_x to the test features.
test_poly = poly.transform(test_df)

In [14]:
test_poly.shape

(39608, 1652)

In [15]:
# Standardise the expanded features with statistics learned from the training
# matrix, then apply the same transform to the test matrix.
# NOTE(review): the scaler is fitted on every row of train_poly, including the
# rows that the later train_test_split call holds out for validation.
ss = StandardScaler().fit(train_poly)
train_scaled, test_scaled = ss.transform(train_poly), ss.transform(test_poly)

In [16]:
# Hold out 20% of the rows as a validation set; random_state=42 makes the
# split repeatable. Inputs come from train_scaled, targets from train_y.
train_input, val_input, train_target, val_target = train_test_split(
    train_scaled, train_y, test_size=0.2, random_state=42)

In [17]:
# Report the shape of each split. The trailing space in the two target labels
# reproduces the original double-space alignment of the printed lines.
for split_label, split_data in (
    ('train_input', train_input),
    ('val_input', val_input),
    ('train_target ', train_target),
    ('val_target ', val_target),
):
    print(f'{split_label} : {split_data.shape}')

train_input : (31685, 1652)
val_input : (7922, 1652)
train_target  : (31685, 14)
val_target  : (7922, 14)


In [18]:
poly.get_feature_names_out()

array(['X_01', 'X_02', 'X_03', ..., 'X_55^2', 'X_55 X_56', 'X_56^2'],
      dtype=object)

In [19]:
#Regression Model Fit
lgbm = MultiOutputRegressor(LGBMRegressor(n_estimators=1000,
                                            learning_rate=0.01, 
                                            subsample=0.8, 
                                            colsample_bytree = 0.8, 
                                            max_depth=7)).fit(train_input, train_target)
print('Done.')

Done.


In [20]:
# Predict all targets for the held-out validation rows.
pred = lgbm.predict(val_input)
print('Done.')

Done.


In [21]:
# Convert the validation targets to a plain ndarray so they can be indexed
# positionally (`[:, idx]`). np.asarray is used instead of `.to_numpy()` so the
# cell is idempotent: re-running it on an already-converted array is a no-op
# rather than an AttributeError.
val_target = np.asarray(val_target)

In [22]:
lg_nrmse(val_target, pred)

1.9347243256593276

In [23]:
# Predict all targets for the scaled test matrix; used to fill the submission.
preds = lgbm.predict(test_scaled)

In [24]:
# Load the submission template; its non-'ID' columns are overwritten with predictions below.
submit = pd.read_csv('sample_submission.csv')

In [25]:
# Copy the predictions into the submission template. Target columns are picked
# by name (everything except 'ID') rather than by `idx - 1` arithmetic, so the
# mapping stays correct wherever the 'ID' column sits in the file: the k-th
# non-ID column receives the k-th column of `preds`.
target_cols = [col for col in submit.columns if col != 'ID']
if preds.shape[1] != len(target_cols):
    raise ValueError(
        f'preds has {preds.shape[1]} columns but the submission template '
        f'has {len(target_cols)} target columns')
submit[target_cols] = preds
print('Done.')

Done.


In [26]:
# Write the filled submission to disk without the pandas row index.
submit.to_csv('submit_lgbm_pf2.csv', index=False)

In [None]:
# Load three saved submission files and drop their 'ID' column so that only
# the prediction columns remain.
# NOTE(review): this rebinds `lgbm`, which earlier in the notebook holds the
# fitted MultiOutputRegressor, to a DataFrame of predictions; re-running the
# earlier `lgbm.predict(...)` cells after this one will fail.
# NOTE(review): `rf` / `xgb` presumably hold random-forest and XGBoost
# submissions (judging by the names only) - confirm the origin of these files.
lgbm, rf, xgb = (
    pd.read_csv(submission_path).drop(['ID'], axis=1)
    for submission_path in ('submit_lgbm_pf2.csv', 'm0818.csv', 'm0819b.csv')
)

In [None]:
lgbm

In [None]:
rf

In [None]:
xgb

In [None]:
# Equal-weight ensemble: element-wise sum of the three prediction frames,
# divided by the number of frames. The bare last line displays the result.
sum_ = lgbm + xgb + rf
div_ = sum_ / 3
div_

In [None]:
# Re-attach the 'ID' column from the submission frame to the averaged
# predictions (column-wise concat) and write the ensemble submission to disk.
submit_ = pd.concat([submit[['ID']], div_], axis=1)
submit_.to_csv('assemble_b.csv', index=False)