# lightGBM 실험

In [1]:
import pandas as pd # 데이터 분석
import numpy as np # 행렬 연산
import random # 시드 제어
from pandas import Series, DataFrame
import sys
import time
import re

import seaborn as sns
import matplotlib.pyplot as plt # 시각화
%matplotlib inline

import datetime
from workalendar.asia import SouthKorea # 한국의 공휴일

import lightgbm as lgb
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor # 모델링
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor

import shap
import skimage

# 시드
import os
seed = 12345
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
random.seed(seed)

# load dataset
final_ans = pd.read_csv('section_2week.csv', sep=',')
final_ans_test = pd.read_csv('section_2week_test.csv', sep=',')

final = pd.concat([final_ans, final_ans_test])
final = final.reset_index()
del final['index']

final = final.sort_values(by=['date'], axis=0)

# date의 데이터 형식이 'yyyy-mm-dd'인 string이어서 lightgbm 내부에서 float으로 데이터를 변형시키지
# 못하여 데이터의 형식을 'yyyymmdd'로 변환
date_lst = []
for value in final['date'].values:
    val = re.sub(r"[-]","",value)
    date_lst.append(val)
final['date'] = date_lst

X = final[final_ans.columns.difference(['power'])]
Y = final['power']

X['date'] = X['date'].astype(float)

x_train, x_valtest, y_train, y_valtest = train_test_split(X, Y, test_size = 0.2, shuffle=False, random_state=seed)
x_val, x_test, y_val, y_test = train_test_split(x_valtest, y_valtest, test_size = 0.5, shuffle=False, random_state=seed)
print(x_train.shape, y_train.shape, x_val.shape, y_val.shape, x_test.shape, y_test.shape)

(83271, 30) (83271,) (10409, 30) (10409,) (10409, 30) (10409,)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# lightGBM

In [2]:
train_lgb = lgb.Dataset(x_train, label = y_train)
val_lgb = lgb.Dataset(x_val, label = y_val)
params = {'random_seed':seed, 'bagging_seed':seed,
          'feature_fraction_seed':seed, 'data_random_seed':seed,
          'drop_seed':seed,
          'num_iterations':400,
          'boosting_type':'gbdt', 'objective':'regression_l1',
          'learning_rate':0.05, 'num_leaves':100, 'max_depth':-1,
          'bagging_fraction':0.1, 'feature_fraction':0.8,
          'lambda_l1':0.0, 'lambda_l2':15.0, 'max_bin':300}
model = lgb.train(params, train_lgb, valid_sets = val_lgb,
                  num_boost_round = 2000, early_stopping_rounds = 100,
                  verbose_eval = 200)

lgb_test, lgb_valid = model.predict(x_test), model.predict(x_val)

def smape(array_1, array_2):
    score = 100*np.mean(2*abs(array_1 - array_2)/(abs(array_1)+abs(array_2)))
    return score

# predict과 test set의 smape 계산
lgb_test_smape, lgb_valid_smape = smape(lgb_test, y_test), smape(lgb_valid, y_val)
print('result:')
print(lgb_valid_smape)
print(lgb_test_smape)

Found `num_iterations` in params. Will use it instead of argument


Training until validation scores don't improve for 100 rounds
[200]	valid_0's l1: 0.0317754
[400]	valid_0's l1: 0.0317199
Did not meet early stopping. Best iteration is:
[392]	valid_0's l1: 0.0317139
result:
12.366641590450568
12.771674437698572


In [3]:
np.savetxt('data/lgb_test.csv', lgb_test, delimiter=',')
np.savetxt('data/y_test.csv', y_test, delimiter=',')

## 기상데이터 제거 lightGBM

In [13]:
col = ['date','day','dayofweek','holiday','min_temp','month','power_d',
       'power_d1','power_d10','power_d11','power_d12','power_d13','power_d14',
       'power_d2','power_d3','power_d4','power_d5','power_d6','power_d7',
       'power_d8','power_d9','week','weekend','year']
X_not = X[col]
X_not['date'] = X_not['date'].astype(float)

x_train, x_valtest, y_train, y_valtest = train_test_split(X_not, Y, test_size = 0.2, shuffle=False, random_state=seed)
x_val, x_test, y_val, y_test = train_test_split(x_valtest, y_valtest, test_size = 0.5, shuffle=False, random_state=seed)
print(x_train.shape, y_train.shape, x_val.shape, y_val.shape, x_test.shape, y_test.shape)

(83271, 24) (83271,) (10409, 24) (10409,) (10409, 24) (10409,)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [14]:
train_lgb = lgb.Dataset(x_train, label = y_train)
val_lgb = lgb.Dataset(x_val, label = y_val)
params = {'random_seed':seed, 'bagging_seed':seed,
          'feature_fraction_seed':seed, 'data_random_seed':seed,
          'drop_seed':seed,
          'num_iterations':400,
          'boosting_type':'gbdt', 'objective':'regression_l1',
          'learning_rate':0.05, 'num_leaves':100, 'max_depth':-1,
          'bagging_fraction':0.1, 'feature_fraction':0.8,
          'lambda_l1':0.0, 'lambda_l2':15.0, 'max_bin':300}
model = lgb.train(params, train_lgb, valid_sets = val_lgb,
                  num_boost_round = 2000, early_stopping_rounds = 100,
                  verbose_eval = 200)

lgb_test, lgb_valid = model.predict(x_test), model.predict(x_val)

def smape(array_1, array_2):
    score = 100*np.mean(2*abs(array_1 - array_2)/(abs(array_1)+abs(array_2)))
    return score

# predict과 test set의 smape 계산
lgb_test_smape, lgb_valid_smape = smape(lgb_test, y_test), smape(lgb_valid, y_val)
print('result:')
print(lgb_valid_smape)
print(lgb_test_smape)

Found `num_iterations` in params. Will use it instead of argument


Training until validation scores don't improve for 100 rounds
[200]	valid_0's l1: 0.0316913
Early stopping, best iteration is:
[275]	valid_0's l1: 0.0316692
result:
12.35703830075168
12.890580864703786
