# 라이브러리 및 데이터

### 라이브러리

In [5]:
import pandas as pd # 데이터 분석
import numpy as np # 행렬 연산
import random # 시드 제어
from pandas import Series, DataFrame
import sys
import time
import re

import seaborn as sns
import matplotlib.pyplot as plt # 시각화
%matplotlib inline

import datetime
from workalendar.asia import SouthKorea # 한국의 공휴일

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor # 모델링
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor

import shap
import skimage

# 시드
import os
seed = 12345
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
random.seed(seed)

### 데이터셋 처리 및 분류

In [6]:
final_ans = pd.read_csv('section_1week.csv', sep=',')
final_ans_test = pd.read_csv('section_1week_test.csv', sep=',')

In [8]:
final = pd.concat([final_ans, final_ans_test])
final = final.reset_index()
del final['index']

final = final.sort_values(by=['date'], axis=0)

# date의 데이터 형식이 'yyyy-mm-dd'인 string이어서 lightgbm 내부에서 float으로 데이터를 변형시키지
# 못하여 데이터의 형식을 'yyyymmdd'로 변환
date_lst = []
for value in final['date'].values:
    val = re.sub(r"[-]","",value)
    date_lst.append(val)
final['date'] = date_lst
final

Unnamed: 0,date,avg_temp,min_temp,max_temp,rain_d,wind_d,wet_d,snow_d,year,month,...,holiday,power_d,power_d7,power_d6,power_d5,power_d4,power_d3,power_d2,power_d1,power
0,2017-09-05,22.5,21.4,23.7,0.0,2.016667,63.500000,0.0,2017,9,...,0.0,0.133349,0.117071,0.116292,0.127583,0.124333,0.131833,0.138917,0.177417,0.181292
1,2017-09-06,20.8,18.7,23.1,1.9,1.533333,82.458333,0.0,2017,9,...,0.0,0.142524,0.116292,0.127583,0.124333,0.131833,0.138917,0.177417,0.181292,0.170667
2,2017-09-07,22.2,18.4,25.8,0.0,2.095833,80.083333,0.0,2017,9,...,0.0,0.150292,0.127583,0.124333,0.131833,0.138917,0.177417,0.181292,0.170667,0.166458
3,2017-09-08,23.6,21.2,27.4,0.0,1.925000,77.833333,0.0,2017,9,...,0.0,0.155845,0.124333,0.131833,0.138917,0.177417,0.181292,0.170667,0.166458,0.146000
4,2017-09-09,23.4,20.7,27.1,0.0,2.004167,80.958333,0.0,2017,9,...,0.0,0.158940,0.131833,0.138917,0.177417,0.181292,0.170667,0.166458,0.146000,0.250613
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106576,2018-06-26,21.3,19.4,24.0,5.8,3.820833,90.166667,0.0,2018,6,...,0.0,0.386967,0.393125,0.342542,0.340542,0.339744,0.482649,0.428500,0.381667,0.370125
106577,2018-06-27,20.5,18.5,23.0,0.2,2.583333,93.625000,0.0,2018,6,...,0.0,0.383681,0.342542,0.340542,0.339744,0.482649,0.428500,0.381667,0.370125,0.307042
106578,2018-06-28,22.3,20.0,25.1,2.4,2.250000,93.583333,0.0,2018,6,...,0.0,0.378610,0.340542,0.339744,0.482649,0.428500,0.381667,0.370125,0.307042,0.365125
106579,2018-06-29,22.3,20.0,26.0,0.4,1.879167,90.208333,0.0,2018,6,...,0.0,0.382122,0.339744,0.482649,0.428500,0.381667,0.370125,0.307042,0.365125,0.269702


In [11]:
X = final[final_ans.columns.difference(['power'])]
Y = final['power']

X['date'] = X['date'].astype(float)

x_train, x_valtest, y_train, y_valtest = train_test_split(X, Y, test_size = 0.2, shuffle=False, random_state=seed)
x_val, x_test, y_val, y_test = train_test_split(x_valtest, y_valtest, test_size = 0.5, shuffle=False, random_state=seed)
print(x_train.shape, y_train.shape, x_val.shape, y_val.shape, x_test.shape, y_test.shape)

(85264, 23) (85264,) (10658, 23) (10658,) (10659, 23) (10659,)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [12]:
print(x_train.dtypes)
print(x_val.dtypes)
print(x_test.dtypes)

avg_temp     float64
date         float64
day            int64
dayofweek      int64
holiday      float64
max_temp     float64
min_temp     float64
month          int64
power_d      float64
power_d1     float64
power_d2     float64
power_d3     float64
power_d4     float64
power_d5     float64
power_d6     float64
power_d7     float64
rain_d       float64
snow_d       float64
week           int64
weekend        int64
wet_d        float64
wind_d       float64
year           int64
dtype: object
avg_temp     float64
date         float64
day            int64
dayofweek      int64
holiday      float64
max_temp     float64
min_temp     float64
month          int64
power_d      float64
power_d1     float64
power_d2     float64
power_d3     float64
power_d4     float64
power_d5     float64
power_d6     float64
power_d7     float64
rain_d       float64
snow_d       float64
week           int64
weekend        int64
wet_d        float64
wind_d       float64
year           int64
dtype: object
avg_te

# lightGBM

In [13]:
train_lgb = lgb.Dataset(x_train, label = y_train)
val_lgb = lgb.Dataset(x_val, label = y_val)
params = {'random_seed':seed, 'bagging_seed':seed,
          'feature_fraction_seed':seed, 'data_random_seed':seed,
          'drop_seed':seed,
          
          'boosting_type':'gbdt', 'objective':'huber',
          'learning_rate':0.001, 'num_leaves':63, 'max_depth':-1,
          'bagging_fraction':0.1, 'feature_fraction':0.8,
          'lambda_l1':10.0, 'lambda_l2':30.0, 'max_bin':255}
model = lgb.train(params, train_lgb, valid_sets = val_lgb,
                  num_boost_round = 2000, early_stopping_rounds = 100,
                  verbose_eval = 200)

Training until validation scores don't improve for 100 rounds
[200]	valid_0's huber: 0.00410296
[400]	valid_0's huber: 0.00316424
[600]	valid_0's huber: 0.00252605
[800]	valid_0's huber: 0.00209568
[1000]	valid_0's huber: 0.00180797
[1200]	valid_0's huber: 0.00160666
[1400]	valid_0's huber: 0.001467
[1600]	valid_0's huber: 0.00136919
[1800]	valid_0's huber: 0.00130067
[2000]	valid_0's huber: 0.00125366
Did not meet early stopping. Best iteration is:
[2000]	valid_0's huber: 0.00125366


In [14]:
lgb_test, lgb_valid = model.predict(x_test), model.predict(x_val)

In [15]:
def smape(array_1, array_2):
    score = 100*np.mean(2*abs(array_1 - array_2)/(abs(array_1)+abs(array_2)))
    return score

In [16]:
# predict과 test set의 smape 계산
lgb_test_smape, lgb_valid_smape = smape(lgb_test, y_test), smape(lgb_valid, y_val)

print(lgb_test_smape)
print(lgb_valid_smape)

15.829889535604028
15.762092349795312


In [None]:
np.savetxt('data/lgb_test.csv', lgb_test, delimiter=',')
np.savetxt('data/y_test.csv', y_test, delimiter=',')

# lightGBM 결과 분석

In [None]:
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(x_test)

In [None]:
shap.summary_plot(shap_values, x_test)

In [None]:
shap.summary_plot(shap_values, x_test, plot_type="bar")

In [None]:
shap.dependence_plot('power_d1', shap_values, x_test)

# Decision tree

In [None]:
tree = DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=5,
                             max_features=len(final_ans.columns.difference(['power'])),
                             max_leaf_nodes=None, min_impurity_decrease=0.0, 
                             min_impurity_split=None, min_samples_leaf=1,
                             min_samples_split=2, min_weight_fraction_leaf=0.0,
                             presort='deprecated', random_state=seed, splitter='best')

In [None]:
start = time.time()
tree_fit = tree.fit(x_train, y_train)
predictions = tree_fit.predict(x_test)
validations = tree_fit.predict(x_val)
test_score = smape(predictions, y_test)
val_score = smape(validations, y_val)
print(test_score)
print(val_score)
print("Time: %.2f" % (time.time() - start)) # 코드 실행 시간 계산

# Random forest

In [None]:
rand_tree = RandomForestRegressor(n_estimators=50, criterion='mse', max_depth=None,
                                  min_samples_split=2, min_samples_leaf=1,
                                  min_weight_fraction_leaf=0.0, max_features=len(final_ans.columns.difference(['power'])),
                                  max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None,
                                  bootstrap=True, oob_score=False, n_jobs=None, random_state=seed,
                                  verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None)

In [None]:
start = time.time()
rand_tree_fit = rand_tree.fit(x_train, y_train)
predictions = rand_tree_fit.predict(x_test)
validations = rand_tree_fit.predict(x_val)
test_score = smape(predictions, y_test)
val_score = smape(validations, y_val)
print(test_score)
print(val_score)
print("Time: %.2f" % (time.time() - start)) # 코드 실행 시간 계산

# Bagging

In [None]:
bagging_tree = BaggingRegressor(base_estimator=tree, n_estimators=30, *,
                                max_samples=1.0, max_features=1.0, bootstrap=True,
                                bootstrap_features=False, oob_score=False,
                                warm_start=False, n_jobs=None, random_state=seed,
                                verbose=0)

In [None]:
start = time.time()
bagging_tree_fit = bagging_tree.fit(x_train, y_train)
predictions = bagging_tree_fit.predict(x_test)
validations = bagging_tree_fit.predict(x_val)
test_score = smape(predictions, y_test)
val_score = smape(validations, y_val)
print(test_score)
print(val_score)
print("Time: %.2f" % (time.time() - start)) # 코드 실행 시간 계산

# Extra tree

In [None]:
extra_tree = ExtraTreeRegressor(criterion='mse', splitter='random', max_depth=None,
                                min_samples_split=2, min_samples_leaf=1,
                                min_weight_fraction_leaf=0.0, max_features=len(feature_names),
                                random_state=seed, min_impurity_decrease=0.0, min_impurity_split=None,
                                max_leaf_nodes=None, ccp_alpha=0.0)

In [None]:
start = time.time()
extra_tree_fit = extra_tree.fit(x_train, y_train)
predictions = extra_tree_fit.predict(x_test)
validations = extra_tree_fit.predict(x_val)
test_score = smape(predictions, y_test)
val_score = smape(validations, y_val)
print(test_score)
print(val_score)
print("Time: %.2f" % (time.time() - start)) # 코드 실행 시간 계산

# Adaboost

In [None]:
adabst = AdaBoostRegressor(base_estimator=None, n_estimators=50,
                           learning_rate=1.0, loss='linear', random_state=None)

In [None]:
start = time.time()
adabst_fit = adabst.fit(x_train, y_train)
predictions = adabst_fit.predict(x_test)
validations = adabst_fit.predict(x_val)
test_score = smape(predictions, y_test)
val_score = smape(validations, y_val)
print(test_score)
print(val_score)
print("Time: %.2f" % (time.time() - start)) # 코드 실행 시간 계산

# Xgboost

In [None]:
start = time.time()

train_xgb = xgb.DMatrix(data = x_train, label = y_train)
val_xgb = xgb.DMatirx(data = x_val, label = y_val)
test_xgb = xgb.DMatrix(data = x_test, label = y_test)

xgb_parm = {'max_depth': 10, # 트리 깊이
           'learning_rate': 0.01, # Step Size
           'n_estimators': 100, # Number of trees, 트리 생성 개수
           'objective': 'reg:squarederror', # 목적 함수
           'num_class': len(set(y_train)) + 1}
# 파라미터 추가, Label must be in [0, num_class] -> num_class보다 1 커야한다.

xgb_model = xgb.train(params = xgb_param, dtrain = train_xgb) # 학습 진행

xgb_model_val_predict = xgb_model.predict(val_xgb) # 검증 데이터 예측
xgb_model_predict = xgb_model.predict(test_xgb) # 평가 데이터 예측

xgb_test_smape, xgb_valid_smape = smape(xgb_model_predict, y_test), smape(xgb_model_val_predict, y_val)

print(xgb_test_smape)
print(xgb_valid_smape)
print("Time: %.2f" % (time.time() - start)) # 코드 실행 시간 계산

# Catboost

In [None]:
start = time.time()
train_cb = cb.Pool(data = x_train, label = y_train)
val_cb = cb.Pool(data = x_val, label = y_val)
test_cb = cb.Pool(data = x_test, label = y_test)

cb_parm = {'max_depth': 10, # 트리 깊이
           'learning_rate': 0.01, # Step Size
           'n_estimators': 100, # Number of trees, 트리 생성 개수
           'eval_metric': 'SMAPE', # 목적 함수
           'loss_function': 'RMSE'}
# 파라미터 추가, Label must be in [0, num_class] -> num_class보다 1 커야한다.

cb_model = cb.train(params = cb_param, pool = train_cb) # 학습 진행

cb_model_val_predict = cb_model.predict(val_cb) # 검증 데이터 예측
cb_model_predict = cb_model.predict(test_cb) # 평가 데이터 예측

cb_test_smape, cb_valid_smape = smape(cb_model_predict, y_test), smape(cb_model_val_predict, y_val)

print(cb_test_smape)
print(cb_valid_smape)
print("Time: %.2f" % (time.time() - start)) # 코드 실행 시간 계산