# 1. Data & Library

In [37]:
!pip install bayesian-optimization
!pip install chart-studio

# 연산
import numpy as np
import pandas as pd

# 기타
import os
import glob
import warnings
warnings.filterwarnings("ignore")

# 계층적 샘플링
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.multiclass import type_of_target
from sklearn.model_selection import StratifiedKFold

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import chart_studio.plotly as py
import cufflinks as cf
cf.go_offline(connected=True)
import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import iplot, init_notebook_mode
plotly.io.renderers.default = 'colab' # colab에서 plotly가 돌아가기 위해

# 모델링
from lightgbm import LGBMRegressor
from bayes_opt import BayesianOptimization
import lightgbm as lgb

# Fix the random seed so seeded components give repeatable results
random_seed = 42



In [8]:
# Read the restructured train / test tables from Google Drive and report their sizes.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/DACON/태양광/재구조화/train_재구조화',
        '/content/drive/MyDrive/DACON/태양광/재구조화/test_재구조화',
    )
)
print(f'train shape: {train_df.shape} \ntest shape:  {test_df.shape}')

train shape: (52176, 192) 
test shape:  (3888, 190)


# 2. Stratified Sampling 

* submission 제출 시에는 TARGET 1,2에 대해서 모두 계층적 샘플링을 진행
* 중복되는 부분을 제외하기 위해 편의상 TARGET 1에 대해서만 코드가 존재

## TARGET 1

In [28]:
# Re-assigns the seed to the same value used in the import cell, so this section
# can be run without re-executing the first cell.
random_seed = 42

In [26]:
# Bin the TARGET value so it can serve as the label for stratified sampling
def target_scaler(x):
  """Map a target value to an integer bin index.

  A value in the half-open interval (i, i+1] maps to bin ``i`` and exactly 0
  maps to bin 0, i.e. the result is ``ceil(x) - 1`` for positive ``x``.

  The previous implementation scanned ``range(200)`` and silently returned
  None for any value above 200; the closed form below gives identical bins
  for 0 <= x <= 200 and keeps working beyond that limit.

  Parameters
  ----------
  x : float
      Target value to bin.

  Returns
  -------
  int or None
      The bin index, or None for negative / NaN / infinite input (those
      inputs also produced None before).
  """
  if x == 0:
    return 0
  # Negative, NaN and +/-inf values have no bin.
  if not (np.isfinite(x) and x > 0):
    return None
  return int(np.ceil(x)) - 1

# Integer bin of TARGET_day7, used only as the stratification label.
train_df['TARGET_day7_cata']=train_df['TARGET_day7'].apply(target_scaler)
# train_df['TARGET_day7_cata'].unique()
# array([ 0,  7, 15, 23, 30, 36, 41, 45, 48, 49, 47, 43, 39, 33, 26, 19, 10,
#        3,  8, 12, 17, 20, 28, 29, 44, 11,  4, 16, 25, 37, 38,  1,  5,  6,
#       14, 40, 32,  9,  2, 27, 22, 35, 34, 18, 13, 24, 31, 50, 51, 46, 21,
#       52, 42, 53, 55, 56, 54, 57, 59, 58, 60, 61, 62, 64, 63, 67, 68, 66,
#       65, 69, 71, 70, 72, 75, 76, 74, 77, 78, 73, 80, 79, 82, 83, 81, 85,
#       86, 84, 90, 91, 88, 89, 87, 92, 93, 94, 95, 96, 97, 98, 99])

label_encoder = LabelEncoder()
train_df['TARGET_day7_cata'] = label_encoder.fit_transform(train_df['TARGET_day7_cata'])
print(type_of_target(train_df.TARGET_day7_cata)) # check the target type

# Split the 3 years of data into three frames for the per-year ensemble.
# 48*365 rows are taken per year (presumably 48 half-hourly rows per day -- confirm).
# .copy() makes each year an independent frame, so dropping columns from it later
# never touches train_df.
train_df1=train_df.iloc[:48*365,:].copy()
train_df2=train_df.iloc[48*365:365*48*2,:].copy()
train_df3=train_df.iloc[365*48*2:,:].copy()

# random_state is intentionally not passed: with shuffle=False it has no effect,
# and scikit-learn >= 0.24 raises a ValueError for that combination.
stf=StratifiedKFold(n_splits=5, shuffle=False)

# Only the last fold is used. Ideally every fold would be used, but k-fold with the
# quantile loss as metric could not be implemented together with Bayesian Optimization
# and LightGBM. After each loop the index variables hold the last fold's split.
for train_index1,valid_index1 in stf.split(train_df1, train_df1.TARGET_day7_cata):
  print(train_index1, valid_index1)

for train_index2,valid_index2 in stf.split(train_df2, train_df2.TARGET_day7_cata):
  print(train_index2, valid_index2)

for train_index3,valid_index3 in stf.split(train_df3, train_df3.TARGET_day7_cata):
  print(train_index3, valid_index3)

multiclass
[ 1997  2084  2089 ... 17517 17518 17519] [   0    1    2 ... 7077 7514 7558]
[    0     1     2 ... 17517 17518 17519] [1997 2084 2089 ... 8707 8769 9055]
[    0     1     2 ... 17517 17518 17519] [ 5402  5403  5680 ... 12353 12366 12416]
[    0     1     2 ... 17517 17518 17519] [ 7272  7752  7798 ... 15719 15766 15910]
[    0     1     2 ... 15719 15766 15910] [ 7800  7801  7945 ... 17517 17518 17519]
[ 2235  2322  2465 ... 17517 17518 17519] [   0    1    2 ... 6887 6933 6981]
[    0     1     2 ... 17517 17518 17519] [2235 2322 2465 ... 8844 8892 8940]
[    0     1     2 ... 17517 17518 17519] [ 4493  4877  5068 ... 12687 12943 12983]
[    0     1     2 ... 17517 17518 17519] [ 7422  7607  7656 ... 15957 16006 16153]
[    0     1     2 ... 15957 16006 16153] [ 7608  7609  7657 ... 17517 17518 17519]
[ 1611  1612  1702 ... 17133 17134 17135] [   0    1    2 ... 6411 6552 7367]
[    0     1     2 ... 17133 17134 17135] [1611 1612 1702 ... 8817 8867 9036]
[    0     1     

In [27]:
def _split_fold(frame, train_idx, valid_idx,
                target='TARGET_day7', drop_cols=('TARGET_day7', 'TARGET_day8')):
  """Slice one year's frame into (X_train, Y_train, X_valid, Y_valid).

  Rows are selected by position, so the result does not depend on the frame's
  index labels. The positions are sorted first so that rows keep the frame's
  original order, as the previous ``index.isin`` selection did.

  Parameters
  ----------
  frame : pd.DataFrame
      One year of training data, still containing the target columns.
  train_idx, valid_idx : array-like of int
      Positional row indices of the training / validation fold.
  target : str
      Column used as the label.
  drop_cols : iterable of str
      Columns removed from the feature matrices.
  """
  feature_drop = list(drop_cols)
  train_part = frame.iloc[np.sort(train_idx)]
  valid_part = frame.iloc[np.sort(valid_idx)]
  return (train_part.drop(columns=feature_drop), train_part[target],
          valid_part.drop(columns=feature_drop), valid_part[target])

# Drop the stratification helper column and give every year a 0-based index.
# errors='ignore' keeps the cell re-runnable (a plain `del` raises KeyError on the second run).
train_df1 = train_df1.drop(columns=['TARGET_day7_cata'], errors='ignore').reset_index(drop=True)
train_df2 = train_df2.drop(columns=['TARGET_day7_cata'], errors='ignore').reset_index(drop=True)
train_df3 = train_df3.drop(columns=['TARGET_day7_cata'], errors='ignore').reset_index(drop=True)

X_train_1, Y_train_1, X_valid_1, Y_valid_1 = _split_fold(train_df1, train_index1, valid_index1)
X_train_2, Y_train_2, X_valid_2, Y_valid_2 = _split_fold(train_df2, train_index2, valid_index2)
X_train_3, Y_train_3, X_valid_3, Y_valid_3 = _split_fold(train_df3, train_index3, valid_index3)

print(X_train_1.shape, Y_train_1.shape, X_valid_1.shape, Y_valid_1.shape)
print(X_train_2.shape, Y_train_2.shape, X_valid_2.shape, Y_valid_2.shape)
print(X_train_3.shape, Y_train_3.shape, X_valid_3.shape, Y_valid_3.shape)

(14016, 190) (14016,) (3504, 190) (3504,)
(14016, 190) (14016,) (3504, 190) (3504,)
(13709, 190) (13709,) (3427, 190) (3427,)


In [34]:
# Summary statistics of the year-1 validation target, for comparison with the training split.
Y_valid_1.describe()

count    3504.000000
mean       17.285182
std        25.290156
min         0.000000
25%         0.000000
50%         0.000000
75%        30.523684
max        97.849989
Name: TARGET_day7, dtype: float64

In [33]:
# Summary statistics of the year-1 training target, for comparison with the validation split.
Y_train_1.describe()

count    14016.000000
mean        17.276665
std         25.265685
min          0.000000
25%          0.000000
50%          0.000000
75%         30.590205
max         97.666652
Name: TARGET_day7, dtype: float64

# 3. Bayesian Optimization & LightGBM

* 제출 시에는 TARGET 1의 3년치 + TARGET 2의 3년치에 대해 모두 Bayesian Optimization & LightGBM 진행
* 중복을 제외하기 위해 TARGET 1의 첫 연도 데이터에 대해서만 코드 존재

## TARGET 1 

#### year 1

In [38]:
quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# Search space for the Bayesian optimiser: (lower, upper) bound per LightGBM parameter.
# NOTE(review): LightGBM documents that bagging is only enabled when bagging_freq is
# non-zero; bagging_freq is not set anywhere here, so tuning bagging_fraction may have
# no effect -- confirm before relying on it.
bounds = {
    'learning_rate': (0.001, 1.0), # learning rate
    'num_leaves': (16, 1024),
    'feature_fraction' : (0.1, 0.9),
    'bagging_fraction' : (0.8, 1),
    'max_depth': (5, 30),
    'min_data_in_leaf': (16, 1024)
}


def LGBM_1(q, X_train, Y_train, X_valid, Y_valid, X_test, params=None):
  """Fit a DART quantile LGBMRegressor and predict X_test.

  Parameters
  ----------
  q : float
      Quantile passed to LightGBM as ``alpha``.
  X_train, Y_train : training features / target.
  X_valid, Y_valid : validation set reported during fitting.
  X_test : features to predict.
  params : dict, optional
      Extra LGBMRegressor keyword arguments. Defaults to the global
      ``opt_params_1`` (the tuned parameters of the current quantile).

  Returns
  -------
  (pd.Series, LGBMRegressor)
      Predictions rounded to 2 decimals, and the fitted model.
  """
  if params is None:
    params = opt_params_1
  # (a) Modeling
  model = LGBMRegressor(objective='quantile', boosting_type='dart', alpha=q,  n_estimators=3000, **params, random_state=random_seed)
  model.fit(X_train, Y_train, eval_metric = ['quantile'],
              eval_set=[(X_valid, Y_valid)], verbose=1000)
  # (b) Predictions
  pred = pd.Series(model.predict(X_test).round(2))
  return pred, model


def train_data(X_train, Y_train, X_valid, Y_valid, X_test, quantile=None, params=None):
  """Train the final model for one quantile on the data that is passed in.

  The earlier version ignored its arguments and always trained on the global
  year-1 frames; the arguments are now honoured.

  Parameters
  ----------
  X_train, Y_train, X_valid, Y_valid, X_test : data forwarded to ``LGBM_1``.
  quantile : float, optional
      Quantile to fit. Defaults to the global loop variable ``q``.
  params : dict, optional
      Forwarded to ``LGBM_1``.

  Returns
  -------
  (list, pd.DataFrame)
      A one-element list with the fitted model, and a one-column frame of
      predictions whose column label is the quantile.
  """
  if quantile is None:
    quantile = q
  pred, model = LGBM_1(quantile, X_train, Y_train, X_valid, Y_valid, X_test, params=params)
  LGBM_models = [model]
  LGBM_actual_pred = pred.to_frame(name=quantile)
  return LGBM_models, LGBM_actual_pred


for q in quantiles:
  # Objective for the optimiser. alpha=q binds the current quantile at definition time,
  # because BayesianOptimization only passes the keys listed in `bounds`.
  def train_model(learning_rate, num_leaves, bagging_fraction, feature_fraction,
                      min_data_in_leaf,max_depth,alpha=q):
    """Train on the year-1 split and return the negated validation quantile loss."""
    params = {'objective':'quantile','alpha':alpha,'boosting_type':'dart'}
    params['learning_rate'] = max(min(learning_rate, 1), 0)
    params["num_leaves"] = int(round(num_leaves))
    params['feature_fraction'] = max(min(feature_fraction, 1), 0)
    params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
    params['max_depth'] = int(round(max_depth))
    params['min_data_in_leaf'] = int(round(min_data_in_leaf))

    trn_data = lgb.Dataset(X_train_1, Y_train_1)
    val_data = lgb.Dataset(X_valid_1, Y_valid_1)
    model = lgb.train(params, trn_data, 500, valid_sets = [trn_data, val_data], verbose_eval=100) # 3000 rounds instead of 500 were used for the actual submission

    # The optimiser maximises, so the loss is negated.
    return -model.best_score['valid_1']['quantile']

  print('#### Predict {}'.format(q))
  optimizer = BayesianOptimization(f=train_model, pbounds=bounds, random_state=random_seed)
  optimizer.maximize(init_points=5, n_iter=5) # n_iter = 35 was used for the actual submission

  print('Best Pinball-loss score:', -optimizer.max['target'])

  # Copy before editing so the optimiser's own record is not mutated,
  # then round the integer-valued parameters.
  opt_params_1 = dict(optimizer.max['params'])
  opt_params_1['max_depth']=int(round(opt_params_1['max_depth']))
  opt_params_1['num_leaves']=int(round(opt_params_1['num_leaves']))
  opt_params_1['min_data_in_leaf']=int(round(opt_params_1['min_data_in_leaf']))

  # Predict the target with the tuned parameters. The year-1 split is passed explicitly:
  # it is the data the parameters were tuned on, and the data the earlier code actually
  # trained on even though year-3 frames were passed (and ignored).
  models_1, results_1 = train_data(X_train_1, Y_train_1, X_valid_1, Y_valid_1, test_df,
                                   quantile=q, params=opt_params_1)
  # NOTE(review): the path below points at year3 -- adjust it to the year being trained before enabling.
  # results_1.to_csv("/content/drive/MyDrive/DACON/태양광/quantiles/dart/year3/year3_day1_results_{}.csv".format(q), index=False)

#### Predict 0.1
|   iter    |  target   | baggin... | featur... | learni... | max_depth | min_da... | num_le... |
-------------------------------------------------------------------------------------------------
[100]	training's quantile: 0.737936	valid_1's quantile: 1.39787
[200]	training's quantile: 0.690109	valid_1's quantile: 1.40373
[300]	training's quantile: 0.650464	valid_1's quantile: 1.3822
[400]	training's quantile: 0.622694	valid_1's quantile: 1.39858
[500]	training's quantile: 0.605644	valid_1's quantile: 1.39147
| [0m 1       [0m | [0m-1.391   [0m | [0m 0.8749  [0m | [0m 0.8606  [0m | [0m 0.7323  [0m | [0m 19.97   [0m | [0m 173.3   [0m | [0m 173.2   [0m |
[100]	training's quantile: 0.54735	valid_1's quantile: 1.42056
[200]	training's quantile: 0.484931	valid_1's quantile: 1.4123
[300]	training's quantile: 0.447579	valid_1's quantile: 1.42349
[400]	training's quantile: 0.412595	valid_1's quantile: 1.42801
[500]	training's quantile: 0.398531	valid_1's quanti

# 4. Submission

## TARGET 1 앙상블

In [40]:
# Per-year prediction CSVs (one file per quantile), listed in sorted filename order.
test_files_1 = glob.glob('/content/drive/MyDrive/DACON/태양광/quantiles/dart/year1/*csv')
test_files_1.sort()
test_files_2 = glob.glob('/content/drive/MyDrive/DACON/태양광/quantiles/dart/year2/*csv')
test_files_2.sort()
test_files_3 = glob.glob('/content/drive/MyDrive/DACON/태양광/quantiles/dart/year3/*csv')
test_files_3.sort()

In [41]:
def _read_side_by_side(paths):
  """Read every CSV in ``paths`` and join them column-wise, in the given order.

  All files are read first and concatenated once, instead of growing a frame
  inside a loop. An empty ``paths`` yields an empty DataFrame, as the loop did.
  """
  frames = [pd.read_csv(path) for path in paths]
  if not frames:
    return pd.DataFrame()
  return pd.concat(frames, axis=1)

# One frame per training year; each column comes from one quantile file.
result_1 = _read_side_by_side(test_files_1)
result_2 = _read_side_by_side(test_files_2)
result_3 = _read_side_by_side(test_files_3)

print(result_1.shape, result_2.shape, result_3.shape)

(3888, 9) (3888, 9) (3888, 9)


In [44]:
# Ensemble: element-wise mean of the three per-year predictions.
# The columns are taken from result_1 rather than a hard-coded quantile list, and
# result_2 / result_3 are indexed by those same columns, so a missing quantile
# still raises a KeyError as the per-column loop did.
day_1_res = (result_1 + result_2[result_1.columns] + result_3[result_1.columns]) / 3

def to_zero(x):
  """Return 0 for a negative value; return any other value unchanged."""
  return 0 if x < 0 else x

# Negative predictions are replaced by 0, one quantile column at a time.
for quantile_col in result_1.columns:
  day_1_res[quantile_col] = day_1_res[quantile_col].clip(lower=0)

display(day_1_res)

Unnamed: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.143333
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.140000
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.150000
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.146667
4,0.0,0.013333,0.0,0.0,0.0,0.0,0.0,0.0,0.150000
...,...,...,...,...,...,...,...,...,...
3883,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.003333
3884,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3885,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3886,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
