# DACON - 태양광 발전량 예측 competition
***

## 1. Package Import

In [343]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import itertools
import time

import statsmodels.api as sm
from statsmodels.regression.quantile_regression import QuantReg as qreg
import statsmodels.formula.api as smf

from tqdm import tnrange, tqdm_notebook

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import os
from lightgbm import LGBMRegressor
import datetime
#from skgarden import RandomForestQuantileRegressor, DecisionTreeQuantileRegressor, ExtraTreeQuantileRegressor, ExtraTreesQuantileRegressor

***

## 2. Load Data

In [344]:
train = pd.read_csv('/Users/kisehyun/DACON/SUN/train/train.csv')

In [345]:
for i in range(81) :
    globals() [f'test_{i}'] = pd.read_csv(f'/Users/kisehyun/DACON/SUN/test/{i}.csv')
    globals() [f'test_{i}']['sep_day'] = i

In [346]:
### 각 데이터를 하나의 데이터로 통합
test = pd.DataFrame()
idx_list = []
for i in range(81):
    test = pd.concat([test, globals()[f'test_{i}']], axis = 0, ignore_index = True)

In [347]:
submission = pd.read_csv('sample_submission.csv')

***

## 3. Make DataSet

- after 1 day

- after 2 days

### Loss Function Definition

In [348]:
def pb_loss(true, pred, q) :
    
    L_list = []
    
    for i in range(len(true)) :
        
        if true.iloc[i] >= pred[i] :
            L = true.iloc[i] - pred[i]
            L *= q
            
        else :
            L1 = pred[i] - true.iloc[i]
            L2 = 1 - q
            L = L1 * L2
            
        L_list.append(L)
        
    return np.mean(L_list)

In [349]:
def make_data() :
    ### 1일 후 데이터
    train_after_1 = train.loc[train.Day <= 1093]
    train_after_1['TARGET1'] = list(train.loc[train.Day >= 1].TARGET)
    
    ### 2일 후 데이터
    train_after_2 = train.loc[train.Day <= 1092]
    train_after_2['TARGET2'] = list(train.loc[train.Day >= 2].TARGET)
    
    ### after 1 day data 
    a_1day = test.loc[test.Day != 6].drop(['Day', 'Minute','sep_day'], axis = 1)
    
    ### after 2 day data 
    a_2day = test.query('Day not in [5, 6]').drop(['Day', 'Minute','sep_day'], axis = 1)
    
    ### after 1 day train data
    X_1 = train_after_1.drop(['Day','Minute', 'TARGET1'], axis = 1)
    y_1 = train_after_1.TARGET1
    
    ### after 2 days train data
    X_2 = train_after_2.drop(['Day','Minute', 'TARGET2'], axis = 1)
    y_2 = train_after_2.TARGET2
    
    ### actual values
    real_1 = test.loc[test.Day != 0].TARGET
    real_2 = test.query('Day not in [0, 1]').TARGET
    
    target = test.loc[test.Day == 6].drop(['Day', 'sep_day', 'Minute'], axis = 1)
    
    return a_1day, a_2day, X_1, y_1, X_2, y_2, real_1, real_2, target

In [350]:
a_1day, a_2day, X_1, y_1, X_2, y_2, real_1, real_2, target = make_data()

In [351]:
t_loss = []
qt_list = [.09, .19, .28, .37, .48, .59, .702, .8, .9]

for qt in range(9) :
    print(f'###### {qt + 1}번째 학습 및 검증 시작 ######')
    lgbm = LGBMRegressor(random_state = 519, objective = 'quantile', n_estimators = 10000, learning_rate=0.027, metric = 'quantile', alpha = qt_list[qt], max_depth = 4)
    lgbm.fit(X_1, y_1, eval_set = [(a_1day, real_1)], early_stopping_rounds = 1000, verbose = 0)
    best_loss = float(str(lgbm.best_score_['valid_0']).split(',')[-1][1:-3])
    print(f'best loss는 {best_loss}\n')
    t_loss.append(best_loss)
    lgbm_pred = [0 if x < 0 else x for x in lgbm.predict(target)]
    submission.loc[submission.id.str.contains('y7'), submission.columns[1+ qt]] = lgbm_pred
lgbm_submission = submission.copy()
print(f'전체 loss 평균은 {round(np.mean(t_loss), 5)}')
print(f'전체 loss 편차는 {round(np.std(t_loss), 5)}')

###### 1번째 학습 및 검증 시작 ######
best loss는 1.296308308542097

###### 2번째 학습 및 검증 시작 ######
best loss는 2.225359462801439

###### 3번째 학습 및 검증 시작 ######
best loss는 2.683056715429791

###### 4번째 학습 및 검증 시작 ######
best loss는 2.866210056182411

###### 5번째 학습 및 검증 시작 ######
best loss는 2.818455207478417

###### 6번째 학습 및 검증 시작 ######
best loss는 2.5309774991396257

###### 7번째 학습 및 검증 시작 ######
best loss는 2.0737625275132427

###### 8번째 학습 및 검증 시작 ######
best loss는 1.5964557435597864

###### 9번째 학습 및 검증 시작 ######
best loss는 1.0507466872998945

전체 loss 평균은 2.12681
전체 loss 편차는 0.6359


In [352]:
t_loss = []
qt_list = [.09, .19, .28, .37, .48, .59, .702, .8, .9]

for qt in range(9) :
    print(f'###### {qt + 1}번째 학습 및 검증 시작 ######')
    lgbm = LGBMRegressor(random_state = 519, objective = 'quantile', n_estimators = 10000, learning_rate=0.027, metric = 'quantile', alpha = qt_list[qt], max_depth = 4)
    lgbm.fit(X_2, y_2, eval_set = [(a_2day, real_2)], early_stopping_rounds = 1000, verbose = 0)
    best_loss = float(str(lgbm.best_score_['valid_0']).split(',')[-1][1:-3])
    print(f'best loss는 {best_loss}\n')
    t_loss.append(best_loss)
    lgbm_pred = [0 if x < 0 else x for x in lgbm.predict(target)]
    lgbm_submission.loc[lgbm_submission.id.str.contains('y8'), lgbm_submission.columns[1+ qt]] = lgbm_pred
lgbm_submission = submission.copy()
print(f'전체 loss 평균은 {round(np.mean(t_loss), 5)}')
print(f'전체 loss 편차는 {round(np.std(t_loss), 5)}')

###### 1번째 학습 및 검증 시작 ######
best loss는 1.3520693834328736

###### 2번째 학습 및 검증 시작 ######
best loss는 2.3644907997506266

###### 3번째 학습 및 검증 시작 ######
best loss는 2.8645001845634166

###### 4번째 학습 및 검증 시작 ######
best loss는 3.0593901696180907

###### 5번째 학습 및 검증 시작 ######
best loss는 2.979032697627367

###### 6번째 학습 및 검증 시작 ######
best loss는 2.6709871602913022

###### 7번째 학습 및 검증 시작 ######
best loss는 2.197453213031728

###### 8번째 학습 및 검증 시작 ######
best loss는 1.6758170882178407

###### 9번째 학습 및 검증 시작 ######
best loss는 1.1069548469318813

전체 loss 평균은 2.2523
전체 loss 편차는 0.68311


In [353]:
lgbm_submission.describe()

Unnamed: 0,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
count,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0
mean,3.473755,5.949389,7.533428,8.68809,9.690834,10.398359,10.96634,11.41554,11.93241
std,8.136573,13.178814,16.282242,18.600089,20.550173,21.826507,22.917675,23.703502,24.477242
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.119105,0.492141,0.793375,0.996254,1.271076,1.701379,1.708906,1.80771,2.587139
max,49.758318,70.731638,78.711214,86.316575,92.574597,91.845909,97.524431,96.147168,96.965336


In [354]:
lgbm_submission.to_csv('lgbm_baseline.csv', index = False)

### Make Features

In [355]:
tr_times = []
for day in train.Day.unique() :
    d = train.loc[train.Day == day]
    try :
        start = d.loc[d.DNI > 0].index[0]
        end = d.loc[d.DNI > 0].index[-1]
        gap = (end - start) / 2
    except :
        gap = 0

    tr_times.append(gap)

In [356]:
test['sep'] = test.index // 48 # test 데이터의 경우 시계열 고려가 안되었기 때문에 48(하루 시간)으로 나눈 몫으로 day 구분자 생성


te_times = []
for day in test.sep.unique() :
    d = test.loc[test.sep == day]
    try :
        start = d.loc[d.DNI > 0].index[0]
        end = d.loc[d.DNI > 0].index[-1]
        gap = (end - start) / 2
    except :
        gap = 0

    te_times.append(gap)

In [357]:
train_sun_time = pd.DataFrame({'Day' : train.Day.unique(), 'sun_time' : tr_times})
test_sun_time = pd.DataFrame({'sep' : test.sep.unique(), 'sun_time' : te_times})

In [358]:
train = pd.merge(train, train_sun_time, how = 'left', on = 'Day')
test = pd.merge(test, test_sun_time, how = 'left', on = 'sep').drop('sep', axis = 1)

In [359]:
a_1day, a_2day, X_1, y_1, X_2, y_2, real_1, real_2, target = make_data()

In [373]:
t_loss = []
qt_list = [.09, .19, .28, .37, .48, .59, .702, .8, .9]

for qt in range(9) :
    print(f'##### {qt + 1}번째 학습 및 검증 시작 #####')
    lgbm = LGBMRegressor(random_state = 519, objective = 'quantile', n_estimators = 10000, learning_rate=0.027, metric = 'quantile', alpha = qt_list[qt], max_depth = 4)
    lgbm.fit(X_1, y_1, eval_set = [(a_1day, real_1)], early_stopping_rounds = 1000, verbose = 0)
    best_loss = float(str(lgbm.best_score_['valid_0']).split(',')[-1][1:-3])
    print(f'best loss는 {best_loss}\n')
    t_loss.append(best_loss)
    lgbm_pred = [0 if x < 0 else x for x in lgbm.predict(target)]
    submission.loc[submission.id.str.contains('y7'), submission.columns[1+ qt]] = lgbm_pred
lgbm_submission = submission.copy()

print(f'전체 loss 평균은 {round(np.mean(t_loss), 5)}')
print(f'전체 loss 편차는 {round(np.std(t_loss), 5)}')

##### 1번째 학습 및 검증 시작 #####
best loss는 1.3137526113140525

##### 2번째 학습 및 검증 시작 #####
best loss는 2.245721204159991

##### 3번째 학습 및 검증 시작 #####
best loss는 2.717155305173473

##### 4번째 학습 및 검증 시작 #####
best loss는 2.903114023251322

##### 5번째 학습 및 검증 시작 #####
best loss는 2.816610818867936

##### 6번째 학습 및 검증 시작 #####
best loss는 2.479443807925773

##### 7번째 학습 및 검증 시작 #####
best loss는 2.0164447892175286

##### 8번째 학습 및 검증 시작 #####
best loss는 1.5403299774319916

##### 9번째 학습 및 검증 시작 #####
best loss는 1.00165617252876

전체 loss 평균은 2.11491
전체 loss 편차는 0.65382


In [374]:
t_loss = []
qt_list = [.09, .19, .28, .37, .48, .59, .702, .8, .9]

for qt in range(9) :
    print(f'##### {qt + 1}번째 학습 및 검증 시작 #####')
    lgbm = LGBMRegressor(random_state = 519, objective = 'quantile',learning_rate=0.027,  n_estimators = 1000, metric = 'quantile', alpha = qt_list[qt], max_depth = 4)
    lgbm.fit(X_2, y_2, eval_set = [(a_2day, real_2)], early_stopping_rounds = 1000, verbose = 0)
    best_loss = float(str(lgbm.best_score_['valid_0']).split(',')[-1][1:-3])
    print(f'best loss는 {best_loss}\n')
    t_loss.append(best_loss)
    lgbm_pred = [0 if x < 0 else x for x in lgbm.predict(target)]
    lgbm_submission.loc[lgbm_submission.id.str.contains('y8'), lgbm_submission.columns[1+ qt]] = lgbm_pred
lgbm_submission = submission.copy()
print(f'전체 loss 평균은 {round(np.mean(t_loss), 5)}')
print(f'전체 loss 편차는 {round(np.std(t_loss), 5)}')

##### 1번째 학습 및 검증 시작 #####
best loss는 1.3567499646938965

##### 2번째 학습 및 검증 시작 #####
best loss는 2.368558711269472

##### 3번째 학습 및 검증 시작 #####
best loss는 2.874029998366806

##### 4번째 학습 및 검증 시작 #####
best loss는 3.056172091731876

##### 5번째 학습 및 검증 시작 #####
best loss는 2.9661943659583176

##### 6번째 학습 및 검증 시작 #####
best loss는 2.635459605653439

##### 7번째 학습 및 검증 시작 #####
best loss는 2.169600282775306

##### 8번째 학습 및 검증 시작 #####
best loss는 1.6529535515116358

##### 9번째 학습 및 검증 시작 #####
best loss는 1.0762487822632187

전체 loss 평균은 2.23955
전체 loss 편차는 0.68738


In [375]:
lgbm_submission.describe()

Unnamed: 0,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
count,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0
mean,6.740263,11.813034,14.931732,17.694354,19.799049,21.24529,22.285437,23.20255,24.406108
std,9.825923,16.210388,19.997879,23.465053,25.895843,27.647803,28.857821,29.73565,30.621683
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.000204,0.000631,0.001098,0.002068
50%,0.190297,0.618613,1.038507,1.603675,1.927387,1.748206,2.204167,2.74229,3.423611
75%,12.135023,22.055097,28.556017,34.797362,39.027873,42.516595,44.551444,46.273874,49.405559
max,53.652994,74.408896,78.625188,82.808364,93.587002,93.856848,96.478742,97.2194,98.492248


In [376]:
lgbm_submission.to_csv('lgbm_일조시간.csv', index = False)

In [378]:
t_loss = []
qt_list = [.09, .19, .28, .37, .48, .59, .702, .8, .9]

for qt in range(9) :
    print(f'##### {qt + 1}번째 학습 및 검증 시작 #####')
    gbm = GradientBoostingRegressor(random_state = 519, loss = 'quantile', alpha = qt_list[qt], max_depth = 4)
    gbm.fit(X_1, y_1)
    gbm_pred = gbm.predict(a_1day)
    loss = pb_loss(real_1, gbm_pred, qt_list[qt])
    print(f'loss는 {loss}\n')
    t_loss.append(loss)
    gbm_pred = [0 if x < 0 else x for x in gbm.predict(target)]
    submission.loc[submission.id.str.contains('y7'), submission.columns[1+ qt]] = gbm_pred
gbm_submission = submission.copy()
print(f'전체 loss 평균은 {round(np.mean(t_loss), 5)}')
print(f'전체 loss 편차는 {round(np.std(t_loss), 5)}')

##### 1번째 학습 및 검증 시작 #####
loss는 1.3192144224615532

##### 2번째 학습 및 검증 시작 #####
loss는 2.2627977531303918

##### 3번째 학습 및 검증 시작 #####
loss는 2.7367220403726638

##### 4번째 학습 및 검증 시작 #####
loss는 2.9244022751270027

##### 5번째 학습 및 검증 시작 #####
loss는 2.8290201636924626

##### 6번째 학습 및 검증 시작 #####
loss는 2.5254748252349866

##### 7번째 학습 및 검증 시작 #####
loss는 2.061312731232972

##### 8번째 학습 및 검증 시작 #####
loss는 1.5824196567983055

##### 9번째 학습 및 검증 시작 #####
loss는 1.041502460192829

전체 loss 평균은 2.14254
전체 loss 편차는 0.6504


In [372]:
t_loss = []
qt_list = [.09, .19, .28, .37, .48, .59, .702, .8, .9]

for qt in range(9) :
    print(f'##### {qt + 1}번째 학습 및 검증 시작 #####')
    gbm = GradientBoostingRegressor(random_state = 519, loss = 'quantile', alpha = qt_list[qt], max_depth = 4)
    gbm.fit(X_2, y_2)
    gbm_pred = gbm.predict(a_2day)
    loss = pb_loss(real_2, gbm_pred, qt_list[qt])
    print(f'loss는 {loss}\n')
    t_loss.append(loss)
    gbm_pred = [0 if x < 0 else x for x in gbm.predict(target)]
    submission.loc[submission.id.str.contains('y8'), submission.columns[1+ qt]] = gbm_pred
gbm_submission = submission.copy()
print(f'전체 loss 평균은 {round(np.mean(t_loss), 5)}')
print(f'전체 loss 편차는 {round(np.std(t_loss), 5)}')

##### 1번째 학습 및 검증 시작 #####
loss는 1.4021321082973661

##### 2번째 학습 및 검증 시작 #####
loss는 2.409493664002174

##### 3번째 학습 및 검증 시작 #####
loss는 2.8875889493232787

##### 4번째 학습 및 검증 시작 #####
loss는 3.064509460817417

##### 5번째 학습 및 검증 시작 #####
loss는 2.996613361336362

##### 6번째 학습 및 검증 시작 #####
loss는 2.665274156233715

##### 7번째 학습 및 검증 시작 #####
loss는 2.1982054347509377

##### 8번째 학습 및 검증 시작 #####
loss는 1.669543044647454

##### 9번째 학습 및 검증 시작 #####
loss는 1.0811805137024992

전체 loss 평균은 2.26384
전체 loss 편차는 0.68704
