# DACON - 태양광 발전량 예측 competition
***

## 1. Package Import

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import GradientBoostingRegressor
import os
from lightgbm import LGBMRegressor
import datetime

***

## 2. Load Data

In [349]:
train = pd.read_csv('/Users/kisehyun/DACON/SUN/train/train.csv')

for i in range(81) :
    globals() [f'test_{i}'] = pd.read_csv(f'/Users/kisehyun/DACON/SUN/test/{i}.csv')
    globals() [f'test_{i}']['sep_day'] = i
    
### 각 데이터를 하나의 데이터로 통합
test = pd.DataFrame()
idx_list = []
for i in range(81):
    test = pd.concat([test, globals()[f'test_{i}']], axis = 0, ignore_index = True)
    
submission = pd.read_csv('sample_submission.csv')

***

## 3. Setting


### Loss Function Definition


In [1]:
def pb_loss(true, pred, q) :
    
    L_list = []
    
    for i in range(len(true)) :
        
        if true.iloc[i] >= pred[i] :
            L = true.iloc[i] - pred[i]
            L *= q
            
        else :
            L1 = pred[i] - true.iloc[i]
            L2 = 1 - q
            L = L1 * L2
            
        L_list.append(L)
        
    return np.mean(L_list)

### custom definition to make dataset
- after 1 day

- after 2 days

In [401]:
def make_data() :
    ### 1일 후 데이터
    train_after_1 = train.loc[train.Day <= 1093]
    train_after_1['TARGET1'] = list(train.loc[train.Day >= 1].TARGET)
    
    ### 2일 후 데이터
    train_after_2 = train.loc[train.Day <= 1092]
    train_after_2['TARGET2'] = list(train.loc[train.Day >= 2].TARGET)
    
    ### after 1 day data 
    a_1day = test.loc[test.Day != 6].drop(['Day', 'Minute','sep_day','DHI', 'RH'], axis = 1)
    
    ### after 2 day data 
    a_2day = test.query('Day not in [5, 6]').drop(['Day', 'Minute','sep_day','DHI', 'RH'], axis = 1)
    
    ### after 1 day train data
    X_1 = train_after_1.drop(['Day','Minute', 'TARGET1','DHI', 'RH'], axis = 1)
    y_1 = train_after_1.TARGET1
    
    ### after 2 days train data
    X_2 = train_after_2.drop(['Day','Minute', 'TARGET2','DHI', 'RH'], axis = 1)
    y_2 = train_after_2.TARGET2
    
    ### actual values
    real_1 = test.loc[test.Day != 0].TARGET
    real_2 = test.query('Day not in [0, 1]').TARGET
    
    target = test.loc[test.Day == 6].drop(['Day', 'sep_day', 'Minute','DHI', 'RH'], axis = 1)
    
    return a_1day, a_2day, X_1, y_1, X_2, y_2, real_1, real_2, target

In [8]:
a_1day, a_2day, X_1, y_1, X_2, y_2, real_1, real_2, target = make_data()

## 4. Modeling

- 4-1. Baseline Modeling

In [9]:
t_loss = []
qt_list = [.09, .19, .28, .37, .48, .59, .702, .8, .9]

for qt in range(9) :
    print(f'###### {qt + 1}번째 학습 및 검증 시작 ######')
    lgbm = LGBMRegressor(random_state = 519, objective = 'quantile', n_estimators = 10000, learning_rate=0.027, metric = 'quantile', alpha = qt_list[qt], max_depth = 4)
    lgbm.fit(X_1, y_1, eval_set = [(a_1day, real_1)], early_stopping_rounds = 1000, verbose = 0)
    best_loss = float(str(lgbm.best_score_['valid_0']).split(',')[-1][1:-3])
    print(f'best loss는 {best_loss}\n')
    t_loss.append(best_loss)
    lgbm_pred = [0 if x < 0 else x for x in lgbm.predict(target)]
    submission.loc[submission.id.str.contains('y7'), submission.columns[1+ qt]] = lgbm_pred
lgbm_submission = submission.copy()
print(f'전체 loss 평균은 {round(np.mean(t_loss), 5)}')
print(f'전체 loss 편차는 {round(np.std(t_loss), 5)}')

###### 1번째 학습 및 검증 시작 ######
best loss는 1.296308308542097

###### 2번째 학습 및 검증 시작 ######
best loss는 2.225359462801439

###### 3번째 학습 및 검증 시작 ######
best loss는 2.683056715429791

###### 4번째 학습 및 검증 시작 ######
best loss는 2.866210056182411

###### 5번째 학습 및 검증 시작 ######
best loss는 2.818455207478417

###### 6번째 학습 및 검증 시작 ######
best loss는 2.5309774991396257

###### 7번째 학습 및 검증 시작 ######
best loss는 2.0737625275132427

###### 8번째 학습 및 검증 시작 ######
best loss는 1.5964557435597864

###### 9번째 학습 및 검증 시작 ######
best loss는 1.0507466872998945

전체 loss 평균은 2.12681
전체 loss 편차는 0.6359


In [10]:
t_loss = []
qt_list = [.09, .19, .28, .37, .48, .59, .702, .8, .9]

for qt in range(9) :
    print(f'###### {qt + 1}번째 학습 및 검증 시작 ######')
    lgbm = LGBMRegressor(random_state = 519, objective = 'quantile', n_estimators = 10000, learning_rate=0.027, metric = 'quantile', alpha = qt_list[qt], max_depth = 4)
    lgbm.fit(X_2, y_2, eval_set = [(a_2day, real_2)], early_stopping_rounds = 1000, verbose = 0)
    best_loss = float(str(lgbm.best_score_['valid_0']).split(',')[-1][1:-3])
    print(f'best loss는 {best_loss}\n')
    t_loss.append(best_loss)
    lgbm_pred = [0 if x < 0 else x for x in lgbm.predict(target)]
    lgbm_submission.loc[lgbm_submission.id.str.contains('y8'), lgbm_submission.columns[1+ qt]] = lgbm_pred
lgbm_submission = submission.copy()
print(f'전체 loss 평균은 {round(np.mean(t_loss), 5)}')
print(f'전체 loss 편차는 {round(np.std(t_loss), 5)}')

###### 1번째 학습 및 검증 시작 ######
best loss는 1.3520693834328736

###### 2번째 학습 및 검증 시작 ######
best loss는 2.3644907997506266

###### 3번째 학습 및 검증 시작 ######
best loss는 2.8645001845634166

###### 4번째 학습 및 검증 시작 ######
best loss는 3.0593901696180907

###### 5번째 학습 및 검증 시작 ######
best loss는 2.979032697627367

###### 6번째 학습 및 검증 시작 ######
best loss는 2.6709871602913022

###### 7번째 학습 및 검증 시작 ######
best loss는 2.197453213031728

###### 8번째 학습 및 검증 시작 ######
best loss는 1.6758170882178407

###### 9번째 학습 및 검증 시작 ######
best loss는 1.1069548469318813

전체 loss 평균은 2.2523
전체 loss 편차는 0.68311


In [11]:
lgbm_submission.describe()

Unnamed: 0,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
count,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0
mean,3.473755,5.949389,7.533428,8.68809,9.690834,10.398359,10.96634,11.41554,11.93241
std,8.136573,13.178814,16.282242,18.600089,20.550173,21.826507,22.917675,23.703502,24.477242
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.119105,0.492141,0.793375,0.996254,1.271076,1.701379,1.708906,1.80771,2.587139
max,49.758318,70.731638,78.711214,86.316575,92.574597,91.845909,97.524431,96.147168,96.965336


In [None]:
lgbm_submission.to_csv('lgbm_baseline.csv', index = False)

#### it's score was bad. so, i made some features to get better score.

1. 일조시간

In [350]:
tr_times = []
for day in train.Day.unique() :
    d = train.loc[train.Day == day]
    try :
        start = d.loc[d.DHI > 0].index[0]
        end = d.loc[d.DHI > 0].index[-1]
        gap = (end - start) / 2
    except :
        gap = 0

    tr_times.append(gap)

In [351]:
test['sep'] = test.index // 48 # test 데이터의 경우 시계열 고려가 안되었기 때문에 48(하루 시간)으로 나눈 몫으로 day 구분자 생성


te_times = []
for day in test.sep.unique() :
    d = test.loc[test.sep == day]
    try :
        start = d.loc[d.DHI > 0].index[0]
        end = d.loc[d.DHI > 0].index[-1]
        gap = (end - start) / 2
    except :
        gap = 0

    te_times.append(gap)

In [352]:
train_sun_time = pd.DataFrame({'Day' : train.Day.unique(), 'sun_time' : tr_times})
test_sun_time = pd.DataFrame({'sep' : test.sep.unique(), 'sun_time' : te_times})

In [353]:
train = pd.merge(train, train_sun_time, how = 'left', on = 'Day')
test = pd.merge(test, test_sun_time, how = 'left', on = 'sep')#.drop('sep', axis = 1)

2. GHI

In [354]:
def get_d(x) :
    # 평균 기온 기준으로 계절별 평균 남중 고도 적용
    if x < 6.8 :
        return 29.5
    elif 6.8 <= x <= 20 :
        return 53
    else :
        return 76.5

In [355]:
tr_degree = train.groupby('Day')['T'].mean().reset_index(name = 'degree')
tr_degree.index.name = None
tr_degree.head()

Unnamed: 0,Day,degree
0,0,-7.979167
1,1,-6.3125
2,2,-6.479167
3,3,-5.6875
4,4,0.854167


In [356]:
te_degree = test.groupby('sep')['T'].mean().reset_index(name = 'degree')
te_degree.index.name = None
te_degree.head()

Unnamed: 0,sep,degree
0,0,4.652083
1,1,5.45
2,2,3.414583
3,3,0.5125
4,4,-2.258333


In [357]:
tr_degree['degree'] = tr_degree.degree.apply(get_d)
te_degree['degree'] = te_degree.degree.apply(get_d)

In [358]:
train = pd.merge(train, tr_degree, how = 'left', on = 'Day')
test = pd.merge(test, te_degree, how = 'left', on = 'sep')

In [359]:
train['GHI'] = train.DNI * train.degree + train.DHI
test['GHI'] = test.DNI * test.degree + test.DHI

In [360]:
train.drop('degree', axis = 1, inplace = True)
test.drop(['degree', 'sep'], axis = 1, inplace = True)

I tried Log transformation but it was not good.

In [344]:
#train.DHI = np.log1p(train.DHI)
#test.DHI = np.log1p(test.DHI)

In [345]:
#train.DNI = np.log1p(train.DNI)
#test.DNI = np.log1p(test.DNI)

In [346]:
#train.WS = np.log1p(train.WS)
#test.WS = np.log1p(test.WS)

In [406]:
a_1day, a_2day, X_1, y_1, X_2, y_2, real_1, real_2, target = make_data()

- 4-2. 2nd Modeling

#### LGBMRegressor

In [407]:
t_loss = []
qt_list = [.09, .19, .28, .37, .48, .59, .702, .8, .9]

for qt in range(9) :
    print(f'##### {qt + 1}번째 학습 및 검증 시작 #####')
    lgbm = LGBMRegressor(random_state = 519, objective = 'quantile', n_estimators = 1000, learning_rate=0.03, metric = 'quantile', alpha = qt_list[qt], max_depth = 4)
    lgbm.fit(X_1, y_1, eval_set = [(a_1day, real_1)], early_stopping_rounds = 1000, verbose = 0)
    best_loss = float(str(lgbm.best_score_['valid_0']).split(',')[-1][1:-3])
    print(f'best loss는 {best_loss}\n')
    t_loss.append(best_loss)
    lgbm_pred = [0 if x < 0 else x for x in lgbm.predict(target)]
    submission.loc[submission.id.str.contains('y7'), submission.columns[1+ qt]] = lgbm_pred
lgbm_submission = submission.copy()

print(f'전체 loss 평균은 {round(np.mean(t_loss), 5)}')
print(f'전체 loss 편차는 {round(np.std(t_loss), 5)}')

##### 1번째 학습 및 검증 시작 #####
best loss는 1.3063246587046102

##### 2번째 학습 및 검증 시작 #####
best loss는 2.2302982101149595

##### 3번째 학습 및 검증 시작 #####
best loss는 2.692045557106718

##### 4번째 학습 및 검증 시작 #####
best loss는 2.8564936274637764

##### 5번째 학습 및 검증 시작 #####
best loss는 2.7791290927927563

##### 6번째 학습 및 검증 시작 #####
best loss는 2.4579022824048202

##### 7번째 학습 및 검증 시작 #####
best loss는 1.9746674848346708

##### 8번째 학습 및 검증 시작 #####
best loss는 1.4581315542925963

##### 9번째 학습 및 검증 시작 #####
best loss는 0.8504881798633562

전체 loss 평균은 2.06728
전체 loss 편차는 0.6781


In [408]:
lgbm_submission.loc[lgbm_submission.id.str.contains('y7')].describe()

Unnamed: 0,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
count,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0
mean,6.671701,11.871896,15.061467,17.616401,19.917502,21.4124,22.65363,23.37348,24.33536
std,10.066146,16.619501,20.377033,23.522,26.076597,27.84958,29.25075,30.10668,30.67415
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,8.623251e-13,1.403407e-12,3.078631e-12,4.35488e-12
50%,0.17244,0.68268,0.946676,1.280808,1.559619,1.814575,2.536625,2.675686,4.410089
75%,11.650431,20.991169,28.464645,34.201856,39.531262,43.19174,45.99694,46.80014,48.22728
max,53.742815,66.079905,75.788528,89.005527,90.874297,92.99441,94.30983,96.49534,98.5925


In [409]:
t_loss = []
qt_list = [.09, .19, .28, .37, .48, .59, .702, .8, .9]

for qt in range(9) :
    print(f'##### {qt + 1}번째 학습 및 검증 시작 #####')
    lgbm = LGBMRegressor(random_state = 519, objective = 'quantile', n_estimators = 1000, learning_rate=0.03, metric = 'quantile', alpha = qt_list[qt], max_depth = 4)
    lgbm.fit(X_2, y_2, eval_set = [(a_2day, real_2)], early_stopping_rounds = 1000, verbose = 0)
    best_loss = float(str(lgbm.best_score_['valid_0']).split(',')[-1][1:-3])
    print(f'best loss는 {best_loss}\n')
    t_loss.append(best_loss)
    lgbm_pred = [0 if x < 0 else x for x in lgbm.predict(target)]
    submission.loc[submission.id.str.contains('y8'), submission.columns[1+ qt]] = lgbm_pred
lgbm_submission = submission.copy()

print(f'전체 loss 평균은 {round(np.mean(t_loss), 5)}')
print(f'전체 loss 편차는 {round(np.std(t_loss), 5)}')

##### 1번째 학습 및 검증 시작 #####
best loss는 1.367070734496164

##### 2번째 학습 및 검증 시작 #####
best loss는 2.3812752019465666

##### 3번째 학습 및 검증 시작 #####
best loss는 2.869918320266945

##### 4번째 학습 및 검증 시작 #####
best loss는 3.0352618558329807

##### 5번째 학습 및 검증 시작 #####
best loss는 2.9296138938861325

##### 6번째 학습 및 검증 시작 #####
best loss는 2.564403764143778

##### 7번째 학습 및 검증 시작 #####
best loss는 2.094579895822889

##### 8번째 학습 및 검증 시작 #####
best loss는 1.5419708136652193

##### 9번째 학습 및 검증 시작 #####
best loss는 0.9075410630905644

전체 loss 평균은 2.18796
전체 loss 편차는 0.71891


In [410]:
lgbm_submission.loc[lgbm_submission.id.str.contains('y8')].describe()

Unnamed: 0,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
count,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0
mean,5.240626,11.177897,14.8153,17.61662,20.196734,21.87942,22.94041,23.70545,24.6246
std,7.317789,15.221067,19.683597,23.166726,26.297704,28.3309,29.65127,30.46075,30.82637
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,5.138109e-13,1.491603e-12,2.412947e-12,3.974722e-12
50%,0.112862,0.502697,1.0372,1.365043,1.333066,1.571111,2.545459,2.864279,5.142566
75%,9.942574,21.338157,29.406973,35.911892,41.227578,44.16987,46.29071,46.59214,48.65305
max,25.594997,62.873798,85.15696,93.107017,97.199262,97.45169,97.6187,96.99815,98.02837


#### GradientBoostingRegressor

In [388]:
t_loss = []
qt_list = [.09, .19, .28, .37, .48, .59, .702, .8, .9]

for qt in range(9) :
    print(f'##### {qt + 1}번째 학습 및 검증 시작 #####')
    gbm = GradientBoostingRegressor(random_state = 1231, loss = 'quantile', alpha = qt_list[qt], max_depth = 4)
    gbm.fit(X_1, y_1)
    gbm_pred = gbm.predict(a_1day)
    loss = pb_loss(real_1, gbm_pred, qt_list[qt])
    print(f'loss는 {loss}\n')
    t_loss.append(loss)
    gbm_pred = [0 if x < 0 else x for x in gbm.predict(target)]
    submission.loc[submission.id.str.contains('y7'), submission.columns[1+ qt]] = gbm_pred
gbm_submission = submission.copy()
print(f'전체 loss 평균은 {round(np.mean(t_loss), 5)}')
print(f'전체 loss 편차는 {round(np.std(t_loss), 5)}')

##### 1번째 학습 및 검증 시작 #####
loss는 1.3077501520177364

##### 2번째 학습 및 검증 시작 #####
loss는 2.240453783151971

##### 3번째 학습 및 검증 시작 #####
loss는 2.7038913534451776

##### 4번째 학습 및 검증 시작 #####
loss는 2.87953239537066

##### 5번째 학습 및 검증 시작 #####
loss는 2.8105330482094386

##### 6번째 학습 및 검증 시작 #####
loss는 2.4989654754515302

##### 7번째 학습 및 검증 시작 #####
loss는 2.0259229526608213

##### 8번째 학습 및 검증 시작 #####
loss는 1.5184346967527529

##### 9번째 학습 및 검증 시작 #####
loss는 0.9243956822071292

전체 loss 평균은 2.1011
전체 loss 편차는 0.66746


In [390]:
t_loss = []
qt_list = [.09, .19, .28, .37, .48, .59, .702, .8, .9]

for qt in range(9) :
    print(f'##### {qt + 1}번째 학습 및 검증 시작 #####')
    gbm = GradientBoostingRegressor(random_state = 1231, loss = 'quantile', alpha = qt_list[qt], max_depth = 4)
    gbm.fit(X_2, y_2)
    gbm_pred = gbm.predict(a_2day)
    loss = pb_loss(real_2, gbm_pred, qt_list[qt])
    print(f'loss는 {loss}\n')
    t_loss.append(loss)
    gbm_pred = [0 if x < 0 else x for x in gbm.predict(target)]
    submission.loc[submission.id.str.contains('y8'), submission.columns[1+ qt]] = gbm_pred
gbm_submission = submission.copy()
print(f'전체 loss 평균은 {round(np.mean(t_loss), 5)}')
print(f'전체 loss 편차는 {round(np.std(t_loss), 5)}')

##### 1번째 학습 및 검증 시작 #####
loss는 1.3747640146695435

##### 2번째 학습 및 검증 시작 #####
loss는 2.385445260579046

##### 3번째 학습 및 검증 시작 #####
loss는 2.8749078485044417

##### 4번째 학습 및 검증 시작 #####
loss는 3.053864587659912

##### 5번째 학습 및 검증 시작 #####
loss는 2.9649630641206786

##### 6번째 학습 및 검증 시작 #####
loss는 2.6238076321761765

##### 7번째 학습 및 검증 시작 #####
loss는 2.142435794070694

##### 8번째 학습 및 검증 시작 #####
loss는 1.6006536419625252

##### 9번째 학습 및 검증 시작 #####
loss는 0.9636419268060754

전체 loss 평균은 2.2205
전체 loss 편차는 0.71119


In [391]:
gbm_submission.describe()

Unnamed: 0,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
count,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0
mean,6.335922,11.514949,14.905705,17.446463,19.811836,21.44536,22.725336,23.522476,24.818246
std,9.385291,16.032742,20.043526,23.070594,25.785543,27.773939,29.23034,30.117441,30.717524
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.000204,0.000631,0.001084,0.00163
50%,0.148936,0.475107,1.054065,1.300678,1.790515,2.021285,2.931271,2.638427,5.500229
75%,11.301639,21.112824,28.804406,34.150701,39.684708,43.023884,45.802351,46.291108,48.841
max,52.848297,84.035331,90.216481,87.390825,91.923439,94.654569,96.071678,97.149824,98.524096


## 5. Submission

In [392]:
submission.iloc[:, 1:] = lgbm_submission.iloc[:, 1:] * .8 + gbm_submission.iloc[:, 1:] * .2

In [393]:
zero_t = submission.loc[submission.id.str.contains('_0h|_1h|_2h|_3h|_4h|_21h|_22h|_23h')]
zero_t.iloc[:, 1:] = 0

In [394]:
submission.loc[zero_t.index] = zero_t

In [395]:
submission.describe()

Unnamed: 0,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
count,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0
mean,6.030868,11.521225,14.929614,17.580602,20.005611,21.602848,22.772212,23.526283,24.50194
std,8.921884,15.94674,20.028897,23.279736,26.099277,28.020582,29.406968,30.250258,30.758115
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.124645,0.540194,0.995598,1.288874,1.486045,1.72715,2.539565,2.682867,4.443404
75%,10.866396,21.191734,28.886014,34.881629,40.197827,43.702966,46.058759,46.68256,48.476355
max,53.290696,67.014343,85.42123,91.84835,96.144097,96.892266,97.309292,96.759488,98.378237


In [396]:
save_time = datetime.datetime.strftime(datetime.datetime.today(),'%Y%m%d_%H%M%S')

submission.to_csv(f'sun_{save_time}.csv', index=False)