# Solar Energy Generation Prediction

In [382]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import GradientBoostingRegressor
import os
from lightgbm import LGBMRegressor
import datetime
from itertools import combinations
from sklearn.model_selection import KFold, TimeSeriesSplit
from itertools import chain, repeat
from tqdm.notebook import tqdm
import math

In [383]:
train = pd.read_csv('/Users/kisehyun/Competition/데이콘_태양광 발전량 예측/train/train.csv')

for i in range(81) :
    globals() [f'test_{i}'] = pd.read_csv(f'/Users/kisehyun/Competition/데이콘_태양광 발전량 예측/test/{i}.csv')
    globals() [f'test_{i}']['sep_day'] = i
    
### 각 데이터를 하나의 데이터로 통합
test = pd.DataFrame()
idx_list = []
for i in range(81):
    test = pd.concat([test, globals()[f'test_{i}']], axis = 0, ignore_index = True)
    
submission = pd.read_csv('sample_submission.csv')

In [384]:
train.Hour = train.Hour.astype('category')
test.Hour = test.Hour.astype('category')

In [385]:
train['rainfall'] = [1 if x == 100 else 0 for x in train.RH]
test['rainfall'] = [1 if x == 100 else 0 for x in test.RH]

train.rainfall = train.rainfall.astype('category')
test.rainfall = test.rainfall.astype('category')

In [386]:
def pb_loss(true, pred, q) :
    
    L_list = []
    
    for i in range(len(true)) :
        
        if true.iloc[i] >= pred[i] :
            L = true.iloc[i] - pred[i]
            L *= q
            
        else :
            L1 = pred[i] - true.iloc[i]
            L2 = 1 - q
            L = L1 * L2
            
        L_list.append(L)
        
    return np.mean(L_list)

In [387]:
def make_data(data, n1, n2, is_train = None) : 
    
    data['after_1d'] = data.shift(n1)['TARGET']
    data['after_2d'] = data.shift(n2)['TARGET']
    if is_train == True :
        data.dropna(inplace = True)
    else :
        pass
    return(data)

In [388]:
df_train = make_data(train, -48, -96, is_train = True) # 1일, 2일 뒤 TARGET 
df_test = test.copy()

In [389]:
b = 17.62
c = 243.12

gamma = (b * df_train['T'] / (c + df_train['T'])) + np.log(df_train.RH / 100)
df_train['rh_d'] = (c * gamma) / (b - gamma)

gamma = (b * df_test['T'] / (c + df_test['T'])) + np.log(df_test.RH / 100)
df_test['rh_d'] = (c * gamma) / (b - gamma)

In [390]:
df_train['vp'] = 6.11 * 10 ** (7.5 * df_train['T'] / (df_train['T'] + 237.3))
df_test['vp'] = 6.11 * 10 ** (7.5 * test['T'] / (test['T'] + 237.3))

In [391]:
df_test['sep'] = test.index // 48 + 1

In [392]:
tr_times = []
for day in df_train.Day.unique() :
    d = df_train.loc[df_train.Day == day]
    try :
        start = d.loc[d.TARGET > 0].index[0]
        end = d.loc[d.TARGET > 0].index[-1]
        gap = (end - start) / 2
    except :
        gap = 0

    tr_times.append(gap)

In [393]:
df_test['sep'] = df_test.index // 48 # df_df_test 데이터의 경우 시계열 고려가 안되었기 때문에 48(하루 시간)으로 나눈 몫으로 day 구분자 생성


te_times = []
for day in df_test.sep.unique() :
    d = df_test.loc[df_test.sep == day]
    try :
        start = d.loc[d.TARGET > 0].index[0]
        end = d.loc[d.TARGET > 0].index[-1]
        gap = (end - start) / 2
    except :
        gap = 0

    te_times.append(gap)

In [394]:
df_train_sun_time = pd.DataFrame({'Day' : df_train.Day.unique(), 'sun_time' : tr_times})
df_test_sun_time = pd.DataFrame({'sep' : df_test.sep.unique(), 'sun_time' : te_times})

In [395]:
df_train = pd.merge(df_train, df_train_sun_time, how = 'left', on = 'Day')
df_test = pd.merge(df_test, df_test_sun_time, how = 'left', on = 'sep')#.drop('sep', axis = 1)

sin h = (sin δ * sin φ) + (cos δ * cos φ * cos H)

In [396]:
# 적위
df_train['dos'] = [-23.44 * np.cos(360 / 365 * (x + 10)) for x in df_train.Day]
df_test['dos'] = [-23.44 * np.cos(360 / 365 * (x + 10)) for x in df_test.sep_day]

In [397]:
# 시간각
df_train['high_angle'] = [(x - 12) * 15 if x >= 12 else -(12 - x) * 15 for x in df_train.Hour]
df_test['high_angle'] = [(x - 12) * 15 if x >= 12 else -(12 - x) * 15 for x in df_test.Hour]

In [398]:
df_train['zenith_angle'] = 90 - 1 / (np.sin(np.sin(df_train['dos']) * np.sin(36) + np.cos(df_train['dos']) *np.cos(36) * np.cos(df_train['high_angle'])))
df_test['zenith_angle'] = 90 - 1 / (np.sin(np.sin(df_test['dos']) * np.sin(36) + np.cos(df_test['dos']) *np.cos(36) * np.cos(df_test['high_angle'])))

In [399]:
df_train['GHI'] = np.cos(df_train.zenith_angle) * df_train.DNI + df_train.DHI
df_test['GHI'] = np.cos(df_test.zenith_angle) * df_test.DNI + df_test.DHI

In [404]:
def kfold_lgbm_prediction(trainx, trainy, target, n) :
    lgbm_submission = pd.DataFrame(np.zeros((3888, 9)), columns = submission.columns[1:],
                                   index = submission.loc[submission.id.str.contains(f'y{n}')].index)
    
    kf = KFold(n_splits = 5, shuffle = True, random_state = 114)

    
    quantile = [.09, .19, .28, .37, .48, .59, .702, .8, .9]
    
    total_loss = []
    total_val_loss = []
    
    for i, (tr_idx, val_idx) in enumerate(kf.split(trainx, trainy)) :

        print('=' * 30)
        print(f'        {i + 1}번째 KFOLD 시작')

        nfold_loss = []
        nfold_val_loss = []

        tr_X, tr_y = trainx.iloc[tr_idx], trainy.iloc[tr_idx]
        val_X, val_y = trainx.iloc[val_idx], trainy.iloc[val_idx]
        
        p = 0
        for q in quantile:
            p += 1
            lgbm = LGBMRegressor(random_state = 114, objective = 'quantile', metric = 'quantile', alpha = q, max_depth = 5)
            lgbm.fit(tr_X, tr_y, eval_set = [(val_X, val_y)], eval_metric = 'quantile', early_stopping_rounds = 500, verbose = 0)
            best_loss = float(str(lgbm.best_score_['valid_0']).split(',')[-1][1:-3])
            print(f'{q} quantile loss = {best_loss}')
            nfold_loss.append(best_loss)
            
            total_loss.append(best_loss)
            
            lgbm_pred = [0 if x < 0 else x for x in lgbm.predict(target) / 5]
            lgbm_submission.iloc[:, p - 1] += lgbm_pred
        print(f'{i + 1}번째 KFOLD 평균 loss는 {np.mean(nfold_loss)}')

    print(f'\n전체 평균 loss는 {np.mean(total_loss)}')
    return lgbm_submission

In [405]:
X = df_train.drop(['Day','Minute', 'DHI', 'RH', 'WS', 'after_1d', 'after_2d'], axis = 1)
y1 = df_train['after_1d']
y2 = df_train['after_2d']
target = df_test.loc[df_test.Day == 6, X.columns]

In [406]:
lgbm7 = kfold_lgbm_prediction(X, y1, target, 7)

        1번째 KFOLD 시작
0.09 quantile loss = 1.2408554583650921
0.19 quantile loss = 2.081498570109854
0.28 quantile loss = 2.474417478168031
0.37 quantile loss = 2.6335225503128226
0.48 quantile loss = 2.589231708347374
0.59 quantile loss = 2.3070683740695808
0.702 quantile loss = 1.8534705953817074
0.8 quantile loss = 1.3809208364633037
0.9 quantile loss = 0.7736684191483697
1번째 KFOLD 평균 loss는 1.9260726655962368
        2번째 KFOLD 시작
0.09 quantile loss = 1.2269804496789525
0.19 quantile loss = 2.054898206855775
0.28 quantile loss = 2.4426219469639627
0.37 quantile loss = 2.622212554260185
0.48 quantile loss = 2.590822419801225
0.59 quantile loss = 2.333713929816436
0.702 quantile loss = 1.86736590531883
0.8 quantile loss = 1.371437457812523
0.9 quantile loss = 0.7775085874032757
2번째 KFOLD 평균 loss는 1.9208401619901296
        3번째 KFOLD 시작
0.09 quantile loss = 1.251721056323598
0.19 quantile loss = 2.1010722759002975
0.28 quantile loss = 2.521589071423929
0.37 quantile loss = 2.669460776442

In [407]:
lgbm7.describe()

Unnamed: 0,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
count,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0
mean,7.383903,12.488317,15.507239,17.690991,19.913967,21.548179,22.844537,24.030285,25.481812
std,10.581362,17.197816,20.9841,23.587881,26.06404,28.003682,29.469555,30.509055,31.599727
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001085,0.001853
25%,0.0,0.0,0.0,0.0,0.0,0.000203,0.00063,0.001085,0.001853
50%,0.373151,0.75587,1.094405,1.403384,1.798063,2.037886,2.567992,3.560428,4.910135
75%,13.377438,23.148404,29.346605,34.167625,39.382718,43.183939,46.070374,48.502034,51.821122
max,51.671205,65.663506,77.320666,83.617651,88.583355,92.35011,94.867285,95.567873,97.080055


In [408]:
lgbm8 = kfold_lgbm_prediction(X, y2, target, 8)

        1번째 KFOLD 시작
0.09 quantile loss = 1.246248865685785
0.19 quantile loss = 2.131181800112783
0.28 quantile loss = 2.5774295088228367
0.37 quantile loss = 2.782855105827997
0.48 quantile loss = 2.7113123537866786
0.59 quantile loss = 2.429346099672056
0.702 quantile loss = 1.9695119019203848
0.8 quantile loss = 1.4497714353484932
0.9 quantile loss = 0.8043459840142859
1번째 KFOLD 평균 loss는 2.0113336727990334
        2번째 KFOLD 시작
0.09 quantile loss = 1.2550080528755674
0.19 quantile loss = 2.130717355609239
0.28 quantile loss = 2.573773014535428
0.37 quantile loss = 2.758998669242466
0.48 quantile loss = 2.731352848953171
0.59 quantile loss = 2.454634304333436
0.702 quantile loss = 1.9621749124391108
0.8 quantile loss = 1.4431760663168967
0.9 quantile loss = 0.8068747371204854
2번째 KFOLD 평균 loss는 2.0129677734917557
        3번째 KFOLD 시작
0.09 quantile loss = 1.285636942685336
0.19 quantile loss = 2.196802228341808
0.28 quantile loss = 2.651703065663615
0.37 quantile loss = 2.838494831273

In [409]:
lgbm8.describe()

Unnamed: 0,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
count,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0
mean,6.704957,11.968263,15.337114,17.965763,20.36673,22.078626,23.384826,24.564578,25.729699
std,9.359304,16.168969,20.335368,23.54963,26.436551,28.651058,30.16544,31.141906,31.832449
min,0.0,0.0,0.0,0.0,0.0,0.0,0.00063,0.001084,0.00183
25%,0.0,0.0,0.0,0.0,0.0,0.000203,0.00063,0.001084,0.00183
50%,0.333073,0.689778,1.029788,1.450391,1.881676,1.953209,2.59167,3.65414,5.379897
75%,12.743579,23.202345,30.655141,35.947132,41.192811,44.433936,46.863404,49.439989,52.362066
max,39.515362,70.166436,77.985925,81.515409,85.978025,92.048038,96.787724,96.92206,97.904492


In [414]:
def kfold_gb_prediction(trainx, trainy, target, n) :
    gb_submission = pd.DataFrame(np.zeros((3888, 9)), columns = submission.columns[1:],
                                   index = submission.loc[submission.id.str.contains(f'y{n}')].index)
    
    kf = KFold(n_splits = 5, shuffle = True, random_state = 114)

    
    quantile = [.09, .19, .28, .37, .48, .59, .702, .8, .9]
    
    total_loss = []
    total_val_loss = []
    
    for i, (tr_idx, val_idx) in enumerate(kf.split(trainx, trainy)) :

        print('=' * 30)
        print(f'        {i + 1}번째 KFOLD 시작')

        nfold_loss = []
        nfold_val_loss = []

        tr_X, tr_y = trainx.iloc[tr_idx], trainy.iloc[tr_idx]
        val_X, val_y = trainx.iloc[val_idx], trainy.iloc[val_idx]
        
        p = 0
        for q in quantile:
            p += 1
            gb = GradientBoostingRegressor(random_state = 114, loss = 'quantile', alpha = q, max_depth = 5)
            gb.fit(tr_X, tr_y)
            pred = gb.predict(val_X)
            best_loss = pb_loss(val_y, pred, q)
            print(f'{q} quantile loss = {best_loss}')
            nfold_loss.append(best_loss)
            
            total_loss.append(best_loss)
            
            gb_pred = [0 if x < 0 else x for x in gb.predict(target) / 5]
            gb_submission.iloc[:, p - 1] += gb_pred
        print(f'{i + 1}번째 KFOLD 평균 loss는 {np.mean(nfold_loss)}')

    print(f'전체 평균 loss는 {np.mean(total_loss)}')
    return gb_submission

In [415]:
gb7 = kfold_gb_prediction(X, y1, target, 7)

        1번째 KFOLD 시작
0.09 quantile loss = 1.2536774353615585
0.19 quantile loss = 2.085299938592596
0.28 quantile loss = 2.472886299047963
0.37 quantile loss = 2.645550264373355
0.48 quantile loss = 2.5878196127322086
0.59 quantile loss = 2.3327207255549065
0.702 quantile loss = 1.8872500548993043
0.8 quantile loss = 1.384074359342526
0.9 quantile loss = 0.7851315659455534
1번째 KFOLD 평균 loss는 1.9371566950944414
        2번째 KFOLD 시작
0.09 quantile loss = 1.2583646875105876
0.19 quantile loss = 2.0764871808695466
0.28 quantile loss = 2.461858841135225
0.37 quantile loss = 2.650337348524398
0.48 quantile loss = 2.613291919281131
0.59 quantile loss = 2.3609291223181974
0.702 quantile loss = 1.89724995493887
0.8 quantile loss = 1.3861140054367047
0.9 quantile loss = 0.7990690093085426
2번째 KFOLD 평균 loss는 1.9448557854803559
        3번째 KFOLD 시작
0.09 quantile loss = 1.272564628188334
0.19 quantile loss = 2.115907473160399
0.28 quantile loss = 2.5284516286320855
0.37 quantile loss = 2.70939622729

In [416]:
gb7.describe()

Unnamed: 0,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
count,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0
mean,6.850085,12.039928,15.074511,17.421482,19.571713,21.213387,22.376339,23.385513,24.644652
std,9.762055,16.546899,20.420253,23.225668,25.702945,27.655589,28.988694,29.861888,30.581873
min,0.0,0.0,0.0,0.0,0.0,0.0,0.000504,0.001085,0.001454
25%,0.0,0.0,0.0,0.0,0.0,0.000203,0.00063,0.001085,0.024982
50%,0.37448,0.729088,1.006616,1.392201,1.676131,2.068385,2.520642,3.441036,5.486739
75%,12.471416,21.890322,28.151802,33.471476,38.582389,42.801154,44.442424,46.555982,49.007226
max,47.890251,62.091996,75.400413,81.038544,87.414525,90.862096,93.742638,95.344668,96.405524


In [417]:
gb8 = kfold_gb_prediction(X, y2, target, 8)

        1번째 KFOLD 시작
0.09 quantile loss = 1.2677567241284688
0.19 quantile loss = 2.1679683638835963
0.28 quantile loss = 2.6113589016872103
0.37 quantile loss = 2.791106536126418
0.48 quantile loss = 2.739735037426501
0.59 quantile loss = 2.4584310238106886
0.702 quantile loss = 1.9834399209948164
0.8 quantile loss = 1.4545575246831153
0.9 quantile loss = 0.8252085578313323
1번째 KFOLD 평균 loss는 2.033284732285794
        2번째 KFOLD 시작
0.09 quantile loss = 1.272054876452408
0.19 quantile loss = 2.1698286192912573
0.28 quantile loss = 2.615848620363
0.37 quantile loss = 2.7980526258099867
0.48 quantile loss = 2.7506811589622817
0.59 quantile loss = 2.4538841744751947
0.702 quantile loss = 1.983121238294539
0.8 quantile loss = 1.446873736698353
0.9 quantile loss = 0.8180900961557885
2번째 KFOLD 평균 loss는 2.0342705718336456
        3번째 KFOLD 시작
0.09 quantile loss = 1.313433116490525
0.19 quantile loss = 2.232283388521042
0.28 quantile loss = 2.6761010626463837
0.37 quantile loss = 2.861337702921

In [418]:
gb8.describe()

Unnamed: 0,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
count,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0
mean,6.100035,11.327073,14.854631,17.484043,19.915861,21.539918,22.732707,23.678941,24.930509
std,8.3996,15.150717,19.634915,22.889013,25.862462,27.973073,29.334562,30.153629,30.784903
min,0.0,0.0,0.0,0.0,0.0,0.000203,0.00063,0.001084,0.00121
25%,0.0,0.0,0.0,0.0,0.0,0.000203,0.00063,0.001084,0.02012
50%,0.229303,0.647852,1.181871,1.561115,2.002067,2.162515,2.614363,3.770801,5.84668
75%,11.567607,22.122552,29.611817,35.209484,40.217151,43.3197,45.472078,46.63444,49.375641
max,31.978671,58.168114,69.938865,76.580296,84.510558,92.228101,94.986842,96.268382,96.534592


In [419]:
submission.loc[submission.id.str.contains('y7'), submission.columns[1:]] = lgbm7 * .6 + gb7 * .4
submission.loc[submission.id.str.contains('y8'), submission.columns[1:]] = lgbm8 * .7 + gb8 * .3

In [420]:
submission.loc[submission.id.str.contains('_0h|_1h|_2h|_3h|_4h|20h|21h|22h|23h'), submission.columns[1:]] = 0

In [421]:
submission.describe()

Unnamed: 0,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
count,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0
mean,6.84321,12.035924,15.253489,17.691248,19.99083,21.653988,22.913723,24.019642,25.29039
std,9.667841,16.398367,20.431927,23.391769,26.088867,28.152849,29.587875,30.534757,31.342747
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.318213,0.700802,1.055926,1.441081,1.800563,1.995584,2.588545,3.403086,4.866488
75%,12.689828,22.605776,29.712476,34.89223,39.975393,43.5289,45.824446,48.031452,51.001846
max,49.182281,66.506196,76.494094,82.372939,87.837835,92.102057,95.661796,95.872497,96.498077


In [422]:
time_str = datetime.datetime.strftime(datetime.datetime.today(),'%Y%m%d_%H%M%S')
submission.to_csv(f'sun_{time_str}.csv', index=False)

# LGBM만 했을 때 1.86199