# Solar Energy Generation Prediction

In [271]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import GradientBoostingRegressor
import os
from lightgbm import LGBMRegressor
import datetime
from itertools import combinations
from sklearn.model_selection import KFold, TimeSeriesSplit
from itertools import chain, repeat
from tqdm.notebook import tqdm

In [272]:
train = pd.read_csv('/Users/kisehyun/Competition/데이콘_태양광 발전량 예측/train/train.csv')

for i in range(81) :
    globals() [f'test_{i}'] = pd.read_csv(f'/Users/kisehyun/Competition/데이콘_태양광 발전량 예측/test/{i}.csv')
    globals() [f'test_{i}']['sep_day'] = i
    
### 각 데이터를 하나의 데이터로 통합
test = pd.DataFrame()
idx_list = []
for i in range(81):
    test = pd.concat([test, globals()[f'test_{i}']], axis = 0, ignore_index = True)
    
submission = pd.read_csv('sample_submission.csv')

In [273]:
train.Hour = train.Hour.astype('category')
test.Hour = test.Hour.astype('category')

In [274]:
train['rainfall'] = [1 if x == 100 else 0 for x in train.RH]
test['rainfall'] = [1 if x == 100 else 0 for x in test.RH]

In [275]:
def pb_loss(true, pred, q) :
    
    L_list = []
    
    for i in range(len(true)) :
        
        if true.iloc[i] >= pred[i] :
            L = true.iloc[i] - pred[i]
            L *= q
            
        else :
            L1 = pred[i] - true.iloc[i]
            L2 = 1 - q
            L = L1 * L2
            
        L_list.append(L)
        
    return np.mean(L_list)

In [276]:
def base_preprocess(data, n, n2) : 
    
    data['1day_after_target'] = data.shift(n)['TARGET']
    data['2day_after_target'] = data.shift(n2)['TARGET']
    
    # step2 : 7일간의 segment를 할당하여 예측을 진행 
    
    data = data.dropna(axis=0)
    
    return(data)

In [277]:
b = 17.62
c = 243.12

train['rh_d'] = (c * np.log1p(train.RH / 100) + (b * train['T'] / (c + train['T']))) / (b - (np.log1p(train.RH / 100) + b * train['T'] / (c + train['T'])))
test['rh_d'] = (c * np.log1p(test.RH / 100) + (b * test['T'] / (c + test['T']))) / (b - (np.log1p(test.RH / 100) + b * test['T'] / (c + test['T'])))

In [278]:
train['vp'] = 6.11 * 10 ** (7.5 * train['T'] / (train['T'] + 237.3))
test['vp'] = 6.11 * 10 ** (7.5 * test['T'] / (test['T'] + 237.3))

In [279]:
df_train = base_preprocess(train, -48, -96) # 1일, 2일 뒤 TARGET 
#df_train_day23 = base_preprocess(train, -96, -144) # 2일, 3일 뒤 TARGET

In [280]:
test['sep'] = test.index // 48 + 1

In [281]:
tr_times = []
for day in df_train.Day.unique() :
    d = df_train.loc[df_train.Day == day]
    try :
        start = d.loc[d.TARGET > 0].index[0]
        end = d.loc[d.TARGET > 0].index[-1]
        gap = (end - start) / 2
    except :
        gap = 0

    tr_times.append(gap)

In [282]:
test['sep'] = test.index // 48 # df_test 데이터의 경우 시계열 고려가 안되었기 때문에 48(하루 시간)으로 나눈 몫으로 day 구분자 생성


te_times = []
for day in test.sep.unique() :
    d = test.loc[test.sep == day]
    try :
        start = d.loc[d.TARGET > 0].index[0]
        end = d.loc[d.TARGET > 0].index[-1]
        gap = (end - start) / 2
    except :
        gap = 0

    te_times.append(gap)

In [283]:
df_train_sun_time = pd.DataFrame({'Day' : df_train.Day.unique(), 'sun_time' : tr_times})
df_test_sun_time = pd.DataFrame({'sep' : test.sep.unique(), 'sun_time' : te_times})

In [284]:
df_train = pd.merge(df_train, df_train_sun_time, how = 'left', on = 'Day')
df_test = pd.merge(test, df_test_sun_time, how = 'left', on = 'sep')#.drop('sep', axis = 1)

sin h = (sin δ * sin φ) + (cos δ * cos φ * cos H)

In [285]:
# 적위
df_train['dos'] = [-23.44 * np.cos(360 / 365 * (x + 10)) for x in df_train.Day]
df_test['dos'] = [-23.44 * np.cos(360 / 365 * (x + 10)) for x in df_test.sep_day]

In [286]:
# 시간각
df_train['high_angle'] = [(x - 12) * 15 if x >= 12 else -(12 - x) * 15 for x in df_train.Hour]
df_test['high_angle'] = [(x - 12) * 15 if x >= 12 else -(12 - x) * 15 for x in df_test.Hour]

In [287]:
df_train['zenith_angle'] = 90 - 1 / (np.sin(np.sin(df_train['dos']) * np.sin(36) + np.cos(df_train['dos']) *np.cos(36) * np.cos(df_train['high_angle'])))
df_test['zenith_angle'] = 90 - 1 / (np.sin(np.sin(df_test['dos']) * np.sin(36) + np.cos(df_test['dos']) *np.cos(36) * np.cos(df_test['high_angle'])))

In [288]:
df_train['GHI'] = np.cos(df_train.zenith_angle) * df_train.DNI + df_train.DHI
df_test['GHI'] = np.cos(df_test.zenith_angle) * df_test.DNI + df_test.DHI

In [289]:
r_cols = ['DHI', 'DNI', 'T', 'TARGET', 'sun_time', 'GHI', 'zenith_angle']
rolling_num = [49, 97, 146]
for rn in rolling_num :
    for c in r_cols :
        df_train[f'{c}_{rn}'] = df_train[c].rolling(rn).mean()
df_train.dropna(how = 'any', inplace = True)


data = pd.DataFrame()
for s in df_test.sep_day.unique() :
    df = df_test.loc[df_test.sep_day == s]
    for rn in rolling_num :
        for c in r_cols :
            df[f'{c}_{rn}'] = df[c].rolling(rn).mean()
        data = pd.concat([data, df], axis = 0, ignore_index = True)
data.dropna(how = 'any', inplace = True)
df_test = data.copy()

In [290]:
def kfold_lgbm_prediction(trainx, trainy, target, n) :
    lgbm_submission = pd.DataFrame(np.zeros((3888, 9)), columns = submission.columns[1:],
                                   index = submission.loc[submission.id.str.contains(f'y{n}')].index)
    
    kf = KFold(n_splits = 5, shuffle = True, random_state = 114)

    
    quantile = [.09, .19, .29, .39, .5, .6, .7, .8, .9]
    
    total_loss = []
    total_val_loss = []
    
    for i, (tr_idx, val_idx) in enumerate(kf.split(trainx, trainy)) :

        print('=' * 30)
        print(f'        {i + 1}번째 KFOLD 시작')

        nfold_loss = []
        nfold_val_loss = []

        tr_X, tr_y = trainx.iloc[tr_idx], trainy.iloc[tr_idx]
        val_X, val_y = trainx.iloc[val_idx], trainy.iloc[val_idx]
        
        p = 0
        for q in quantile:
            p += 1
            lgbm = LGBMRegressor(random_state = 114, objective = 'quantile', metric = 'quantile', alpha = q, max_depth = 5)
            lgbm.fit(tr_X, tr_y, eval_set = [(val_X, val_y)], eval_metric = 'quantile', early_stopping_rounds = 500, verbose = 300)
            best_loss = float(str(lgbm.best_score_['valid_0']).split(',')[-1][1:-3])
        

            nfold_loss.append(best_loss)
            
            total_loss.append(best_loss)
            
            lgbm_pred = [0 if x < 0 else x for x in lgbm.predict(target) / 5]
            lgbm_submission.iloc[:, p - 1] += lgbm_pred
        print(f'{i + 1}번째 KFOLD 평균 loss는 {np.mean(nfold_loss)}')

    print(f'전체 평균 loss는 {np.mean(total_loss)}')
    return lgbm_submission

In [291]:
X = df_train.drop(['Day','Minute','TARGET',  '1day_after_target', '2day_after_target'], axis = 1)
y1 = df_train['1day_after_target']
y2 = df_train['2day_after_target']

target = df_test.loc[df_test.Day == 6, X.columns]

In [292]:
lgbm7 = kfold_lgbm_prediction(X, y1, target, 7)

        1번째 KFOLD 시작
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 1.21599
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 2.01371
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 2.42103
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 2.58506
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 2.53382
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 2.27629
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]

Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 2.52243
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 2.67944
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 2.60655
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 2.33667
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 1.93786
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 1.43346
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 0.804464
4번째 KFOLD 평균 loss는 1.9524896156032798
        5번째 KFO

In [293]:
lgbm7.describe()

Unnamed: 0,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
count,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0
mean,7.525856,12.022982,15.175353,17.536051,19.541758,21.114489,22.571263,24.084013,25.594906
std,10.435018,16.373809,20.39355,23.332329,25.786492,27.625343,29.286878,30.647545,31.874649
min,0.0,0.0,0.0,0.0,0.0,0.0,0.000624,0.000868,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.000238,0.000624,0.001086,0.001809
50%,0.229784,0.464395,0.650556,1.176627,1.472437,1.920562,2.270877,2.95352,3.855774
75%,14.181651,22.974709,29.727946,34.055234,38.426185,42.001526,45.059218,49.25125,51.366157
max,43.505038,63.130019,73.540125,80.648015,87.078059,88.876583,93.679309,94.715351,96.670138


In [294]:
lgbm8 = kfold_lgbm_prediction(X, y2, target, 8)

        1번째 KFOLD 시작
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 1.25501
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 2.08052
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 2.5148
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 2.66682
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 2.62627
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 2.34979
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	

Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 2.54443
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 2.72208
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 2.64674
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 2.38597
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 1.97695
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 1.46331
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's quantile: 0.826955
4번째 KFOLD 평균 loss는 1.9894219042612105
        5번째 KFO

In [295]:
lgbm8.describe()

Unnamed: 0,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
count,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0,3888.0
mean,7.081376,11.530828,14.793027,17.536348,19.845769,21.476605,22.851905,24.358156,25.790457
std,9.929677,15.758531,19.702048,23.084613,25.981865,27.951788,29.515481,30.875605,31.979675
min,0.0,0.0,0.0,0.0,0.0,0.0,0.000624,0.0,0.001771
25%,0.0,0.0,0.0,0.0,0.0,0.000239,0.000624,0.001087,0.001771
50%,0.445097,0.576811,0.957792,1.214871,1.346432,1.807563,2.221239,2.832692,4.271856
75%,13.23488,22.32142,29.504054,35.33349,39.86853,43.151924,45.882598,49.921948,51.103362
max,47.855634,67.275867,74.343857,80.028295,86.900876,89.904101,92.720811,94.731157,96.49642


#### TEAM EDA

In [301]:
submission.loc[submission.id.str.contains('y7'), submission.columns[1:]] = lgbm7
submission.loc[submission.id.str.contains('y8'), submission.columns[1:]] = lgbm8

In [305]:
submission.loc[submission.id.str.contains('_0h|_1h|_2h|_3h|_4h|20h|21h|22h|23h'), submission.columns[1:]] = 0

In [306]:
submission.describe()

Unnamed: 0,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
count,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0,7776.0
mean,7.299411,11.771128,14.974334,17.524341,19.678802,21.281173,22.70306,24.202946,25.637304
std,10.189668,16.07345,20.056128,23.214073,25.89153,27.795489,29.405795,30.772789,31.966309
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.313311,0.449392,0.712724,1.120725,1.310773,1.765933,2.183618,2.763154,3.729849
75%,13.805595,22.739359,29.603031,34.824644,39.283885,42.512487,45.512835,49.606146,51.175829
max,47.855634,67.275867,74.343857,80.648015,87.078059,89.904101,93.679309,94.731157,96.670138


In [307]:
submission

Unnamed: 0,id,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
0,0.csv_Day7_0h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.csv_Day7_0h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.csv_Day7_1h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.csv_Day7_1h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.csv_Day7_2h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
7771,80.csv_Day8_21h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7772,80.csv_Day8_22h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7773,80.csv_Day8_22h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7774,80.csv_Day8_23h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [308]:
time_str = datetime.datetime.strftime(datetime.datetime.today(),'%Y%m%d_%H%M%S')
submission.to_csv(f'sun_{time_str}.csv', index=False)