In [None]:
import pandas as pd
import numpy as np
import os
import glob
import random

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor

In [None]:
# Read the training frame and the sample-submission template from disk.
train = pd.read_csv('../data/train/train.csv')
submission = pd.read_csv('../data/sample_submission.csv')

# Parse the third '_'-separated token of every id (e.g. '4h30m' -> 4.30) into a
# temporary float column so rows can be selected by time of day.
submission = submission.assign(
    time=submission['id']
    .apply(lambda row_id: row_id.split('_')[2].replace('h', '.').replace('m', ''))
    .astype(float)
)

# Rows at or before 4:30 and at or after 19:30 get their generation fixed to 0 later on.
zero_idx = submission.query('time <= 4.3 or time >= 19.3').index
submission = submission.drop(columns='time')

In [None]:
def load(day_range=1):
    """Build flattened look-back feature rows for the training data and the test files.

    Reads the notebook-level ``train`` frame and the files ``../data/test/0.csv`` ..
    ``../data/test/80.csv``.

    Args:
        day_range: number of consecutive days that make up one look-back window.

    Returns:
        ``(train_arr, tar_arr, test_arr)``:
        * ``train_arr`` - one row per window-ending day; the DHI, DNI, WS, RH, T and
          TARGET values of the window laid out column after column.
        * ``tar_arr``   - for the same days, the TARGET values of the following day
          followed by those of the day after that.
        * ``test_arr``  - one row per test file, built from the window ending at Day 6.
    """
    columns = ['DHI', 'DNI', 'WS', 'RH', 'T', 'TARGET']

    def flatten(window):
        # One column after another: every DHI value, then every DNI value, and so on.
        return np.concatenate([window[name].values.reshape(-1) for name in columns])

    feature_rows = []
    target_rows = []
    # Stop two days before the end so that day+1 and day+2 targets exist
    # (the day count is derived assuming 48 rows per day).
    last_day = int(train.shape[0]/48-2)
    for day in tqdm(range(day_range-1, last_day)):
        in_window = (train['Day']<=day) & (train['Day']>day-day_range)
        feature_rows.append(flatten(train.loc[in_window, columns]))
        target_rows.append(
            train.loc[train['Day']==day+1, 'TARGET'].values.tolist()
            + train.loc[train['Day']==day+2, 'TARGET'].values.tolist()
        )
    train_arr = np.concatenate([feature_rows], 1)
    tar_arr = np.concatenate([target_rows], 1)
    print(train_arr.shape, tar_arr.shape)

    test_rows = []
    for i in tqdm(range(81)):
        frame = pd.read_csv('../data/test/' + str(i) + '.csv')
        # Same window length as in training, ending at Day 6 of the test file.
        recent = (frame['Day']<=6) & (frame['Day']>6-day_range)
        test_rows.append(flatten(frame.loc[recent, columns]))
    test_arr = np.concatenate([test_rows], 1)
    print(test_arr.shape)

    return train_arr, tar_arr, test_arr

In [None]:
quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# Number of forecast steps per sample (the width of the target rows returned by load()).
# NOTE(review): presumably 2 days x 48 half-hour slots -- confirm against the data.
horizon = 96

# total_model[day_range][fold] is a flat list [model_q0, mask_q0, model_q1, mask_q1, ...].
# The prediction cell reads it as [2*i] / [2*i + 1], so this layout must be kept.
total_model = {}

for day_range in range(3, 8):
    train_arr, tar_arr, test_arr = load(day_range=day_range)

    #################### data preprocessing
    # Repeat every sample once per forecast step and append the step index as the last
    # feature, so a single model predicts all steps. np.repeat/np.tile build the same
    # rows as a per-sample Python loop, without the interpreter overhead.
    step_col = np.arange(horizon)
    train_arr = np.hstack([np.repeat(train_arr, horizon, axis=0),
                           np.tile(step_col, len(train_arr)).reshape(-1, 1)])
    # Row-major flattening lines each target value up with its repeated feature row.
    tar_arr = tar_arr.reshape(-1)
    test_arr = np.hstack([np.repeat(test_arr, horizon, axis=0),
                          np.tile(step_col, len(test_arr)).reshape(-1, 1)])

    print(train_arr.shape, tar_arr.shape, test_arr.shape)

    #################### nunique features > 1
    # Drop columns that are constant over the training rows; apply the same mask to test.
    not_unique_features = (pd.DataFrame(train_arr).nunique()>1).values
    train_arr = train_arr[:, not_unique_features]
    test_arr = test_arr[:, not_unique_features]

    print(train_arr.shape, tar_arr.shape, test_arr.shape)

    # Contiguous (unshuffled) folds. random_state must not be passed together with
    # shuffle=False: scikit-learn >= 0.24 raises ValueError, older versions ignored it.
    kf = KFold(n_splits=5, shuffle=False)
    splits = [[trn_idx, val_idx] for trn_idx, val_idx in kf.split(train_arr)]
    #################### modeling
    model_dict = {}
    for fold, (trn_idx, val_idx) in enumerate(splits):
        models = []
        tr = train_arr[trn_idx]
        val = train_arr[val_idx]

        tr_target = tar_arr[trn_idx]
        val_target = tar_arr[val_idx]

        for q in quantiles:
            print(f'day_range:{day_range}, fold:{fold}, q:{q}')

            lgbm = LGBMRegressor(objective='quantile',
                                 alpha=q,
                                 n_estimators=20000,
                                 max_depth=8,
                                 learning_rate=0.1,
                                 subsample=0.5,
                                 reg_alpha=0.01,
                                 reg_lambda=0.01,
                                 random_state=0,
                                 n_jobs=8)

            # First pass on all features, only to rank them.
            # NOTE(review): the early_stopping_rounds / verbose fit arguments were removed
            # in LightGBM 4.0; switch to callbacks if the environment is upgraded.
            lgbm.fit(tr, tr_target, eval_metric=['quantile'],
                     eval_set=[(tr, tr_target), (val, val_target)],
                     early_stopping_rounds=200, verbose=5000)

            # Keep the features whose importance in the first pass is above 1.
            importance_features = lgbm.feature_importances_ > 1
            print(int(importance_features.sum()))

            # Slice once and reuse, instead of materialising the same column subset
            # several times for fitting and evaluation.
            tr_sel = tr[:, importance_features]
            val_sel = val[:, importance_features]
            # Second pass refits the same estimator on the selected features only.
            lgbm.fit(tr_sel, tr_target, eval_metric=['quantile'],
                     eval_set=[(tr_sel, tr_target), (val_sel, val_target)],
                     early_stopping_rounds=200, verbose=1000)

            models += [lgbm, importance_features]
            print('\n')

        model_dict[fold] = models
    total_model[day_range] = model_dict

In [None]:
# Persist the nested {day_range: {fold: [model, mask, ...]}} dictionary.
# The target directory is created first so a fresh checkout does not fail on the write.
os.makedirs('../submit/model', exist_ok=True)
np.save('../submit/model/lgb_models.npy', total_model)

In [None]:
# Restore the saved model dictionary; .item() unwraps the object that np.load returns.
# allow_pickle=True unpickles the file content, so only load files written by this notebook.
total_model = np.load(
    file='../submit/model/lgb_models.npy',
    allow_pickle=True,
).item()

In [None]:
# Not used below; kept so any other cell that references the name still finds it.
day_quantile_preds = {}
# s_day_quantile_preds[day_range] -> array (test rows, quantiles), averaged over folds.
s_day_quantile_preds = {}

for day_range in range(3, 8):
    train_arr, tar_arr, test_arr = load(day_range=day_range)

    #################### data preprocessing
    # Same expansion as in training: repeat every sample once per forecast step and
    # append the step index as the last feature.
    step_col = np.arange(96)
    train_arr = np.hstack([np.repeat(train_arr, 96, axis=0),
                           np.tile(step_col, len(train_arr)).reshape(-1, 1)])
    tar_arr = tar_arr.reshape(-1)
    test_arr = np.hstack([np.repeat(test_arr, 96, axis=0),
                          np.tile(step_col, len(test_arr)).reshape(-1, 1)])

    print(train_arr.shape, tar_arr.shape, test_arr.shape)

    #################### nunique features > 1
    # Recompute the constant-column mask from the training rows so the test columns
    # line up with the feature masks stored next to each model.
    not_unique_features = (pd.DataFrame(train_arr).nunique()>1).values
    train_arr = train_arr[:, not_unique_features]
    test_arr = test_arr[:, not_unique_features]
    print(train_arr.shape, tar_arr.shape, test_arr.shape)

    # total_model[day_range] maps fold -> [model_q0, mask_q0, model_q1, mask_q1, ...].
    # Fold and quantile counts are taken from the stored models, so this cell also works
    # when the models were loaded from disk and the training cell was not run.
    fold_models = total_model[day_range]
    n_folds = len(fold_models)
    n_quantiles = len(next(iter(fold_models.values()))) // 2

    s_fold_quantile_preds = np.zeros([len(test_arr), n_quantiles])
    for fold in sorted(fold_models):
        stored = fold_models[fold]
        print(f'day_range:{day_range}, fold:{fold}')
        # Even positions hold the fitted models, odd positions their feature masks.
        s_quantile_preds = [
            model.predict(test_arr[:, importance_features])
            for model, importance_features in zip(stored[0::2], stored[1::2])
        ]
        print('\n')
        # One column per quantile; each fold contributes an equal share of the average.
        s_fold_quantile_preds += np.column_stack(s_quantile_preds) / n_folds
    s_day_quantile_preds[day_range] = s_fold_quantile_preds

In [None]:
# Average the predictions of the five look-back windows (3 to 7 days), round to two
# decimals and write them into every column after the first one.
submission.iloc[:, 1:] = np.mean(
    [s_day_quantile_preds[window] for window in range(3, 8)],
    axis=0,
).round(2)
# Overwrite the rows selected in zero_idx with 0 for all columns from 'q_0.1' onwards.
submission.loc[zero_idx, 'q_0.1':] = 0

In [None]:
# Write the finished submission table to disk without the index column.
submission.to_csv(path_or_buf='../submit/lgb_model.csv', index=False)