In [None]:
import os
import random

import pandas as pd
import numpy as np

import datetime

import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor, Pool

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error

import eli5 
from eli5.sklearn import PermutationImportance

                                               
import warnings

def set_seed(seed=0):
    """Seed Python's ``random`` module and NumPy's global RNG, and export PYTHONHASHSEED.

    seed: value handed to every seeding call (stored as a string in the environment).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Silence every warning for the rest of the notebook.
warnings.filterwarnings('ignore')

In [None]:
def target_process(data, num=None):
    """Clean the ``use_electric`` series of a single building.

    Hand-picked outliers are blanked and linearly interpolated, and a few
    abnormal date windows are replaced by a weighted blend of the same-length
    windows before and after them.

    Parameters
    ----------
    data : DataFrame holding one building's rows. The hard-coded row labels
        below are looked up with ``.loc``, so the frame is expected to carry a
        fresh 0..n-1 index (callers use ``reset_index(drop=True)``). Needs the
        columns ``date`` and ``use_electric``.
    num : building id selecting the rule set. When omitted it is read from the
        first value of ``data['num']`` instead of relying on a global ``num``
        left behind by the calling loop.

    Returns
    -------
    A cleaned copy of ``data``; the input frame is not modified. For buildings
    1, 24 and 53 leading rows are dropped and the index is reset.
    """
    if num is None:
        num = int(data['num'].iloc[0])

    st = data.copy()
    
    if num==1:
        st.loc[204, 'use_electric'] = np.nan
        st.loc[1033, 'use_electric'] = np.nan
        # NOTE(review): this interpolates and rounds the WHOLE frame, so every
        # numeric column (not only use_electric) is rounded to 3 decimals.
        st = st.interpolate(limit_direction='both', method='linear').round(3)
        st = st[st['date']>='2020-06-05'].reset_index(drop=True)
    elif num==3:
        st.loc[[538, 945, 946, 947, 1055, 1523, 1524, 1525, 1526, 1535, 1634, 1635, 1636, 1637, 1639, 1640, 1641, 1642, 1643, 1718, 1761, 1762, 1790], 'use_electric'] = np.nan
        st = st.interpolate(limit_direction='both', method='linear').round(3)
        st.loc[(st['date']>='2020-07-15') & (st['date']<'2020-07-20'), 'use_electric'] = (st.loc[(st['date']>='2020-07-08') & (st['date']<'2020-07-13'), 'use_electric'].values*.7 +
                                                                                            st.loc[(st['date']>='2020-08-05') & (st['date']<'2020-08-10'), 'use_electric'].values*.3)
        st.loc[[1055], 'use_electric'] = np.nan
        st = st.interpolate(limit_direction='both', method='linear').round(3)
        # The following windows are rebuilt in sequence; later blends read the
        # values written by earlier ones, so the order matters.
        st.loc[(st['date']>='2020-08-03') & (st['date']<'2020-08-05'), 'use_electric'] = (st.loc[(st['date']>='2020-07-13') & (st['date']<'2020-07-15'), 'use_electric'].values*.2 + 
                                                                                            st.loc[(st['date']>='2020-08-10') & (st['date']<'2020-08-12'), 'use_electric'].values*.8)

        st.loc[(st['date']>='2020-07-20') & (st['date']<'2020-07-27'), 'use_electric'] = (st.loc[(st['date']>='2020-07-13') & (st['date']<'2020-07-20'), 'use_electric'].values*.6 +
                                                                                            st.loc[(st['date']>='2020-08-03') & (st['date']<'2020-08-10'), 'use_electric'].values*.4)

        st.loc[(st['date']>='2020-07-27') & (st['date']<'2020-08-03'), 'use_electric'] = (st.loc[(st['date']>='2020-07-20') & (st['date']<'2020-07-27'), 'use_electric'].values*.2 + 
                                                                                            st.loc[(st['date']>='2020-08-03') & (st['date']<'2020-08-10'), 'use_electric'].values*.8)
    elif num==9:
        st.loc[[82, 1427], 'use_electric'] = np.nan
        # NOTE(review): the right-hand side stops at 1096 while the target runs
        # to 1097. The assigned Series is aligned on the index, so row 1097
        # appears to end up NaN and is filled by the final interpolation.
        # Behaviour kept as-is; confirm whether 1096 or 1097 was intended.
        st.loc[1087:1097, 'use_electric'] = st.loc[1087:1096, 'use_electric'] + 200
        st.loc[1210:1213, 'use_electric'] = st.loc[1210:1213, 'use_electric'] + 200
        st.loc[(st['date']>='2020-06-03 06') & (st['date']<'2020-06-03 17'), 'use_electric'] = np.nan
    elif num==10:
        st.loc[1462, 'use_electric'] = np.nan
        st.loc[1893:1894, 'use_electric'] = np.nan
        st = st.interpolate(limit_direction='both', method='linear').round(3)
        # Daytime hours (strictly between 08:00 and 21:00) of two days are
        # replaced by a 30/70 blend of the previous and the next day.
        st.loc[(st['date']>'2020-07-27 08') & (st['date']<'2020-07-27 21'), 'use_electric'] = (st.loc[(st['date']>'2020-07-26 08') & (st['date']<'2020-07-26 21'), 'use_electric'].values*0.3 + 
                                                                                                st.loc[(st['date']>'2020-07-28 08') & (st['date']<'2020-07-28 21'), 'use_electric'].values*0.7)
        st.loc[(st['date']>'2020-08-10 08') & (st['date']<'2020-08-10 21'), 'use_electric'] = (st.loc[(st['date']>'2020-08-09 08') & (st['date']<'2020-08-09 21'), 'use_electric'].values*0.3 + 
                                                                                                st.loc[(st['date']>'2020-08-11 08') & (st['date']<'2020-08-11 21'), 'use_electric'].values*0.7)
    elif num==14:
        st.loc[494, 'use_electric'] = np.nan
    elif num==15:
        st.loc[1760, 'use_electric'] = np.nan
    elif num==16:
        st.loc[[634, 1883], 'use_electric'] = np.nan
    elif num==24:
        # Drop the first 72 rows of this building.
        st = st.loc[72:].reset_index(drop=True)
    elif num==25:
        st.loc[993:995, 'use_electric'] = np.nan
        st.loc[(st['date']>='2020-07-27') & (st['date']<'2020-08-01'), 'use_electric'] = (st.loc[(st['date']>='2020-07-20') & (st['date']<'2020-07-25'), 'use_electric'].values*.4 +
                                                                                                                st.loc[(st['date']>='2020-08-03') & (st['date']<'2020-08-08'), 'use_electric'].values*.6)
    elif num==27:    
        st.loc[1644:1648, 'use_electric'] = np.nan
    elif (num==31) | (num==33):
        st.loc[257, 'use_electric'] = np.nan
    elif num==36:
        st.loc[438:439, 'use_electric'] = np.nan
        st.loc[1733:1736, 'use_electric'] = np.nan
    elif num==40:
        st.loc[(st['date']>='2020-08-03') & (st['date']<'2020-08-05'), 'use_electric'] = (st.loc[(st['date']>='2020-07-27') & (st['date']<'2020-07-29'), 'use_electric'].values*.4 +
                                                                                                                st.loc[(st['date']>='2020-08-10') & (st['date']<'2020-08-12'), 'use_electric'].values*.6)
    elif num==42:
        st.loc[(st['date']>='2020-07-13') & (st['date']<'2020-07-14'), 'use_electric'] = (st.loc[(st['date']>='2020-07-06') & (st['date']<'2020-07-07'), 'use_electric'].values*.4 +
                                                                                                                st.loc[(st['date']>='2020-07-20') & (st['date']<'2020-07-21'), 'use_electric'].values*.6)
        st.loc[(st['date']>='2020-08-10') & (st['date']<'2020-08-11'), 'use_electric'] = (st.loc[(st['date']>='2020-08-03') & (st['date']<'2020-08-04'), 'use_electric'].values*.4 +
                                                                                                                st.loc[(st['date']>='2020-08-17') & (st['date']<'2020-08-18'), 'use_electric'].values*.6)
    elif num==45:
        st.loc[817, 'use_electric'] = np.nan
        st['use_electric'] = st['use_electric'].interpolate(limit_direction='both', method='linear').round(3)
        st.loc[(st['date']>='2020-08-15') & (st['date']<'2020-08-16'), 'use_electric'] = (st.loc[(st['date']>='2020-08-08') & (st['date']<'2020-08-09'), 'use_electric'].values*0.4 +
                                                                                                            st.loc[(st['date']>='2020-08-22') & (st['date']<'2020-08-23'), 'use_electric'].values*0.6)
    elif num==52:
        st.loc[258, 'use_electric'] = np.nan
    elif num==53:
        # Drop the first 187 rows of this building.
        st = st.loc[187:].reset_index(drop=True)
    elif num==55:
        st.loc[1643, 'use_electric'] = np.nan
        st.loc[1648:1649, 'use_electric'] = np.nan
        st = st.interpolate(limit_direction='both', method='linear').round(3)
        st.loc[(st['date']>'2020-8-3 00') & (st['date']<='2020-8-7 23'), 'use_electric'] = st.loc[(st['date']>'2020-7-27 00') & (st['date']<='2020-7-31 23'), 'use_electric'].values*.3 + st.loc[(st['date']>'2020-8-10 00') & (st['date']<='2020-8-14 23'), 'use_electric'].values*.7
    elif num==56:
        st.loc[(st['date']>'2020-8-3 00') & (st['date']<='2020-8-7 23'), 'use_electric'] = st.loc[(st['date']>'2020-7-27 00') & (st['date']<='2020-7-31 23'), 'use_electric'].values*.3 + st.loc[(st['date']>'2020-8-10 00') & (st['date']<='2020-8-14 23'), 'use_electric'].values*.7
    elif num==59:
        st.loc[[1816, 1819, 1833, 1834], 'use_electric'] = np.nan
    elif num==60:
        st.loc[384:387, 'use_electric'] = np.nan
        st.loc[605:606, 'use_electric'] = np.nan
        st.loc[720:721, 'use_electric'] = np.nan
        st.loc[792, 'use_electric'] = np.nan

    # Fill every value blanked above (and any NaN already present) for all buildings.
    st['use_electric'] = st['use_electric'].interpolate(limit_direction='both', method='linear').round(3)
        
    # For the listed buildings, 2020-08-17 is replaced by the average of the
    # same weekday one week earlier and one week later.
    if num in [2,3,6,7,8,9,13,16,17,18,22,23,24,25,26,27,31,33,34,35,37,43,44,46,47,48,52,53,54,55,56,57,58]:
        st.loc[(st['date']>='2020-08-17 00') & (st['date']<'2020-08-18 00'), 'use_electric'] = (st.loc[(st['date']>='2020-08-10 00') & (st['date']<'2020-08-11 00'), 'use_electric'].values*.5 + st.loc[(st['date']>='2020-08-24 00') & (st['date']<'2020-08-25 00'), 'use_electric'].values*.5).round(3)
    
    return st

In [None]:
def mae(preds, target):
    """Mean absolute error of ``preds`` against ``target`` (sklearn's argument order is swapped here)."""
    error = mean_absolute_error(target, preds)
    return error
def mse(preds, target):
    """Mean squared error of ``preds`` against ``target`` (sklearn's argument order is swapped here)."""
    error = mean_squared_error(target, preds)
    return error
def smape(preds, target):
    '''
    Symmetric mean absolute percentage error, expressed in percent.

    Pairs where both prediction and target are exactly 0 are left out of the
    sum (their ratio would be 0/0) but still count in the divisor, which is
    the length of ``preds`` before filtering.
    '''
    total = len(preds)
    keep = ~((preds==0)&(target==0))
    kept_preds = preds[keep]
    kept_target = target[keep]
    ratios = np.abs(kept_preds-kept_target) / (np.abs(kept_preds)+np.abs(kept_target))
    return (200*np.sum(ratios))/total

def lgbm_smape(preds, train_data):
    '''
    Custom SMAPE evaluation function for LGBM.

    Both arguments are treated as plain arrays and passed straight to smape.
    NOTE(review): the LightGBM scikit-learn API documents the call order as
    (y_true, y_pred), so the parameter names here are probably swapped;
    SMAPE is symmetric, so the score does not depend on it.

    Returns (metric name, value, is_higher_better=False).
    '''
    score = smape(preds, train_data)
    return 'SMAPE', score, False

def lgbm_smape_exp(preds, train_data):
    '''
    Custom SMAPE evaluation function for LGBM when the target is log1p-transformed.

    Both arrays are mapped back with expm1 before the score is computed.
    NOTE(review): the LightGBM scikit-learn API documents the call order as
    (y_true, y_pred), so the parameter names here are probably swapped;
    SMAPE is symmetric, so the score does not depend on it.

    Returns (metric name, value, is_higher_better=False).
    '''
    score = smape(np.expm1(preds), np.expm1(train_data))
    return 'SMAPE', score, False

# function for feature engineering
def CDH(xs):
    """Rolling sum of (x - 26) over the current and up to 11 preceding entries.

    xs: array supporting slicing and scalar subtraction (a NumPy array in this notebook).
    Returns a list with one value per entry; the first 11 windows are shorter
    because fewer than 12 entries are available.
    """
    return [np.sum(xs[max(pos-11, 0):(pos+1)]-26) for pos in range(len(xs))]

def lgb_models(df_train, df_test, drop_col, target, category_features=None, learn='lasso', v=5000, permu=False, exp=True, gpu=False):
    """Fit one gradient-boosting model per fold and return OOF / test predictions.

    Parameters
    ----------
    df_train : training frame; must contain ``target`` and a ``fold_num``
        column and carry a 0..n-1 index (index labels are also used to index
        NumPy arrays positionally).
    df_test : test frame; its columns minus ``drop_col`` define the features.
    drop_col : column names excluded from the feature list.
    target : name of the target column in ``df_train``.
    category_features : columns passed to LightGBM as categorical (ignored
        for XGBoost). ``None`` means no explicit categorical columns.
    learn : ``'lgb'`` or ``'xgb'``. Any other value, including the default
        ``'lasso'``, is rejected up front.
    v : verbosity passed to ``fit``; a value > 0 also prints a line per fold.
    permu, gpu : accepted for compatibility, currently unused.
    exp : if True the target is assumed to be log1p-transformed and the
        printed metrics are computed after ``expm1``.

    Returns
    -------
    (oof, pred, feature_importances): out-of-fold predictions aligned with
    ``df_train``, fold-averaged test predictions, and a DataFrame of
    fold-averaged feature importances sorted in descending order.

    Raises
    ------
    ValueError if ``learn`` is not a supported model name.
    """
    # Previously an unsupported name only failed later with an unbound `reg`.
    if learn not in ('lgb', 'xgb'):
        raise ValueError("learn must be 'lgb' or 'xgb', got %r" % (learn,))
    category_features = [] if category_features is None else list(category_features)

    train = df_train.copy()
    test = df_test.copy()
    TARGET = target
    FOLDS = train['fold_num'].nunique()
    RANDOM_STATE = 0

    oof = np.zeros([len(train)])
    pred = np.zeros([len(test)])

    features = [col for col in test.columns if col not in drop_col]
    feature_importances = np.zeros(len(features))
    print('use_feature : ', features)

    # These do not change between folds, so build them once.
    X_train = train[features]
    X_test = test[features]
    y_train = train[TARGET].values

    for idx in np.unique(train['fold_num']):
        trn_idx = train[train['fold_num']!=idx].index
        val_idx = train[train['fold_num']==idx].index

        tt = (X_train.loc[trn_idx], y_train[trn_idx])
        vv = (X_train.loc[val_idx], y_train[val_idx])

        # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords
        # depend on the installed LightGBM / XGBoost version - confirm before upgrading.
        if learn=='lgb':
            reg = LGBMRegressor(
                                boosting_type='gbdt', #['gbdt', 'dart', 'goss']
                                objective='regression', 
                                # metrics='mse', 
                                n_estimators=20000,
                                max_depth=8,
                                learning_rate=0.03,
                                colsample_bytree=0.9,
                                subsample=0.7,
                                num_leaves=256,
                                reg_alpha=0.01,
                                reg_lambda=0.01,
                                n_jobs=-1,
                                random_state=RANDOM_STATE,
                                )
            eval_metric = lgbm_smape_exp if exp else lgbm_smape
            reg.fit(tt[0], tt[1], eval_set=[tt, vv], eval_metric=eval_metric, early_stopping_rounds=500, verbose=v, categorical_feature=category_features, )
        else:  # learn == 'xgb'
            reg = XGBRegressor(
                                objective='reg:squarederror', 
                                # metrics='mse', 
                                n_estimators=20000,
                                max_depth=8,
                                learning_rate=0.03,
                                colsample_bytree=0.9,
                                subsample=0.7,
                                reg_alpha=0.01,
                                reg_lambda=0.01,
                                n_jobs=-1,
                                random_state=RANDOM_STATE,
                                )
            reg.fit(tt[0], tt[1], eval_set=[tt, vv], early_stopping_rounds=500, verbose=v)

        # Accumulate for both model types (XGBoost used to be left at zero).
        feature_importances += reg.feature_importances_ / FOLDS

        oof[val_idx] = reg.predict(X_train.loc[val_idx])
        pred += reg.predict(X_test) / FOLDS

        if v>0: print(idx+1, 'fold complete ################################\n')
    
    if exp:
        print('mae : ', mae(np.expm1(oof), np.expm1(train[TARGET])), 'mse : ', mse(np.expm1(oof), np.expm1(train[TARGET])), 'smape :', smape(np.expm1(oof), np.expm1(train[TARGET])))
    else:
        print('mae : ', mae(oof, train[TARGET]), 'mse : ', mse(oof, train[TARGET]), 'smape :', smape(oof, train[TARGET]))
    feature_importances = pd.DataFrame({'feature':features, 'value':feature_importances}).sort_values('value', ascending=False).reset_index(drop=True)
    return oof, pred, feature_importances

In [None]:
# Read the raw competition files (cp949-encoded CSVs).
csv_options = dict(engine='python', encoding='cp949')
df_train = pd.read_csv('data/train.csv', **csv_options)
df_test = pd.read_csv('data/test.csv', **csv_options)
sub = pd.read_csv('data/sample_submission.csv', **csv_options)

# Replace the original headers with English names.
# Original train header: ['num', 'date_time', 'electricity usage (kWh)', 'temperature (°C)', 'wind speed (m/s)',
#                         'humidity (%)', 'precipitation (mm)', 'sunshine (hr)', 'non-electric cooling operation', 'has solar power']
train_columns = ['num', 'date', 'use_electric', 'temperature', 'wind_speed', 'humidity', 'precipitation', 'sunshine', 'operation', 'solar_power']
df_train.columns = train_columns
# The test file has the same columns without the target.
df_test.columns = [name for name in train_columns if name != 'use_electric']

In [None]:
train = df_train.copy()
test = df_test.copy()

# Look up 'operation' (non-electric cooling) and 'solar_power' per building in
# the training data and use them to fill the test set.
test_fill = test.copy()
sp_dict = train[['num', 'solar_power']].drop_duplicates().set_index('num').to_dict()['solar_power']
test['solar_power'] = test['num'].map(sp_dict)

op_dict = train[['num', 'operation']].drop_duplicates().set_index('num').to_dict()['operation']
# Fixed: 'operation' used to be filled from sp_dict (the solar_power lookup).
test['operation'] = test['num'].map(op_dict)

# Interpolate missing weather values in the test set separately per building
# so values never leak across building boundaries.
for num in range(1, 61):
    for col in ['temperature', 'wind_speed', 'humidity', 'precipitation', 'sunshine']:
        test.loc[test['num']==num, col] = test.loc[test['num']==num, col].interpolate().round(1)
        
# Add calendar, weather-index and usage-pattern features to both frames in place.
for df in [train, test]:
    df['date'] = pd.to_datetime(df['date'])
    df['date2'] = df['date'].dt.date.astype(str)
    # month: 0 = June, 1 = July, 2 = August
    df['month'] = df['date'].dt.month - 6
    df['week'] = (df['date'].dt.isocalendar()['week'] - 23).astype(int)
    # Running day counter since June.
    # NOTE(review): July is offset by 1*31 although June has 30 days, so the
    # counter jumps from 30 (Jun 30) to 32 (Jul 1) - confirm this gap is intended.
    df['day'] = df['month']*df['month'].map({0:30, 1:31, 2:31}) + df['date'].dt.day
    df['hour'] = df['date'].dt.hour
    df['weekday'] = df['date'].dt.weekday
    df['weekend'] = (df['weekday']>=5).astype(int)
    # weekend2: 0 = Mon-Fri, 1 = Saturday, 2 = Sunday
    df['weekend2'] = df['weekday'].map({0:0, 1:0, 2:0, 3:0, 4:0, 5:1, 6:2})
    df['holiday'] = df['date2'].isin(['2020-06-06', '2020-08-15', '2020-08-17']).astype(int)
    df['weekend_holiday'] = ((df['weekend'] + df['holiday'])>0).astype(int)
#     df.loc[df['holiday']==1, 'weekend2'] = 2
    
    ############## temperature in Fahrenheit
    df['temperature_F'] = (df['temperature'] * 9/5) + 32 

    ############## apparent (wind-chill) temperature, https://www.weather.go.kr/plus/life/li_asset/HELP/basic/help_01_07.jsp
    # wind_speed is multiplied by 3.6 (m/s -> km/h) before entering the formula
    df['temperature2'] = 13.12 + 0.6215*df['temperature'] - 11.37*(df['wind_speed']*3.6)**0.16 + 0.3965*(df['wind_speed']*3.6)**0.16*df['temperature']
#     df['diff_temp'] = df['temperature2'] - df['temperature']

    ############## heat index, https://www.weather.go.kr/weather/lifenindustry/li_asset/HELP/basic/help_01_04.jsp
    T = df['temperature_F']
    RH = df['humidity']
    df['heat_index'] = -42.379 + 2.04901523*T + 10.14333127*RH - .22475541*T*RH - .00683783*T*T - .05481717*RH*RH + .00122874*T*T*RH + .00085282*T*RH*RH - .00000199*T*T*RH*RH
    # Convert the Fahrenheit result to Celsius, then recode it in place into
    # levels 0-4. The codes themselves are all < 32, so running the '<32'
    # rule first keeps already-recoded rows from matching a later rule.
    df['heat_index'] = (df['heat_index']-32) * 5/9
    df.loc[df['heat_index']<32, 'heat_index'] = 0
    df.loc[(df['heat_index']>=32) & (df['heat_index']<41), 'heat_index'] = 1
    df.loc[(df['heat_index']>=41) & (df['heat_index']<54), 'heat_index'] = 2
    df.loc[(df['heat_index']>=54) & (df['heat_index']<66), 'heat_index'] = 3
    df.loc[df['heat_index']>=66, 'heat_index'] = 4
    
    ############## working hour
    df['work_hour'] = ((df['hour']>=8) & (df['hour']<=19)).astype(int)
    # lunch_hour applies to Mon-Fri, lunch_hour2 to Sat/Sun
    df['lunch_hour'] = ((df['hour']>=11) & (df['hour']<=13) & (df['weekday']<=4)).astype(int)
    df['lunch_hour2'] = ((df['hour']>=12) & (df['hour']<=14) & (df['weekday']>4)).astype(int)
    
    df['dinner_hour'] = ((df['hour']>=17) & (df['hour']<=22)).astype(int)
    # dinner_hour2: from 18:00 on Friday and Saturday
    df['dinner_hour2'] = ((df['hour']>=18) & (df['weekday']>=4) & (df['weekday']<=5)).astype(int)
    
#     df['religion'] = ((df['hour']>=9) & (df['weekday']<13) & (df['weekday']==6)).astype(int)

    ############## discomfort index (THI), recoded in place into levels 0-3
    df['THI'] = 9/5*df['temperature'] - 0.55*(1-df['humidity']/100)*(9/5*df['temperature']-26)+32
    df.loc[df['THI']<68, 'THI'] = 0
    df.loc[(df['THI']>=68) & (df['THI']<75), 'THI'] = 1
    df.loc[(df['THI']>=75) & (df['THI']<80), 'THI'] = 2
    df.loc[(df['THI']>=80), 'THI'] = 3
    
    ############## CDH
    # Computed per building and concatenated in building order 1..60, which
    # assumes the frame's rows are already grouped by building in that order
    # - TODO confirm.
    cdhs = []
    for num in range(1, 61):
        temp = df[df['num'] == num]
        cdh = CDH(temp['temperature'].values)
        cdhs += cdh
    else:
        # The loop has no break, so this else branch always runs after it.
        df['CDH'] = cdhs

# Cap sunshine in the test set at 1.0.
test['sunshine'] = np.where(test['sunshine']>1.0, 1.0, test['sunshine'])

In [None]:
# Clean the target building by building, then stitch the pieces together.
# The pieces are collected in a list and concatenated once, instead of growing
# a DataFrame with pd.concat inside the loop (quadratic copying, and it
# started from an empty frame).
cleaned_parts = []
for num in range(1, 61):  # `num` stays a module-level loop variable: target_process may read it
    st = train[train['num']==num].reset_index(drop=True)
    st = target_process(st)
    cleaned_parts.append(st)
df = pd.concat(cleaned_parts)
train = df.reset_index(drop=True)

In [None]:
# subset models
# The three training passes (all months / from July / August only) used to be
# three copy-pasted loops; they now share one helper.
SUBSET_DROP_COLS = ['use_electric', 'num', 'date', 'date2', 'wind_speed', 'precipitation', 'sunshine', 'operation', 'solar_power', 'week', 'weekend', 'weekend_holiday', 'temperature_F', 'temperature2', 'holiday', 'fold_num', 'cluster', 'oof']


def run_subset_models(train, test, start_date=None, extra_drop_cols=(), building_ids=range(1, 61)):
    """Train one LightGBM and one XGBoost model per building.

    Parameters
    ----------
    train, test : prepared frames with one block of rows per building.
    start_date : if given, only training rows with ``date >= start_date`` are
        used and ``day`` is shifted so the first kept training day is 0 (the
        same shift is applied to the test rows).
    extra_drop_cols : additional columns to exclude from the features.
    building_ids : building numbers to iterate over.

    Returns
    -------
    dict with keys 'oof_lgb', 'pred_lgb', 'fea_imp_lgb', 'oof_xgb',
    'pred_xgb', 'fea_imp_xgb'; each value is a dict keyed by building id.
    Predictions are in log1p space.
    """
    drop_cols = list(extra_drop_cols) + SUBSET_DROP_COLS
    # A dropped column cannot be declared categorical.
    category_features = [c for c in ['month', 'THI', 'weekend2'] if c not in drop_cols]
    new_col = 'mean_use_electric'
    results = {key: {} for key in ('oof_lgb', 'pred_lgb', 'fea_imp_lgb', 'oof_xgb', 'pred_xgb', 'fea_imp_xgb')}

    for num in building_ids:
        st = train[train['num']==num].reset_index(drop=True)
        sub_test = test[test['num']==num].reset_index(drop=True)

        # Mean usage per (weekend, hour) group. It is computed on ALL of the
        # building's training rows, i.e. before the date filter and including
        # rows that later serve as validation folds.
        st[new_col] = st['weekend'].astype(str) + '_' + st['hour'].astype(str)
        sub_test[new_col] = sub_test['weekend'].astype(str) + '_' + sub_test['hour'].astype(str)
        group_mean = st.groupby(new_col)['use_electric'].mean()
        sub_test[new_col] = sub_test[new_col].map(group_mean)
        st[new_col] = st[new_col].map(group_mean)

        if start_date is not None:
            st = st[st['date']>=start_date].reset_index(drop=True)
            first_day = st['day'].min()
            sub_test['day'] = sub_test['day'] - first_day
            st['day'] = st['day'] - first_day

        st['use_electric'] = np.log1p(st['use_electric'])
        st['fold_num'] = (st['day']//7)  # one fold per 7-day bucket of the day counter

        print('######################################################################## Learning', num)
        results['oof_lgb'][num], results['pred_lgb'][num], results['fea_imp_lgb'][num] = lgb_models(
            st, sub_test, drop_col=drop_cols,
            target='use_electric',
            category_features=category_features,
            learn='lgb', v=0, permu=False, gpu=False)

        results['oof_xgb'][num], results['pred_xgb'][num], results['fea_imp_xgb'][num] = lgb_models(
            st, sub_test, drop_col=drop_cols,
            target='use_electric',
            learn='xgb', v=0, permu=False, gpu=False)

        print('\n')

    return results


# June + July + August
results_678 = run_subset_models(train, test)
oof_lgb678 = results_678['oof_lgb']
oof_xgb678 = results_678['oof_xgb']
pred_lgb678 = results_678['pred_lgb']
pred_xgb678 = results_678['pred_xgb']

# July + August
results_78 = run_subset_models(train, test, start_date='2020-07-01')
oof_lgb78 = results_78['oof_lgb']
oof_xgb78 = results_78['oof_xgb']
pred_lgb78 = results_78['pred_lgb']
pred_xgb78 = results_78['pred_xgb']

# August only; 'month' is constant there, so it is dropped from the features
results_8 = run_subset_models(train, test, start_date='2020-08-01', extra_drop_cols=['month'])
oof_lgb8 = results_8['oof_lgb']
oof_xgb8 = results_8['oof_xgb']
pred_lgb8 = results_8['pred_lgb']
pred_xgb8 = results_8['pred_xgb']

# Keep the working names bound as before: they hold the results of the last pass.
oof_lgb = oof_lgb8.copy()
pred_lgb = pred_lgb8.copy()
fea_imp_lgb = results_8['fea_imp_lgb']
oof_xgb = oof_xgb8.copy()
pred_xgb = pred_xgb8.copy()
fea_imp_xgb = results_8['fea_imp_xgb']

In [None]:
# June, July and August: OOF SMAPE for LightGBM, XGBoost and the 70/30 blend
actual_678 = train['use_electric']
stacked_lgb678 = np.concatenate([oof_lgb678[b] for b in range(1, 61)])
stacked_xgb678 = np.concatenate([oof_xgb678[b] for b in range(1, 61)])
print(
    smape(np.expm1(stacked_lgb678), actual_678),
    smape(np.expm1(stacked_xgb678), actual_678),
    smape(np.expm1(stacked_lgb678*.7 + stacked_xgb678*.3), actual_678))

In [None]:
# July and August: OOF SMAPE for LightGBM, XGBoost and the 70/30 blend
actual_78 = train.loc[train['date']>='2020-07-01', 'use_electric'].values
stacked_lgb78 = np.concatenate([oof_lgb78[b] for b in range(1, 61)])
stacked_xgb78 = np.concatenate([oof_xgb78[b] for b in range(1, 61)])
print(
    smape(np.expm1(stacked_lgb78), actual_78),
    smape(np.expm1(stacked_xgb78), actual_78),
    smape(np.expm1(stacked_lgb78*.7 + stacked_xgb78*.3), actual_78))

In [None]:
# August: OOF SMAPE for LightGBM, XGBoost and the 70/30 blend
actual_8 = train.loc[train['date']>='2020-08-01', 'use_electric'].values
stacked_lgb8 = np.concatenate([oof_lgb8[b] for b in range(1, 61)])
stacked_xgb8 = np.concatenate([oof_xgb8[b] for b in range(1, 61)])
print(
    smape(np.expm1(stacked_lgb8), actual_8),
    smape(np.expm1(stacked_xgb8), actual_8),
    smape(np.expm1(stacked_lgb8*.7 + stacked_xgb8*.3), actual_8))

In [None]:
# Blend the test predictions: 70/30 LightGBM/XGBoost within each training
# window, then 45% / 45% / 10% across the three windows. Buildings are
# concatenated in dict insertion order.
# NOTE(review): the predictions appear to still be in log1p space here - confirm.
blend_678 = np.concatenate(list(pred_lgb678.values()))*.7 + np.concatenate(list(pred_xgb678.values()))*.3
blend_78 = np.concatenate(list(pred_lgb78.values()))*.7 + np.concatenate(list(pred_xgb78.values()))*.3
blend_8 = np.concatenate(list(pred_lgb8.values()))*.7 + np.concatenate(list(pred_xgb8.values()))*.3
subset_best = blend_678*.45 + blend_78*.45 + blend_8*.1

In [None]:
train = df_train.copy()
test = df_test.copy()

# Look up 'operation' (non-electric cooling) and 'solar_power' per building in
# the training data and use them to fill the test set.
test_fill = test.copy()
sp_dict = train[['num', 'solar_power']].drop_duplicates().set_index('num').to_dict()['solar_power']
test['solar_power'] = test['num'].map(sp_dict)

op_dict = train[['num', 'operation']].drop_duplicates().set_index('num').to_dict()['operation']
# Fixed: 'operation' used to be filled from sp_dict (the solar_power lookup).
test['operation'] = test['num'].map(op_dict)

# Interpolate missing weather values in the test set separately per building
# so values never leak across building boundaries.
for num in range(1, 61):
    for col in ['temperature', 'wind_speed', 'humidity', 'precipitation', 'sunshine']:
        test.loc[test['num']==num, col] = test.loc[test['num']==num, col].interpolate().round(1)
        
# Add calendar, weather-index and usage-pattern features to both frames in place.
for df in [train, test]:
    df['date'] = pd.to_datetime(df['date'])
    df['date2'] = df['date'].dt.date.astype(str)
    # month: 0 = June, 1 = July, 2 = August
    df['month'] = df['date'].dt.month - 6
    df['week'] = (df['date'].dt.isocalendar()['week'] - 23).astype(int)
    # Running day counter since June.
    # NOTE(review): July is offset by 1*31 although June has 30 days, so the
    # counter jumps from 30 (Jun 30) to 32 (Jul 1) - confirm this gap is intended.
    df['day'] = df['month']*df['month'].map({0:30, 1:31, 2:31}) + df['date'].dt.day
    df['hour'] = df['date'].dt.hour
    # hour2: 6-hour block of the day (0-3)
    df['hour2'] = df['date'].dt.hour//6
    df['weekday'] = df['date'].dt.weekday
    df['weekend'] = (df['weekday']>=5).astype(int)
    # weekend2: 0 = Mon-Fri, 1 = Saturday, 2 = Sunday
    df['weekend2'] = df['weekday'].map({0:0, 1:0, 2:0, 3:0, 4:0, 5:1, 6:2})
    df['holiday'] = df['date2'].isin(['2020-06-06', '2020-08-15', '2020-08-17']).astype(int)
    df['weekend_holiday'] = ((df['weekend'] + df['holiday'])>0).astype(int)
#     df.loc[df['holiday']==1, 'weekend2'] = 2
    
    ############## temperature in Fahrenheit
    df['temperature_F'] = (df['temperature'] * 9/5) + 32 

    ############## apparent (wind-chill) temperature, https://www.weather.go.kr/plus/life/li_asset/HELP/basic/help_01_07.jsp
    # wind_speed is multiplied by 3.6 (m/s -> km/h) before entering the formula
    df['temperature2'] = 13.12 + 0.6215*df['temperature'] - 11.37*(df['wind_speed']*3.6)**0.16 + 0.3965*(df['wind_speed']*3.6)**0.16*df['temperature']
#     df['diff_temp'] = df['temperature2'] - df['temperature']

    ############## heat index, https://www.weather.go.kr/weather/lifenindustry/li_asset/HELP/basic/help_01_04.jsp
    T = df['temperature_F']
    RH = df['humidity']
    df['heat_index'] = -42.379 + 2.04901523*T + 10.14333127*RH - .22475541*T*RH - .00683783*T*T - .05481717*RH*RH + .00122874*T*T*RH + .00085282*T*RH*RH - .00000199*T*T*RH*RH
    # Convert the Fahrenheit result to Celsius, then recode it in place into
    # levels 0-4. The codes themselves are all < 32, so running the '<32'
    # rule first keeps already-recoded rows from matching a later rule.
    df['heat_index'] = (df['heat_index']-32) * 5/9
    df.loc[df['heat_index']<32, 'heat_index'] = 0
    df.loc[(df['heat_index']>=32) & (df['heat_index']<41), 'heat_index'] = 1
    df.loc[(df['heat_index']>=41) & (df['heat_index']<54), 'heat_index'] = 2
    df.loc[(df['heat_index']>=54) & (df['heat_index']<66), 'heat_index'] = 3
    df.loc[df['heat_index']>=66, 'heat_index'] = 4
    
    ############## working hour
    df['work_hour'] = ((df['hour']>=8) & (df['hour']<=19)).astype(int)
    # lunch_hour applies to Mon-Fri, lunch_hour2 to Sat/Sun
    df['lunch_hour'] = ((df['hour']>=11) & (df['hour']<=13) & (df['weekday']<=4)).astype(int)
    df['lunch_hour2'] = ((df['hour']>=12) & (df['hour']<=14) & (df['weekday']>4)).astype(int)
    
    df['dinner_hour'] = ((df['hour']>=17) & (df['hour']<=22)).astype(int)
    # dinner_hour2: from 18:00 on Friday and Saturday
    df['dinner_hour2'] = ((df['hour']>=18) & (df['weekday']>=4) & (df['weekday']<=5)).astype(int)
    
#     df['religion'] = ((df['hour']>=9) & (df['weekday']<13) & (df['weekday']==6)).astype(int)

    ############## discomfort index (THI), recoded in place into levels 0-3
    df['THI'] = 9/5*df['temperature'] - 0.55*(1-df['humidity']/100)*(9/5*df['temperature']-26)+32
    df.loc[df['THI']<68, 'THI'] = 0
    df.loc[(df['THI']>=68) & (df['THI']<75), 'THI'] = 1
    df.loc[(df['THI']>=75) & (df['THI']<80), 'THI'] = 2
    df.loc[(df['THI']>=80), 'THI'] = 3
    
    ############## CDH
    # Computed per building and concatenated in building order 1..60, which
    # assumes the frame's rows are already grouped by building in that order
    # - TODO confirm.
    cdhs = []
    for num in range(1, 61):
        temp = df[df['num'] == num]
        cdh = CDH(temp['temperature'].values)
        cdhs += cdh
    else:
        # The loop has no break, so this else branch always runs after it.
        df['CDH'] = cdhs

# Cap sunshine in the test set at 1.0.
test['sunshine'] = np.where(test['sunshine']>1.0, 1.0, test['sunshine'])

# Cluster the buildings by the shape of their usage curve.
# One row per building, one column per timestamp.
pivot = pd.pivot_table(data=train, index='num', columns='date', values='use_electric').values
scaler = MinMaxScaler()
# Transposing before and after makes the scaler normalise each building's own series.
pivot = scaler.fit_transform(pivot.T).T

km = KMeans(n_clusters=5, random_state=42, max_iter=1000)
km.fit(pivot)
# Row i of the pivot is taken to be building i+1.
cluster_dict = dict(zip(range(1, len(km.labels_) + 1), km.labels_))

for frame in (train, test):
    frame['cluster'] = frame['num'].map(cluster_dict)

# Clean the target building by building, then stitch the pieces together.
# The pieces are collected in a list and concatenated once, instead of growing
# a DataFrame with pd.concat inside the loop (quadratic copying, and it
# started from an empty frame).
cleaned_parts = []
for num in range(1, 61):  # `num` stays a module-level loop variable: target_process may read it
    st = train[train['num']==num].reset_index(drop=True)
    st = target_process(st)
    cleaned_parts.append(st)
df = pd.concat(cleaned_parts)
train = df.reset_index(drop=True)

# Hand-assigned 'region' label per building; buildings not listed keep region 0
# (original note for region 0: [1,7,17,20,21,31,34,38,46,47,49,51,52,55,58]).
region_members = {
    1: [2,5,10,11,19,28,50],
    2: [12,35,42,48,60],
    3: [22,23,33,40,53],
    4: [44, 57],
    5: [4, 8, 9, 13, 14, 16, 26, 27, 29, 30, 36, 37, 39, 43, 56, 59],
}
for frame in (train, test):
    frame['region'] = 0
    for region_id, members in region_members.items():
        frame.loc[frame['num'].isin(members), 'region'] = region_id

In [None]:
def _hour_window(start=None, stop=None):
    """Return a mask builder selecting ``start <= hour < stop``.

    Either bound may be omitted (``None``) to leave that side open.
    """
    def build(hour):
        if start is None:
            return hour < stop
        if stop is None:
            return hour >= start
        return (hour >= start) & (hour < stop)
    return build


# Subset name -> function turning an ``hour`` Series into a boolean row mask.
# Names that are not listed here (e.g. 'all') mean "keep every row".
_HOUR_MASKS = {
    # two halves of the day
    'afternoon': lambda hour: hour > 12,
    'morning': _hour_window(stop=13),
    # 08-19 versus the rest of the day
    'working': _hour_window(8, 20),
    'noworking': lambda hour: (hour < 8) | (hour >= 20),
    # four blocks
    'q1': _hour_window(0, 7),
    'q2': _hour_window(7, 13),
    'q3': _hour_window(13, 19),
    'q4': _hour_window(19),
    # six blocks
    's1': _hour_window(0, 5),
    's2': _hour_window(5, 9),
    's3': _hour_window(9, 13),
    's4': _hour_window(13, 17),
    's5': _hour_window(17, 21),
    's6': _hour_window(21),
    # eight blocks
    'e1': _hour_window(0, 4),
    'e2': _hour_window(4, 7),
    'e3': _hour_window(7, 10),
    'e4': _hour_window(10, 13),
    'e5': _hour_window(13, 16),
    'e6': _hour_window(16, 19),
    'e7': _hour_window(19, 22),
    'e8': _hour_window(22),
}


def hour_split(st, sub_test, subset=None):
    """Restrict a train/test pair to the hours belonging to a named subset.

    Parameters
    ----------
    st, sub_test : DataFrame
        Frames with an ``hour`` column; the same filter is applied to both.
    subset : str, optional
        Name of the hour subset ('morning', 'q1', 'e8', ...). When omitted,
        the module-level variable ``subset`` is used, which is how the
        existing two-argument calls inside ``for subset in subsets`` loops work.

    Returns
    -------
    (st, sub_test)
        Filtered copies with a fresh 0..n-1 index. For a name without a
        registered filter (such as 'all') the inputs are returned unchanged.

    Raises
    ------
    NameError
        If ``subset`` is not passed and no module-level ``subset`` exists.
    """
    if subset is None:
        # Backward-compatible fallback to the notebook-global loop variable.
        try:
            subset = globals()['subset']
        except KeyError:
            raise NameError("name 'subset' is not defined") from None

    build_mask = _HOUR_MASKS.get(subset)
    if build_mask is None:
        return st, sub_test

    st = st[build_mask(st['hour'])].reset_index(drop=True)
    sub_test = sub_test[build_mask(sub_test['hour'])].reset_index(drop=True)
    return st, sub_test

In [None]:
# Result containers, all keyed by hour-subset name and filled by the
# dataset-building cell: per-building target scalers plus the prepared
# train/test frames for the global, per-cluster and per-region variants.
transform, transform_cluster, transform_region = {}, {}, {}
train2, train_cluster2, train_region2 = {}, {}, {}
test2, test_cluster2, test_region2 = {}, {}, {}

# Columns passed to lgb_models as `drop_col`.
drop_cols = [
    'use_electric', 'date', 'date2',
    'wind_speed', 'precipitation', 'sunshine', 'operation', 'solar_power',
    'week', 'weekend', 'weekend_holiday',
    'temperature_F', 'temperature2',
    'holiday', 'fold_num', 'hour2', 'cluster', 'oof_lgb', 'region',
]

# Hour subsets understood by hour_split: the full day, two halves,
# working / non-working hours, then 4-, 6- and 8-way splits of the day.
subsets = (
    ['all', 'afternoon', 'morning', 'working', 'noworking']
    + ['q' + str(i) for i in range(1, 5)]
    + ['s' + str(i) for i in range(1, 7)]
    + ['e' + str(i) for i in range(1, 9)]
)

In [None]:
def _prepare_building(num):
    """Build the modelling frames for a single building.

    Reads the module-level ``train`` / ``test`` frames and, through
    ``hour_split``, the module-level ``subset`` set by the enclosing loop.

    Returns
    -------
    (st, sub_test, scaler)
        ``st``       train rows of the building for the current subset, with
                     ``use_electric2`` (target min-max scaled per building),
                     ``mean_use_electric`` and ``fold_num`` added.
        ``sub_test`` matching test rows with ``mean_use_electric`` added.
        ``scaler``   the fitted MinMaxScaler, needed later to map predictions
                     back to the original scale.
    """
    st = train[train['num']==num].reset_index(drop=True)
    sub_test = test[test['num']==num].reset_index(drop=True)
    st, sub_test = hour_split(st, sub_test)

    scaler = MinMaxScaler()
    scaler.fit(st[['use_electric']])
    st['use_electric2'] = scaler.transform(st[['use_electric']])

    # Mean training consumption per (weekend, hour) key; the test rows get the
    # value learned from the training rows of the same building.
    new_col = 'mean_use_electric'
    st[new_col] = st['weekend'].astype(str) + '_' + st['hour'].astype(str)
    sub_test[new_col] = sub_test['weekend'].astype(str) + '_' + sub_test['hour'].astype(str)
    group_mean = st.groupby(new_col)['use_electric'].mean()
    sub_test[new_col] = sub_test[new_col].map(group_mean)
    st[new_col] = st[new_col].map(group_mean)

    st['fold_num'] = (st['day']//7)
    return st, sub_test, scaler


def _stack(frames):
    """Concatenate ``frames`` once and renumber rows; no frames -> empty DataFrame."""
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames).reset_index(drop=True)


# all: one frame per subset covering every building
for subset in subsets:
    train_parts, test_parts, temp_transform = [], [], {}
    for num in range(1, 61):
        st, sub_test, temp_transform[num] = _prepare_building(num)
        train_parts.append(st)
        test_parts.append(sub_test)

    transform[subset] = temp_transform
    train2[subset] = _stack(train_parts)
    test2[subset] = _stack(test_parts)

# cluster, then region: one frame per subset and per group id
for group_col, n_groups, train_out, test_out, transform_out in (
    ('cluster', 5, train_cluster2, test_cluster2, transform_cluster),
    ('region', 6, train_region2, test_region2, transform_region),
):
    for subset in subsets:
        temp_train, temp_test, temp_transform = {}, {}, {}
        for c in range(n_groups):
            train_parts, test_parts = [], []
            for num in np.unique(train.loc[train[group_col]==c, 'num']):
                st, sub_test, temp_transform[num] = _prepare_building(num)
                train_parts.append(st)
                test_parts.append(sub_test)

            temp_train[c] = _stack(train_parts)
            temp_test[c] = _stack(test_parts)
        train_out[subset] = temp_train
        test_out[subset] = temp_test
        # Scalers are keyed by building id across all groups of this subset.
        transform_out[subset] = temp_transform

## ALL

In [None]:
# Fit the global model once per hour subset. lgb_models returns three values:
# the first is stored on the train frame and the second on the test frame as
# 'oof_lgb' (presumably out-of-fold / test predictions on the scaled target);
# the third is discarded.
for subset in subsets:
    print(subset)
    train_pred, test_pred, _ = lgb_models(
        train2[subset], test2[subset],
        drop_col=drop_cols,
        target='use_electric2',
        category_features=['num', 'month', 'THI', 'weekend2', ],
        learn='lgb', v=0, exp=False,
    )
    train2[subset]['oof_lgb'] = train_pred
    test2[subset]['oof_lgb'] = test_pred

In [None]:
# Map the scaled predictions back to the original target scale with each
# building's own scaler. The per-building blocks are concatenated in ascending
# building order, so the frames must already be laid out that way.
for col in ['oof_lgb']:
    restored_col = col+'2'
    for subset in subsets:
        scalers = transform[subset]
        for frame in (train2[subset], test2[subset]):
            restored = [scalers[num].inverse_transform(frame.loc[frame['num']==num, [col]])
                        for num in range(1, 61)]
            frame[restored_col] = np.concatenate(restored)

# Stitch complementary hour subsets back together into full-day frames.
merged_subsets = {
    'afternoon_morning': ['afternoon', 'morning'],
    'working_noworking': ['working', 'noworking'],
    'q1234': ['q1', 'q2', 'q3', 'q4'],
    's123456': ['s1', 's2', 's3', 's4', 's5', 's6'],
    'e12345678': ['e1', 'e2', 'e3', 'e4', 'e5', 'e6', 'e7', 'e8'],
}
for df in [train2, test2]:
    for merged_name, members in merged_subsets.items():
        pieces = [df[member] for member in members]
        df[merged_name] = pd.concat(pieces).sort_values(['num', 'date']).reset_index(drop=True)

In [None]:
# SMAPE of every full-day prediction set against the training target, followed
# by the SMAPE of their plain average (shown as the cell output).
ensemble_members = ['all', 'afternoon_morning', 'working_noworking', 'q1234', 's123456', 'e12345678']

for subset in ensemble_members:
    print(smape(train2[subset]['oof_lgb2'], train['use_electric']))

member_preds = [train2[member]['oof_lgb2'] for member in ensemble_members]
smape(np.mean(member_preds, 0), train['use_electric'])

In [None]:
# 4.854304032291577
# 4.888289768844324
# 4.840717546803809
# 4.878037671834281
# 4.877795888635189
# 4.905936332216347
# 4.666819341958758

## cluster

In [None]:
# Fit one model per hour subset and per cluster (0-4). The first two values
# returned by lgb_models are stored as 'oof_lgb' on the matching train / test
# frame; the third is discarded.
for subset in subsets:
    print(subset)
    for c in range(5):
        cluster_train = train_cluster2[subset][c]
        cluster_test = test_cluster2[subset][c]
        train_pred, test_pred, _ = lgb_models(
            cluster_train, cluster_test,
            drop_col=drop_cols,
            target='use_electric2',
            category_features=['num', 'month', 'THI', 'weekend2', ],
            learn='lgb', v=0, exp=False,
        )
        train_cluster2[subset][c]['oof_lgb'] = train_pred
        test_cluster2[subset][c]['oof_lgb'] = test_pred

In [None]:
# Collapse the per-cluster frames of every subset into one frame ordered by
# building and date (replaces the dict of frames with a single DataFrame).
order_cols = ['num', 'date']
for subset in subsets:
    for store in (train_cluster2, test_cluster2):
        per_cluster = list(store[subset].values())
        store[subset] = pd.concat(per_cluster).sort_values(order_cols).reset_index(drop=True)

# Map the scaled predictions back to the original target scale with each
# building's own scaler; blocks are concatenated in ascending building order,
# which matches the sort applied above.
for col in ['oof_lgb']:
    restored_col = col+'2'
    for subset in subsets:
        scalers = transform_cluster[subset]
        for frame in (train_cluster2[subset], test_cluster2[subset]):
            restored = [scalers[num].inverse_transform(frame.loc[frame['num']==num, [col]])
                        for num in range(1, 61)]
            frame[restored_col] = np.concatenate(restored)

# Stitch complementary hour subsets back together into full-day frames.
merged_subsets = {
    'afternoon_morning': ['afternoon', 'morning'],
    'working_noworking': ['working', 'noworking'],
    'q1234': ['q1', 'q2', 'q3', 'q4'],
    's123456': ['s1', 's2', 's3', 's4', 's5', 's6'],
    'e12345678': ['e1', 'e2', 'e3', 'e4', 'e5', 'e6', 'e7', 'e8'],
}
for df in [train_cluster2, test_cluster2]:
    for merged_name, members in merged_subsets.items():
        pieces = [df[member] for member in members]
        df[merged_name] = pd.concat(pieces).sort_values(['num', 'date']).reset_index(drop=True)

In [None]:
# SMAPE of every full-day cluster-model prediction set against the training
# target, followed by the SMAPE of their plain average (the cell output).
ensemble_members = ['all', 'afternoon_morning', 'working_noworking', 'q1234', 's123456', 'e12345678']

for subset in ensemble_members:
    print(smape(train_cluster2[subset]['oof_lgb2'], train['use_electric']))

member_preds = [train_cluster2[member]['oof_lgb2'] for member in ensemble_members]
smape(np.mean(member_preds, 0), train['use_electric'])

In [None]:
# 4.834800050014874
# 4.876309745723581
# 4.857260584692095
# 4.914418823959304
# 4.9192538915152175
# 4.953230095541329
# 4.699489300011213

## region

In [None]:
# Fit one model per hour subset and per region (0-5). The first two values
# returned by lgb_models are stored as 'oof_lgb' on the matching train / test
# frame; the third is discarded.
for subset in subsets:
    print(subset)
    for c in range(6):
        region_train = train_region2[subset][c]
        region_test = test_region2[subset][c]
        train_pred, test_pred, _ = lgb_models(
            region_train, region_test,
            drop_col=drop_cols,
            target='use_electric2',
            category_features=['num', 'month', 'THI', 'weekend2', ],
            learn='lgb', v=0, exp=False,
        )
        train_region2[subset][c]['oof_lgb'] = train_pred
        test_region2[subset][c]['oof_lgb'] = test_pred

In [None]:
# Collapse the per-region frames of every subset into one frame ordered by
# building and date (replaces the dict of frames with a single DataFrame).
order_cols = ['num', 'date']
for subset in subsets:
    for store in (train_region2, test_region2):
        per_region = list(store[subset].values())
        store[subset] = pd.concat(per_region).sort_values(order_cols).reset_index(drop=True)

# Map the scaled predictions back to the original target scale with each
# building's own scaler; blocks are concatenated in ascending building order,
# which matches the sort applied above.
for col in ['oof_lgb']:
    restored_col = col+'2'
    for subset in subsets:
        scalers = transform_region[subset]
        for frame in (train_region2[subset], test_region2[subset]):
            restored = [scalers[num].inverse_transform(frame.loc[frame['num']==num, [col]])
                        for num in range(1, 61)]
            frame[restored_col] = np.concatenate(restored)

# Stitch complementary hour subsets back together into full-day frames.
merged_subsets = {
    'afternoon_morning': ['afternoon', 'morning'],
    'working_noworking': ['working', 'noworking'],
    'q1234': ['q1', 'q2', 'q3', 'q4'],
    's123456': ['s1', 's2', 's3', 's4', 's5', 's6'],
    'e12345678': ['e1', 'e2', 'e3', 'e4', 'e5', 'e6', 'e7', 'e8'],
}
for df in [train_region2, test_region2]:
    for merged_name, members in merged_subsets.items():
        pieces = [df[member] for member in members]
        df[merged_name] = pd.concat(pieces).sort_values(['num', 'date']).reset_index(drop=True)

In [None]:
# SMAPE of every full-day region-model prediction set against the training
# target, followed by the SMAPE of their plain average (the cell output).
ensemble_members = ['all', 'afternoon_morning', 'working_noworking', 'q1234', 's123456', 'e12345678']

for subset in ensemble_members:
    print(smape(train_region2[subset]['oof_lgb2'], train['use_electric']))

member_preds = [train_region2[member]['oof_lgb2'] for member in ensemble_members]
smape(np.mean(member_preds, 0), train['use_electric'])

In [None]:
# 4.904486164267104
# 4.9381086017006
# 4.8948993577088356
# 4.9598043953166
# 4.9428339864017445
# 4.951370227115714
# 4.756103473189231
# 4.721349836159994

In [None]:
# Average the six full-day prediction sets of each model family and store the
# result as 'oof_lgb_mean' on that family's 'all' frame. Train families are
# handled first (sized by `train`), then the test families (sized by `test`).
ensemble_members = ['all', 'afternoon_morning', 'working_noworking', 'q1234', 's123456', 'e12345678']
family_groups = (
    ([train2, train_cluster2, train_region2], train),
    ([test2, test_cluster2, test_region2], test),
)
for families, reference in family_groups:
    for df in families:
        temp = np.zeros(len(reference))
        for subset in ensemble_members:
            temp += df[subset]['oof_lgb2']
        # 6 == number of ensemble members summed above
        df['all']['oof_lgb_mean'] = temp/6

In [None]:
sub = pd.read_csv('energy/sample_submission.csv')

# Subset-model answer: within each pred_* pair LightGBM gets 0.7 and XGBoost
# 0.3; the three pairs are then mixed 0.45 / 0.45 / 0.1.
# NOTE(review): this column is overwritten by `np.expm1(subset_best)` in the
# post-processing cell below - confirm which of the two is intended.
blend_678 = np.concatenate(list(pred_lgb678.values()))*.7 + np.concatenate(list(pred_xgb678.values()))*.3
blend_78 = np.concatenate(list(pred_lgb78.values()))*.7 + np.concatenate(list(pred_xgb78.values()))*.3
blend_8 = np.concatenate(list(pred_lgb8.values()))*.7 + np.concatenate(list(pred_xgb8.values()))*.3
sub['answer_subset'] = blend_678*.45 + blend_78*.45 + blend_8*.1

# Time-split answer: weighted mix of the global, cluster and region ensembles.
time_global = test2['all']['oof_lgb_mean']*0.35
time_cluster = test_cluster2['all']['oof_lgb_mean']*0.35
time_region = test_region2['all']['oof_lgb_mean']*0.3
sub['answer_time'] = time_global + time_cluster + time_region

#### post processing

In [None]:
# subset models
# Per-building multipliers; entry i belongs to building i+1.
w1 = [1.001, 1.013, 1.001, 0.962, 0.985, 1.025, 1.0, 1.045, 0.994, 1.02 , 
 0.958, 0.983, 1.009, 0.993, 0.991, 0.989, 1.007, 0.997, 1.028, 1.03 , 
 1.029, 1.041, 1.029, 1.009, 1.011, 1.014, 1.046, 0.998, 0.969, 1.02 , 
 1.006, 1.003, 0.999, 0.967, 1.049, 1.03 , 1.006, 1.031, 1.018, 0.965, 
 0.981, 0.964, 1.007, 1.029, 1.008, 0.99 , 0.998, 1.05 , 1.038, 1.0  , 
 0.999, 0.991, 1.015, 1.021, 1.027, 1.038, 1.05 , 0.993, 0.957, 0.996]
# Only upward corrections are applied: anything not above 1.0 becomes 1.0.
w2 = [1.0 if i <= 1.0 else i for i in w1]

# NOTE(review): `subset_best` is defined outside this cell; expm1 is applied
# to it, so it presumably holds log1p-scale predictions - confirm.
sub['answer_subset'] = np.expm1(subset_best)
sub['num'] = test['num']
for num, weight in enumerate(w2, start=1):
    building_rows = sub['num']==num
    sub.loc[building_rows, 'answer_subset'] = sub.loc[building_rows, 'answer_subset']*weight

In [None]:
## time model weight
# Per-building multipliers; entry i belongs to building i+1.
w1 = [1.0, 1.009, 1.005, 0.981, 0.99, 1.03, 0.989, 1.049, 0.986, 1.01, 0.95,
     0.968, 1.006, 0.984, 0.982, 0.966, 0.996, 0.96, 0.973, 1.004, 1.023,
     1.05, 1.03, 1.0, 0.99, 1.02, 1.034, 0.956, 0.951, 0.997, 1.004, 1.002,
     0.995, 0.95, 1.05, 1.032, 1.001, 1.033, 1.032, 0.956, 0.97, 0.95, 1.0,
     1.033, 0.99, 0.975, 0.992, 1.048, 1.023, 0.994, 0.973, 0.98, 1.014,
     1.025, 1.027, 1.05, 1.05, 0.974, 0.95, 0.977]
# Only downward corrections are applied, softened by 0.02: weights below 0.98
# are raised by 0.02, every other weight becomes 1.0.
w2 = [1.0 if i >= 0.98 else i+0.02 for i in w1]
# Relies on the `num` column added to `sub` in the subset post-processing cell.
for num, weight in enumerate(w2, start=1):
    building_rows = sub['num']==num
    sub.loc[building_rows, 'answer_time'] = sub.loc[building_rows, 'answer_time']*weight

In [None]:
# Final answer: 70% subset-model answer, 30% time-split answer.
subset_part = sub['answer_subset']*.7
time_part = sub['answer_time']*.3
sub['answer'] = subset_part + time_part

# Only the id and the blended answer are written to the submission file.
submission_cols = ['num_date_time', 'answer']
sub[submission_cols].to_csv('submission.csv', index=False)