In [159]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import os
import gc
import tqdm
import datetime
import random
from collections import defaultdict
from sklearn.neural_network import MLPRegressor

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import Lasso, Ridge
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold, TimeSeriesSplit, GroupKFold

from sklearn.ensemble import RandomForestRegressor

# model
import xgboost as xgb
import lightgbm as lgb

# evaluation
from sklearn.metrics import mean_squared_error

# install
# !pip install workalendar
# from workalendar.asia import SouthKorea

Mounted at /content/drive


In [0]:
# Folder on the mounted Google Drive that the data-loading cells read from.
path = 'drive/My Drive/11dacon/data/'


def seed_everything(seed=0):
    """Seed Python's ``random`` module and NumPy's legacy global RNG.

    Parameters
    ----------
    seed : int, default 0
        Value handed to both generators.
    """
    for reseed in (random.seed, np.random.seed):
        reseed(seed)
    
from numba import jit
import math

def smape_fast(y_true, y_pred, exp=False):
    """Symmetric mean absolute percentage error on a 0-200 scale.

    Computes ``200 / n * sum(|a - b| / (|a| + |b|))`` over the paired values.
    A pair whose denominator is zero (both values are zero) contributes zero
    error but still counts towards ``n``.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Actual and predicted values. Both are flattened and must then have
        the same length. Lists, NumPy arrays and pandas Series are accepted.
    exp : bool, default False
        If True, ``np.expm1`` is applied to both inputs before scoring.

    Returns
    -------
    float
        The SMAPE in percent, or ``nan`` when the inputs are empty.

    Raises
    ------
    ValueError
        If the flattened inputs have different lengths.
    """
    actual = np.asarray(y_true, dtype=np.float64).ravel()
    forecast = np.asarray(y_pred, dtype=np.float64).ravel()
    if actual.shape != forecast.shape:
        raise ValueError(
            'y_true and y_pred must have the same length, got %d and %d'
            % (actual.size, forecast.size)
        )
    if actual.size == 0:
        # Nothing to average; avoid dividing by a zero count.
        return float('nan')

    if exp:
        actual = np.expm1(actual)
        forecast = np.expm1(forecast)

    abs_error = np.abs(actual - forecast)
    # Absolute values keep every term inside [0, 1]; a plain `a + b` lets a
    # negative prediction produce a negative or unbounded contribution.
    scale = np.abs(actual) + np.abs(forecast)
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    # Vectorised NumPy replaces the numba-compiled loop, so no JIT decorator
    # is needed and pandas inputs are handled without an object-mode fallback.
    return float(ratio.sum() * 200.0 / actual.size)

def rmse(y_true, y_pred, exp=False):
    """Root mean squared error between ``y_true`` and ``y_pred``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values, forwarded to ``mean_squared_error``.
    exp : bool, default False
        If True, ``np.expm1`` is applied to both inputs before scoring.

    Returns
    -------
    float
        Square root of the mean squared error.
    """
    actual, forecast = (np.expm1(y_true), np.expm1(y_pred)) if exp else (y_true, y_pred)
    return np.sqrt(mean_squared_error(actual, forecast))

In [0]:
# Read the three CSV tables from `path`, in the order train, test, submission.
train, test, sub = (
    pd.read_csv(path + file_name)
    for file_name in ('train.csv', 'test.csv', 'submission.csv')
)

# holidays = pd.concat([pd.Series(np.array(SouthKorea().holidays(2018))[:, 0]), pd.Series(np.array(SouthKorea().holidays(2017))[:, 0]), pd.Series(np.array(SouthKorea().holidays(2016))[:, 0])]).reset_index(drop=True)

# weather1['일시'] = pd.to_datetime(weather1['일시'])

# weather3 = weather3.iloc[:, [1, 2, 3, 4, 5, 7, 8]]
# weather3.columns = ['일시', '기온', '강수량', '풍속', '습도', '날씨', '전운량']


# weather3['날씨'] = LabelEncoder().fit_transform(weather3['날씨'].fillna('no value'))
# weather3 = weather3.fillna(0)
# weather3['일시'] = pd.to_datetime(weather3['일시'])

In [0]:
def merge(train, col, max_missing=400, test_start='20180701', test_end='20181201'):
    """Aggregate one meter column to monthly totals and build its forecast frame.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with a 'Time' column parseable by ``pd.to_datetime`` and a
        numeric column named ``col``.
    col : str
        Meter column name; the characters after the first are parsed as the
        integer house id (``int(col[1:])``).
    max_missing : int, default 400
        A month is kept only if it has fewer than this many missing readings.
    test_start, test_end : str, default '20180701' / '20181201'
        Bounds of the hourly range whose months form the forecast frame.
        The final timestamp (``test_end`` itself) is excluded.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``monthly`` has columns month, target, house, where target is the sum
        of the non-missing readings in that month. ``future`` has the same
        columns, one row per forecast month, with target set to 0.
    """
    house = int(col[1:])

    readings = pd.DataFrame({
        'month': pd.to_datetime(train['Time']).dt.to_period('M'),
        'target': train[col],
        'is_null': train[col].isnull().astype(int),
    })
    # Sum only the numeric columns: summing the whole frame would also try to
    # aggregate 'Time', which newer pandas no longer drops silently.
    monthly = readings.groupby('month')[['target', 'is_null']].sum()
    monthly = monthly[monthly['is_null'] < max_missing].reset_index().drop(columns='is_null')
    monthly['house'] = house

    # Months covered by the hourly forecast window, excluding its last stamp.
    forecast_hours = pd.date_range(test_start, test_end, freq='h')[:-1]
    future = pd.DataFrame({
        'month': forecast_hours.to_period('M').unique(),
        'target': 0,
        'house': house,
    })

    return monthly, future

In [0]:
# Build the monthly training frame and the forecast frame for every meter:
# first the meters listed in the submission file (read from `test`), then
# every column of `train` after the first.
train_parts = []
test_parts = []

for source_frame, meter_cols in ((test, sub['meter_id']), (train, train.columns[1:])):
    for col in meter_cols:
        monthly, future = merge(source_frame, col)
        train_parts.append(monthly)
        test_parts.append(future)

# Concatenate once at the end; growing a DataFrame inside the loop copies all
# accumulated rows on every iteration.
train_df = pd.concat(train_parts, ignore_index=True) if train_parts else pd.DataFrame()
test_df = pd.concat(test_parts, ignore_index=True) if test_parts else pd.DataFrame()

In [0]:
# Month abbreviation -> month number, used to rebuild a parseable date string.
month_number = {'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12, 'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7}
temperature_cols = ['평균기온', '평균최고기온', '평균최저기온', '최고기온', '최저기온']

# Load the monthly weather table, drop incomplete rows and the first column.
weather1 = pd.read_csv(path + 'weather_month.csv', encoding='cp949')
weather1 = weather1.dropna().iloc[:, 1:]

# '일시' is split on '-': the first piece is prefixed with '20' to form the
# year, the second piece is mapped from a month abbreviation to its number.
date_parts = weather1['일시'].str.split('-')
year_text = '20' + date_parts.str[0]
month_text = date_parts.str[1].map(month_number).astype(str)
weather1['일시'] = year_text + '-' + month_text

weather1.columns = ['Time'] + temperature_cols
weather1['Time'] = pd.to_datetime(weather1['Time']).dt.to_period('m')
# Temperatures are rounded to whole numbers.
weather1[temperature_cols] = weather1[temperature_cols].round()

def preprocess(data, weather=None):
    """Join monthly weather onto ``data`` and split 'month' into year and month.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a monthly-period 'month' column.
    weather : pandas.DataFrame, optional
        Weather table keyed by a monthly-period 'Time' column. Defaults to
        the notebook-level ``weather1`` table.

    Returns
    -------
    pandas.DataFrame
        A new frame with the weather columns appended, 'month' replaced by
        the integer month number, an integer 'year' column added, and every
        row containing a missing value removed. The index is reset.
    """
    if weather is None:
        weather = weather1

    merged = pd.merge(data, weather, how='left', left_on='month', right_on='Time').drop(columns='Time')

    # Read year and month from the period values directly instead of
    # formatting each one as text and splitting it.
    periods = pd.PeriodIndex(merged['month'], freq='M')
    merged['year'] = np.asarray(periods.year)
    merged['month'] = np.asarray(periods.month)

    # Months without a weather match end up with NaN and are dropped here.
    return merged.dropna().reset_index(drop=True)
# Add weather features to both monthly frames. Each name is rebound to the
# output of preprocess(), whose 'month' column is an integer month number, so
# this cell expects the frames exactly as the frame-building cell left them.
train_df = preprocess(train_df)
test_df = preprocess(test_df)

In [0]:
# LightGBM baseline: shuffled K-fold CV that fills out-of-fold predictions
# (`oof`) for the training rows and averages the fold models' predictions on
# the forecast frame (`pred`).
params = {
    'objective':'regression',
    'boosting_type':'gbdt',
    'metric':'rmse',
    'n_jobs':-1,
    'learning_rate':0.1,
#     'num_leaves': 2**8,
    'max_depth':-1,
#     'subsample':0.7,
#     'reg_alpha':0.1,
#     'reg_lambda':0.1,
    'verbose':-1,
    'seed': 42
}

oof = np.zeros(len(train_df))
pred = np.zeros(len(test_df))

# Every column except the label (and 'Time', if present) is a feature.
feature = [i for i in train_df.columns if i not in ['target', 'Time']]
kf = KFold(n_splits=5, random_state=42, shuffle=True)

for trn_idx, val_idx in kf.split(train_df):
    # Labels are passed as a 1-D Series rather than a one-column DataFrame.
    tt = lgb.Dataset(train_df.loc[trn_idx, feature], train_df.loc[trn_idx, 'target'])
    vv = lgb.Dataset(train_df.loc[val_idx, feature], train_df.loc[val_idx, 'target'])

    # NOTE(review): the `early_stopping_rounds` / `verbose_eval` keyword
    # arguments are not accepted by lgb.train in LightGBM 4.x; switch to the
    # callback API when upgrading.
    model = lgb.train(params, tt, valid_sets=[tt, vv], early_stopping_rounds=50, verbose_eval=0)
#     model = RandomForestRegressor(n_estimators=100, random_state=42).fit(train_df.loc[trn_idx, feature], train_df.loc[trn_idx, 'target'])

    oof[val_idx] = model.predict(train_df.loc[val_idx, feature])
    # Divide by the splitter's own fold count so the average stays correct
    # if n_splits is changed.
    pred += model.predict(test_df[feature]) / kf.get_n_splits()

In [235]:
# SMAPE and RMSE of the out-of-fold predictions against the training targets.
smape_fast(train_df['target'], oof), rmse(train_df['target'], oof)

(30.821024568976963, 392.67584500728407)

In [214]:
# Same metric pair as the cell above. NOTE(review): the lower execution count
# (In [214]) suggests this saved output comes from an earlier run - confirm
# which configuration produced it, or remove the duplicate cell.
smape_fast(train_df['target'], oof), rmse(train_df['target'], oof)

(22.925950768639765, 247.23364782198723)

In [0]:
# Store the fold-averaged predictions on the forecast frame for inspection.
test_df['pred'] = pred

In [237]:
# Show the forecast rows (with predictions) for house 5.
test_df[test_df['house']==5]

Unnamed: 0,month,target,house,평균기온,평균최고기온,평균최저기온,최고기온,최저기온,year,pred
0,7,0,5,27.0,30.0,24.0,36.0,19.0,2018,1543.397911
1,8,0,5,28.0,32.0,25.0,36.0,20.0,2018,1580.225238
2,9,0,5,22.0,25.0,18.0,30.0,12.0,2018,1181.037264
3,10,0,5,14.0,18.0,10.0,25.0,3.0,2018,1168.205003
4,11,0,5,8.0,12.0,5.0,17.0,-1.0,2018,1336.825047


In [238]:
# Show the training rows for house 5, to compare against its predictions.
train_df[train_df['house']==5]

Unnamed: 0,month,target,house,평균기온,평균최고기온,평균최저기온,최고기온,최저기온,year
0,5,4117.907,5,17.0,21.0,13.0,28.0,9.0,2018
1,6,4327.646,5,21.0,25.0,18.0,29.0,16.0,2018


In [227]:
# Forecast rows for house 5. NOTE(review): execution count In [227] precedes
# the In [237] copy of this cell, so this output is presumably from an
# earlier model run - confirm before relying on it.
test_df[test_df['house']==5]

Unnamed: 0,month,target,house,평균기온,평균최고기온,평균최저기온,최고기온,최저기온,year,pred
0,7,0,5,27.0,30.0,24.0,36.0,19.0,2018,3833.1773
1,8,0,5,28.0,32.0,25.0,36.0,20.0,2018,3849.544716
2,9,0,5,22.0,25.0,18.0,30.0,12.0,2018,3769.651208
3,10,0,5,14.0,18.0,10.0,25.0,3.0,2018,3624.65775
4,11,0,5,8.0,12.0,5.0,17.0,-1.0,2018,3605.634622


In [228]:
# Training rows for house 5 (repeat of the In [238] cell; saved as In [228]).
train_df[train_df['house']==5]

Unnamed: 0,month,target,house,평균기온,평균최고기온,평균최저기온,최고기온,최저기온,year
0,5,4117.907,5,17.0,21.0,13.0,28.0,9.0,2018
1,6,4327.646,5,21.0,25.0,18.0,29.0,16.0,2018


In [0]:
# XGBoost K-fold CV: fills out-of-fold predictions, averages the fold models'
# predictions on test_df and records each fold's best iteration.
params = {
    'objective':'reg:squarederror',
    # NOTE(review): xgb.train takes its round count from num_boost_round;
    # confirm whether 'n_estimators' in params has any effect here.
    'n_estimators':10000,
    'max_depth':2**3,
    'learning_rate':0.03,
    'n_jobs':-1,
    'seed':42
}

oof = np.zeros(len(train_df))
pred = np.zeros(len(test_df))

feature = [i for i in train_df.columns if i not in ['target', '일시']]
# Unshuffled folds are deterministic, so no random_state is given;
# scikit-learn raises ValueError when random_state is combined with
# shuffle=False.
kf = KFold(n_splits=5, shuffle=False)
gkf = GroupKFold(n_splits=5)
best_iterations = []

for trn_idx, val_idx in kf.split(train_df):
# for trn_idx, val_idx in gkf.split(train_df, groups=train_df['house']):
    tt = xgb.DMatrix(train_df.loc[trn_idx, feature], train_df.loc[trn_idx, ['target']])
    vv = xgb.DMatrix(train_df.loc[val_idx, feature], train_df.loc[val_idx, ['target']])

    # Early stopping watches the last entry of `evals`, i.e. the 'val' fold.
    model = xgb.train(params, tt, num_boost_round=5000, evals=[(tt, 'train'), (vv, 'val')], early_stopping_rounds=200, verbose_eval=0)

    oof[val_idx] = model.predict(xgb.DMatrix(train_df.loc[val_idx, feature]))
    # Average over the splitter's own fold count rather than a literal 5.
    pred += model.predict(xgb.DMatrix(test_df[feature])) / kf.get_n_splits()
    best_iterations.append(model.best_iteration)
# NOTE(review): this scores `pred` against test_df['target']; `oof` is filled
# above but not scored in this cell.
smape_fast(test_df['target'], pred, False), rmse(test_df['target'], pred, False), best_iterations, np.mean(best_iterations)

(3.478222237122217, 0.3353699153334443, [369, 531, 203, 432, 358], 378.6)

In [0]:
# predictions
# Refit XGBoost on train_df and test_df combined, then predict the months
# July-November 2018 for every meter in the submission file. `feature` is
# reused from the CV cell.
# NOTE(review): `day_range` and the columns '평일', '주말_공휴일' and
# 'target_*' are not created in any cell visible in this notebook, and the
# visible weather cell names its key column 'Time', not '일시' - this cell
# presumably depends on cells that were removed; confirm before re-running.
train_df = pd.concat([train_df, test_df]).reset_index(drop=True)
params = {
    'objective':'reg:squarederror',
    'n_estimators':10000,
    'max_depth':2**3,
    'learning_rate':0.03,
    'n_jobs':-1,
    'seed':42
}

# One row per month in the forecast window, joined with the weather table.
temp = pd.DataFrame(pd.Series(pd.date_range('2018-07-01', '2018-11-01')).dt.to_period('m').unique(), columns=['일시'])
temp = pd.merge(temp, weather1, how='left', on='일시')

# Repeat the monthly template for each meter; the house id is the part of
# meter_id after 'X'.
test_df = pd.DataFrame()
for col in sub['meter_id'].apply(lambda x: x.split('X')[1]):
    temp['house'] = int(col)
    test_df = pd.concat([test_df, temp])
test_df = pd.merge(test_df, day_range, how='left', on='일시')
test_df['num_day'] = test_df['평일'] + test_df['주말_공휴일']
# Copy the per-house target statistics from the training frame.
test_df = pd.merge(test_df, train_df[['house', 'target_mean', 'target_min', 'target_max', 'target_std', 'target_quan1', 'target_quan2', 'target_quan3']].drop_duplicates(), how='left', on='house')

test_df = test_df.reset_index(drop=True)
pred = np.zeros(len(test_df))

# Average five fits that differ only in 'seed'.
# NOTE(review): the saved log shows identical train-rmse for all five seeds,
# so the seed appears to have no effect with these parameters.
for seed in [42, 43, 44, 45, 46]:
    params['seed']=seed
    
    tt = xgb.DMatrix(train_df[feature], train_df[['target']])
    # Fixed 400 rounds, no validation set (presumably chosen from the CV
    # best-iteration mean - TODO confirm).
    model = xgb.train(params, tt, num_boost_round=400, evals=[(tt, 'train')], verbose_eval=100)
    pred += model.predict(xgb.DMatrix(test_df[feature]))/5
pred_xgb = pred.copy()

[0]	train-rmse:4.77966
[100]	train-rmse:0.267472
[200]	train-rmse:0.080056
[300]	train-rmse:0.061967
[399]	train-rmse:0.052611
[0]	train-rmse:4.77966
[100]	train-rmse:0.267472
[200]	train-rmse:0.080056
[300]	train-rmse:0.061967
[399]	train-rmse:0.052611
[0]	train-rmse:4.77966
[100]	train-rmse:0.267472
[200]	train-rmse:0.080056
[300]	train-rmse:0.061967
[399]	train-rmse:0.052611
[0]	train-rmse:4.77966
[100]	train-rmse:0.267472
[200]	train-rmse:0.080056
[300]	train-rmse:0.061967
[399]	train-rmse:0.052611
[0]	train-rmse:4.77966
[100]	train-rmse:0.267472
[200]	train-rmse:0.080056
[300]	train-rmse:0.061967
[399]	train-rmse:0.052611


In [0]:
# LightGBM K-fold CV: averages the fold models' predictions on test_df and
# records each fold's best iteration.
params = {
    'objective':'regression',
    'boosting_type':'gbdt',
    'metric':'rmse',
    'n_jobs':-1,
    'learning_rate':0.03,
    'num_leaves': 2**8,
    'max_depth':-1,
    'subsample':0.7,
    'reg_alpha':0.1,
    'reg_lambda':0.1,
    'n_estimators':10000,
    'max_bin':255,
    'verbose':-1,
    'seed': 42,
    # NOTE(review): early stopping is also requested with a different value
    # (200) in the lgb.train call below - confirm which one takes effect.
    'early_stopping_rounds':100
}

oof = np.zeros(len(train_df))
pred = np.zeros(len(test_df))

feature = [i for i in train_df.columns if i not in ['target', '일시']]
# Unshuffled folds are deterministic, so no random_state is given;
# scikit-learn raises ValueError when random_state is combined with
# shuffle=False.
kf = KFold(n_splits=5, shuffle=False)
gkf = GroupKFold(n_splits=5)
best_iterations = []

for trn_idx, val_idx in kf.split(train_df):
# for trn_idx, val_idx in gkf.split(train_df, groups=train_df['house']):
    tt = lgb.Dataset(train_df.loc[trn_idx, feature], train_df.loc[trn_idx, ['target']])
    vv = lgb.Dataset(train_df.loc[val_idx, feature], train_df.loc[val_idx, ['target']])

    model = lgb.train(params, tt, valid_sets=[tt, vv], early_stopping_rounds=200, verbose_eval=0)

    # Average over the splitter's own fold count rather than a literal 5.
    pred += model.predict(test_df[feature]) / kf.get_n_splits()
    best_iterations.append(model.best_iteration)

# NOTE(review): `oof` is allocated above but never filled or scored in this
# cell; the metrics below compare `pred` with test_df['target'].
smape_fast(test_df['target'], pred, False), rmse(test_df['target'], pred, False), best_iterations, np.mean(best_iterations)



(3.479454872987066, 0.3540846970206761, [280, 928, 190, 493, 164], 411.0)

In [0]:
# predictions
# Refit LightGBM on train_df and test_df combined, then predict the months
# July-November 2018 for every meter in the submission file. `feature` is
# reused from the CV cell.
# NOTE(review): the XGBoost prediction cell also appends test_df to train_df,
# so running both cells in sequence grows train_df twice - confirm the
# intended run order. `day_range` and the columns '평일', '주말_공휴일' and
# 'target_*' are not created in any cell visible in this notebook.
train_df = pd.concat([train_df, test_df]).reset_index(drop=True)
params = {
    'objective':'regression',
    'boosting_type':'gbdt',
    'metric':'rmse',
    'n_jobs':-1,
    'learning_rate':0.03,
    'num_leaves': 2**8,
    'max_depth':-1,
    'subsample':0.7,
    'reg_alpha':0.1,
    'reg_lambda':0.1,
    'n_estimators':450,
    'max_bin':255,
    'verbose':-1,
    'seed': 42,
    'early_stopping_rounds':100
}

# One row per month in the forecast window, joined with the weather table.
temp = pd.DataFrame(pd.Series(pd.date_range('2018-07-01', '2018-11-01')).dt.to_period('m').unique(), columns=['일시'])
temp = pd.merge(temp, weather1, how='left', on='일시')

# Repeat the monthly template for each meter; the house id is the part of
# meter_id after 'X'.
test_df = pd.DataFrame()
for col in sub['meter_id'].apply(lambda x: x.split('X')[1]):
    temp['house'] = int(col)
    test_df = pd.concat([test_df, temp])
test_df = pd.merge(test_df, day_range, how='left', on='일시')
test_df['num_day'] = test_df['평일'] + test_df['주말_공휴일']
# Copy the per-house target statistics from the training frame.
test_df = pd.merge(test_df, train_df[['house', 'target_mean', 'target_min', 'target_max', 'target_std', 'target_quan1', 'target_quan2', 'target_quan3']].drop_duplicates(), how='left', on='house')

test_df = test_df.reset_index(drop=True)

pred = np.zeros(len(test_df))

# Average five fits that differ only in 'seed'.
# NOTE(review): the saved log is identical for all five seeds, so the seed
# appears to have no effect with these parameters.
for seed in [42, 43, 44, 45, 46]:
    params['seed']=seed
    
    tt = lgb.Dataset(train_df[feature], train_df[['target']])
    # The only validation set is the training data itself; the saved log
    # reports "Did not meet early stopping" and a best iteration of 450.
    model = lgb.train(params, tt, valid_sets=[tt], verbose_eval=300)
    pred += model.predict(test_df[feature])/5
pred_lgb = pred.copy()



Training until validation scores don't improve for 100 rounds.
[300]	training's rmse: 0.149892
Did not meet early stopping. Best iteration is:
[450]	training's rmse: 0.134479
Training until validation scores don't improve for 100 rounds.
[300]	training's rmse: 0.149892
Did not meet early stopping. Best iteration is:
[450]	training's rmse: 0.134479
Training until validation scores don't improve for 100 rounds.
[300]	training's rmse: 0.149892
Did not meet early stopping. Best iteration is:
[450]	training's rmse: 0.134479
Training until validation scores don't improve for 100 rounds.
[300]	training's rmse: 0.149892
Did not meet early stopping. Best iteration is:
[450]	training's rmse: 0.134479
Training until validation scores don't improve for 100 rounds.
[300]	training's rmse: 0.149892
Did not meet early stopping. Best iteration is:
[450]	training's rmse: 0.134479


In [0]:
# Blend the LightGBM and XGBoost predictions with equal weights.
pred = pred_lgb*0.5 + pred_xgb*0.5
# np.expm1 maps the blend back from what is presumably a log1p target scale;
# the forward transform is not in the visible cells - TODO confirm.
test_df['pred'] = np.expm1(pred)

In [0]:
# Remove exact duplicate rows, then pivot to one row per house with one
# column per month ('일시') holding the summed prediction.
test_df = test_df.drop_duplicates().reset_index(drop=True)
sub_df = test_df.groupby(['house', '일시'])['pred'].sum().unstack().reset_index()

In [0]:
# Mount Google Drive again (repeats the import and mount from the first cell)
# and rebind `path` to the submission folder. After this cell `path` no
# longer points at the data folder used by the loading cells.
from google.colab import drive
drive.mount('/content/drive')
path = 'drive/My Drive/11dacon/submit/'

# Write the pivoted predictions as CSV without the index column.
sub_df.to_csv(path+'month_predicion_xgb_lgb.csv', index=False)

Mounted at /content/drive
