In [1]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import os
import gc
import tqdm
import datetime
import random
from collections import defaultdict
from sklearn.neural_network import MLPRegressor

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import Lasso, Ridge
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold, TimeSeriesSplit, GroupKFold

# model
import xgboost as xgb
import lightgbm as lgb

# evaluation
from sklearn.metrics import mean_squared_error

# install
!pip install workalendar
from workalendar.asia import SouthKorea

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive
Collecting workalendar
[?25l  Downloading https://files.pythonhosted.org/packages/53/3b/0674dab5f7b9878c4907ad9f833575fc23c58616c126c65cd21b9fd2bedb/workalendar-7.0.0-py3-none-any.whl (159kB)
[K     |████████████████████████████████| 163kB 8.8MB/s 
Collecting skyfield (from workalendar)
[?25l  Downloading https://files.pythonhosted.org/packages/7c/9c/4a9879460dddac5bda8d7e8b8eb6159093d2b285077d085ff78d4f02a2bc/skyfield-1.13.tar.gz (224kB)
[K     |█████████

In [2]:
from google.colab import drive
drive.mount('/content/drive')
path = 'drive/My Drive/11dacon/data/'

def seed_everything(seed=0):
    random.seed(seed)
    np.random.seed(seed)
    
from numba import jit
import math

@jit
def smape_fast(y_true, y_pred, exp=True):
    
    if exp:
        y_true = np.expm1(np.array(y_true))
        y_pred = np.expm1(np.array(y_pred))
    else:
        y_true = np.array(y_true)
        y_pred = np.array(y_pred)
        
    out = 0
    for i in range(y_true.shape[0]):
        a = y_true[i]
        b = y_pred[i]
        c = a+b
        if c == 0:
            continue
        out += math.fabs(a - b) / c
    out *= (200.0 / y_true.shape[0])
    return out

def rmse(y_true, y_pred, exp=True):
    if exp:
        return np.sqrt(mean_squared_error(np.expm1(y_true), np.expm1(y_pred)))
    else:
        return np.sqrt(mean_squared_error(y_true, y_pred))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
train = pd.read_csv(path+'train.csv')
test = pd.read_csv(path+'test.csv')
sub = pd.read_csv(path+'submission.csv')

holidays = pd.concat([pd.Series(np.array(SouthKorea().holidays(2018))[:, 0]), pd.Series(np.array(SouthKorea().holidays(2017))[:, 0]), pd.Series(np.array(SouthKorea().holidays(2016))[:, 0])]).reset_index(drop=True)

weather2 = pd.read_csv(path + 'weather_day.csv', encoding='cp949').dropna().iloc[:, 1:]
weather2.columns = ['일시', '평균기온', '최저기온', '최고기온']
weather2['일시'] = pd.to_datetime(weather2['일시'])

In [0]:
def merge(train):
    train_df = pd.DataFrame()
    for col in train.columns[1:]:
        temp = train[['Time', col]].dropna().rename(columns={col:'target'})
        temp['Time'] = pd.to_datetime(pd.to_datetime(temp['Time']).dt.date)
        temp = temp.groupby('Time').sum().reset_index()
        temp['house'] = int(col.replace('X', ''))
        temp = temp[temp['Time']>'2017-12-31'].reset_index(drop=True)

        train_df = pd.concat([train_df, temp]).reset_index(drop=True)

    train_df.rename(columns={'Time':'일시'}, inplace=True)
    
    train_df = pd.merge(train_df, weather2, how='left', on='일시')
    return train_df

In [0]:
train_df2 = merge(train)
train_df = merge(test)
train_df = pd.concat([train_df, train_df2]).reset_index(drop=True)

In [0]:
day_range = pd.DataFrame(pd.date_range('2018-02-01', '2018-11-30'), columns=['day'])
day_range['holiday'] = day_range['day'].isin(holidays).astype(int)
day_range['weekday'] = pd.to_datetime(day_range['day']).dt.weekday.map({0:0, 1:0, 2:0, 3:0, 4:0, 5:1, 6:1})
day_range['weekday_holiday'] = (day_range['holiday'] + day_range['weekday']).map({0:0, 1:1, 2:1})
day_range = day_range[['day', 'weekday_holiday']]
day_range.columns = ['일시', '주말_공휴일']

train_df = pd.merge(train_df, day_range, how='left', on='일시')

In [0]:
train_df['target_mean'] = train_df['house'].map(train_df.groupby('house')['target'].mean())
train_df['target_min'] = train_df['house'].map(train_df.groupby('house')['target'].min())
train_df['target_max'] = train_df['house'].map(train_df.groupby('house')['target'].max())
train_df['target_std'] = train_df['house'].map(train_df.groupby('house')['target'].std())
train_df['target_quan1'] = train_df['house'].map(train_df.groupby('house')['target'].quantile(.25))
train_df['target_quan2'] = train_df['house'].map(train_df.groupby('house')['target'].quantile(.5))
train_df['target_quan3'] = train_df['house'].map(train_df.groupby('house')['target'].quantile(.75))

In [0]:
train_df['target'] = np.log1p(train_df['target'])

days = ['2018-06-21', '2018-06-22' '2018-06-23', '2018-06-24', '2018-06-25', '2018-06-26', '2018-06-27', '2018-06-28', '2018-06-29', '2018-06-30']

test_df = train_df[train_df['일시'].astype(str).isin(days)].reset_index(drop=True)
train_df = train_df[~train_df['일시'].astype(str).isin(days)].reset_index(drop=True)

In [40]:
params = {
    'objective':'reg:squarederror',
    'n_estimators':5000,
    'max_depth':2**4,
    'learning_rate':0.03,
    'lambda':0.1,
    'alpha':0.1,
    'subsample':0.7,
    'n_jobs':-1,
    'seed':42
}

oof = np.zeros(len(train_df))
pred = np.zeros(len(test_df))

feature = [i for i in train_df.columns if i not in ['target', '일시']]
kf = KFold(n_splits=5, random_state=42, shuffle=False)
gkf = GroupKFold(n_splits=5)
best_iterations = []

for trn_idx, val_idx in kf.split(train_df):
# for trn_idx, val_idx in gkf.split(train_df, groups=train_df['house']):
    tt = xgb.DMatrix(train_df.loc[trn_idx, feature], train_df.loc[trn_idx, ['target']])
    vv = xgb.DMatrix(train_df.loc[val_idx, feature], train_df.loc[val_idx, ['target']])
    
    model = xgb.train(params, tt, num_boost_round=5000, evals=[(tt, 'train'), (vv, 'val')], early_stopping_rounds=200, verbose_eval=0)
    
    oof[val_idx] = model.predict(xgb.DMatrix(train_df.loc[val_idx, feature]))
    pred += model.predict(xgb.DMatrix(test_df[feature]))/5
    best_iterations.append(model.best_iteration)
    print('HH')
smape_fast(test_df['target'], pred, True), rmse(test_df['target'], pred, True), best_iterations, np.mean(best_iterations)

(20.90361993716017, 5.343401838404966, [189, 261, 210, 214, 217], 218.2)

In [42]:
# predictions
train_df = pd.concat([train_df, test_df]).reset_index(drop=True)

temp = pd.DataFrame(pd.date_range('2018-07-01', '2018-07-10'), columns=['일시'])
temp = pd.merge(temp, weather2, how='left', on='일시')

test_df = pd.DataFrame()
for col in sub['meter_id'].apply(lambda x: x.split('X')[1]):
    temp['house'] = int(col)
    test_df = pd.concat([test_df, temp])
test_df = pd.merge(test_df, day_range, how='left', on='일시')
test_df = pd.merge(test_df, train_df[['house', 'target_mean', 'target_min', 'target_max', 'target_std', 'target_quan1', 'target_quan2', 'target_quan3']].drop_duplicates(), how='left', on='house')

test_df = test_df.reset_index(drop=True)
pred = np.zeros(len(test_df))

for seed in [42, 43, 44, 45, 46]:
    params['seed']=seed
    
    tt = xgb.DMatrix(train_df[feature], train_df[['target']])
    model = xgb.train(params, tt, num_boost_round=225, evals=[(tt, 'train')], verbose_eval=300)
    pred += model.predict(xgb.DMatrix(test_df[feature]))/5
pred_xgb = pred.copy()

[0]	train-rmse:1.73329
[224]	train-rmse:0.114554
[0]	train-rmse:1.73328
[224]	train-rmse:0.114881
[0]	train-rmse:1.7333
[224]	train-rmse:0.114618
[0]	train-rmse:1.73331
[224]	train-rmse:0.114777
[0]	train-rmse:1.7333
[224]	train-rmse:0.114608


In [26]:
params = {
    'objective':'regression',
    'boosting_type':'gbdt',
    'metric':'rmse',
    'n_jobs':-1,
    'learning_rate':0.03,
    'num_leaves': 2**8,
    'max_depth':-1,
    'subsample':0.7,
    'reg_alpha':0.1,
    'reg_lambda':0.1,
    'n_estimators':10000,
    'max_bin':255,
    'verbose':-1,
    'seed': 42,
    'early_stopping_rounds':100
}

oof = np.zeros(len(train_df))
pred = np.zeros(len(test_df))

feature = [i for i in train_df.columns if i not in ['target', '일시']]
kf = KFold(n_splits=5, random_state=42, shuffle=False)
gkf = GroupKFold(n_splits=5)
best_iterations = []

# for trn_idx, val_idx in kf.split(train_df):
for trn_idx, val_idx in gkf.split(train_df, groups=train_df['house']):
    tt = lgb.Dataset(train_df.loc[trn_idx, feature], train_df.loc[trn_idx, ['target']])
    vv = lgb.Dataset(train_df.loc[val_idx, feature], train_df.loc[val_idx, ['target']])
    
    model = lgb.train(params, tt, valid_sets=[tt, vv], early_stopping_rounds=200, verbose_eval=0)
    
    pred += model.predict(test_df[feature])/5
    best_iterations.append(model.best_iteration)
    
smape_fast(test_df['target'], pred, True), rmse(test_df['target'], pred, True), best_iterations, np.mean(best_iterations)



(23.854778509943845, 5.374980602007869, [471, 564, 242, 453, 450], 436.0)

In [27]:
# predictions
train_df = pd.concat([train_df, test_df]).reset_index(drop=True)

temp = pd.DataFrame(pd.date_range('2018-07-01', '2018-07-10'), columns=['일시'])
temp = pd.merge(temp, weather2, how='left', on='일시')

test_df = pd.DataFrame()
for col in sub['meter_id'].apply(lambda x: x.split('X')[1]):
    temp['house'] = int(col)
    test_df = pd.concat([test_df, temp])
test_df = pd.merge(test_df, day_range, how='left', on='일시')
test_df = pd.merge(test_df, train_df[['house', 'target_mean', 'target_min', 'target_max', 'target_std', 'target_quan1', 'target_quan2', 'target_quan3']].drop_duplicates(), how='left', on='house')

test_df = test_df.reset_index(drop=True)
pred = np.zeros(len(test_df))

for seed in [42, 43, 44, 45, 46]:
    params['seed']=seed
    
    tt = lgb.Dataset(train_df[feature], train_df[['target']])
    model = lgb.train(params, tt, valid_sets=[tt], verbose_eval=200)
    pred += model.predict(test_df[feature])/5
pred_lgb = pred.copy()



Training until validation scores don't improve for 100 rounds.
[200]	training's rmse: 0.20493
[400]	training's rmse: 0.189121
Did not meet early stopping. Best iteration is:
[450]	training's rmse: 0.186436
Training until validation scores don't improve for 100 rounds.
[200]	training's rmse: 0.204923
[400]	training's rmse: 0.188929
Did not meet early stopping. Best iteration is:
[450]	training's rmse: 0.186421
Training until validation scores don't improve for 100 rounds.
[200]	training's rmse: 0.204813
[400]	training's rmse: 0.189373
Did not meet early stopping. Best iteration is:
[450]	training's rmse: 0.186928
Training until validation scores don't improve for 100 rounds.
[200]	training's rmse: 0.204817
[400]	training's rmse: 0.189468
Did not meet early stopping. Best iteration is:
[450]	training's rmse: 0.186963
Training until validation scores don't improve for 100 rounds.
[200]	training's rmse: 0.204712
[400]	training's rmse: 0.189248
Did not meet early stopping. Best iteration is

In [0]:
pred = pred_xgb*0.5 + pred_lgb*0.5
test_df['pred'] = np.expm1(pred)

In [0]:
test_df = test_df.drop_duplicates().reset_index()
sub_df = test_df.groupby(['house', '일시'])['pred'].sum().unstack().reset_index()

In [48]:
from google.colab import drive
drive.mount('/content/drive')
path = 'drive/My Drive/11dacon/submit/'

sub_df.to_csv(path+'day_predicion_xgb_lgb.csv', index=False)

Mounted at /content/drive
