In [1]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import os
import gc
import tqdm
import datetime
import random
from collections import defaultdict
from sklearn.neural_network import MLPRegressor

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import Lasso, Ridge
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold, TimeSeriesSplit, GroupKFold

# model
import xgboost as xgb
import lightgbm as lgb

# evaluation
from sklearn.metrics import mean_squared_error

# install
!pip install workalendar
from workalendar.asia import SouthKorea

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive
Collecting workalendar
[?25l  Downloading https://files.pythonhosted.org/packages/53/3b/0674dab5f7b9878c4907ad9f833575fc23c58616c126c65cd21b9fd2bedb/workalendar-7.0.0-py3-none-any.whl (159kB)
[K     |████████████████████████████████| 163kB 2.7MB/s 
Collecting skyfield-data (from workalendar)
[?25l  Downloading https://files.pythonhosted.org/packages/ea/46/666a4b44709badf6e11b8b77a7aeefebababc1648f46a893f9f8642e99b3/skyfield_data-0.1.0-py2.py3-none-any.whl (

In [2]:
# Directory holding the competition CSVs, relative to Colab's working directory.
# Google Drive is already mounted by the first cell (the saved output of this
# cell reported "Drive already mounted"), so it is not mounted a second time.
path = 'drive/My Drive/11dacon/data/'

def seed_everything(seed=0):
    """Seed Python's `random` module and NumPy's global RNG with the same value.

    Parameters
    ----------
    seed : int, default 0
        Value handed to both `random.seed` and `np.random.seed`.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
from numba import jit
import math

def smape_fast(y_true, y_pred, exp=True):
    """Symmetric mean absolute percentage error, in percent (0-200).

    Computed as ``200 / n * sum(|a - b| / (|a| + |b|))``. Pairs where both
    values are zero contribute nothing to the sum but still count towards ``n``.

    The previous version was decorated with numba's ``@jit`` although it is
    called with pandas Series and a Python bool, which numba cannot compile in
    nopython mode. It also divided by ``a + b`` instead of ``|a| + |b|``, so a
    negative forecast could give a negative term or be skipped entirely.
    For non-negative inputs the result is unchanged.

    Parameters
    ----------
    y_true, y_pred : array-like of equal length (list, ndarray, Series)
        Actual and predicted values.
    exp : bool, default True
        If True, both inputs are taken to be on the ``log1p`` scale and are
        mapped back with ``np.expm1`` before scoring.

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    actual = np.asarray(y_true, dtype=np.float64)
    forecast = np.asarray(y_pred, dtype=np.float64)
    if actual.shape != forecast.shape:
        raise ValueError('y_true and y_pred must have the same shape')
    if actual.shape[0] == 0:
        raise ValueError('smape_fast needs at least one observation')

    if exp:
        actual = np.expm1(actual)
        forecast = np.expm1(forecast)

    denom = np.abs(actual) + np.abs(forecast)
    # Skip 0/0 pairs (perfect prediction of zero) instead of producing NaN.
    nonzero = denom != 0
    total = np.sum(np.abs(actual - forecast)[nonzero] / denom[nonzero])
    return float(total * (200.0 / actual.shape[0]))

def rmse(y_true, y_pred, exp=True):
    """Root mean squared error between `y_true` and `y_pred`.

    When `exp` is true, both inputs are first passed through `np.expm1`, so the
    error is measured after undoing a `log1p` transform; otherwise the inputs
    are scored as given.
    """
    if exp:
        y_true, y_pred = np.expm1(y_true), np.expm1(y_pred)
    return np.sqrt(mean_squared_error(y_true, y_pred))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Competition files read from the data directory.
train = pd.read_csv(path+'train.csv')
test = pd.read_csv(path+'test.csv')
sub = pd.read_csv(path+'submission.csv')

# South Korean public-holiday dates for 2018, 2017 and 2016 as one Series.
# Column 0 of each `holidays(year)` result is kept; the remaining column is dropped.
kr_calendar = SouthKorea()
holidays = pd.concat(
    [pd.Series(np.array(kr_calendar.holidays(year))[:, 0]) for year in (2018, 2017, 2016)]
).reset_index(drop=True)

# Hourly weather: keep seven columns by position and give them fixed names
# (datetime, temperature, precipitation, wind speed, humidity, weather, total cloud cover).
weather_columns = ['일시', '기온', '강수량', '풍속', '습도', '날씨', '전운량']
weather3 = pd.read_csv(path+'weather_hour.csv', encoding='cp949').iloc[:, [1, 2, 3, 4, 5, 7, 8]]
weather3.columns = weather_columns

# Missing weather descriptions become their own category before label encoding;
# every other missing value is filled with 0.
weather3['날씨'] = LabelEncoder().fit_transform(weather3['날씨'].fillna('no value'))
weather3 = weather3.fillna(0)
weather3['일시'] = pd.to_datetime(weather3['일시'])

In [0]:
def _iso_week(times):
    """Return the ISO week number of each timestamp as an integer Series."""
    try:
        # pandas >= 1.1. `Series.dt.week` is deprecated there and gone in pandas 2.0.
        return times.dt.isocalendar().week.astype(int)
    except AttributeError:
        # Older pandas has no `isocalendar` accessor but still provides `dt.week`.
        return times.dt.week


def _add_prior_period_stats(frame, period, suffix=''):
    """Add `prior_target_<stat><suffix>` columns holding the previous period's target statistics."""
    grouped = frame.groupby(period)['target']
    stats = {
        'mean': grouped.mean(),
        'min': grouped.min(),
        'max': grouped.max(),
        'std': grouped.std(),
        'quan1': grouped.quantile(.25),
        'quan2': grouped.quantile(.5),
        'quan3': grouped.quantile(.75),
    }
    for stat_name, values in stats.items():
        # Keying each statistic by `period + 1` makes a row in period p pick up
        # the statistic of period p - 1. Periods without a predecessor map to NaN.
        lookup = dict(zip(values.index + 1, values.values))
        frame['prior_target_' + stat_name + suffix] = frame[period].map(lookup)
    return frame


def merge(train):
    """Turn a wide meter table into a long, feature-enriched frame.

    Parameters
    ----------
    train : pandas.DataFrame
        First column must be `Time`; every other column is one meter, named
        `X<number>`, holding that meter's readings.

    Returns
    -------
    pandas.DataFrame
        One row per (meter, timestamp) from 2018-01-01 onwards with the
        reading in `target`, calendar flags, per-meter summary statistics of
        the target, previous-week and previous-month target statistics, and
        the `weather3` columns joined on the timestamp (renamed to `일시`).

    Notes
    -----
    * Reads the module-level `holidays` and `weather3` objects.
    * A `month` column is now created. The previous code grouped by and mapped
      `temp['month']` without ever defining it, which raised `KeyError`.
    * The prior-period lookups are keyed on week / month number only, so they
      assume the kept rows fall inside a single calendar year — TODO confirm
      for inputs reaching into 2019.
    * Rows with any missing value are dropped at the end. That removes rows
      whose week or month has no preceding period in the data.

    Raises
    ------
    ValueError
        If `train` has no meter columns after `Time`.
    """
    house_frames = []
    for col in train.columns[1:]:
        temp = train[['Time', col]].dropna().rename(columns={col: 'target'})
        temp['house'] = int(col.replace('X', ''))
        # String comparison on the raw `Time` text; relies on ISO-like formatting.
        temp = temp[temp['Time'] >= '2018-01-01'].reset_index(drop=True)

        # Calendar features.
        temp['Time'] = pd.to_datetime(temp['Time'])
        temp['date'] = temp['Time'].dt.date
        temp['holiday'] = temp['date'].isin(holidays).astype(int)
        temp['week'] = _iso_week(temp['Time'])
        temp['month'] = temp['Time'].dt.month
        temp['weekday'] = temp['Time'].dt.weekday
        temp['hour'] = temp['Time'].dt.hour
        temp['working_hour'] = temp['hour'].between(9, 20).astype(int)
        temp['is_weekend'] = (temp['weekday'] >= 5).astype(int)
        temp['is_weekend_holiday'] = ((temp['is_weekend'] + temp['holiday']) > 0).astype(int)

        # Per-meter summary statistics, constant across all rows of the meter.
        target = temp['target']
        # NOTE(review): .675 and .925 break the 1/8 spacing of the other
        # quantiles (.625 / .875 may have been intended) — values kept as found.
        meter_stats = (
            ('target_mean', target.mean()),
            ('target_min', target.min()),
            ('target_max', target.max()),
            ('target_std', target.std()),
            ('target_quan1', target.quantile(.25)),
            ('target_quan2', target.quantile(.5)),
            ('target_quan3', target.quantile(.75)),
            ('target_quan4', target.quantile(.125)),
            ('target_quan5', target.quantile(.375)),
            ('target_quan6', target.quantile(.675)),
            ('target_quan7', target.quantile(.925)),
        )
        for stat_col, stat_value in meter_stats:
            temp[stat_col] = stat_value

        # Lagged statistics: previous ISO week, then previous calendar month.
        temp = _add_prior_period_stats(temp, 'week')
        temp = _add_prior_period_stats(temp, 'month', suffix='_month')

        house_frames.append(temp.dropna())

    if not house_frames:
        raise ValueError('merge() needs at least one meter column after `Time`')

    # Concatenate once instead of growing a DataFrame inside the loop.
    train_df = pd.concat(house_frames).reset_index(drop=True)
    train_df = train_df.rename(columns={'Time': '일시'})
    return pd.merge(train_df, weather3, how='left', on='일시')

In [0]:
# Long-format feature tables for the meters of both raw files, stacked with the
# `test` meters first.
train_df2 = merge(train)
train_df = pd.concat([merge(test), train_df2], ignore_index=True)

# The models are fitted on log1p(target).
train_df['target'] = np.log1p(train_df['target'])

# Rows from 2018-06-30 00:00 onwards form the hold-out set; everything earlier is training data.
holdout_rows = train_df['일시'] >= '2018-06-30 00'
training_rows = train_df['일시'] < '2018-06-30 00'
test_df = train_df.loc[holdout_rows].reset_index(drop=True)
train_df = train_df.loc[training_rows].reset_index(drop=True)

In [0]:
# XGBoost K-fold cross-validation on `train_df`. The hold-out rows in `test_df`
# are scored with the average of the fold models' predictions.
N_SPLITS = 5

# `n_estimators` was dropped from the dict: with the native `xgb.train` API the
# number of rounds comes from `num_boost_round`, not from the parameter dict.
params = {
    'objective':'reg:squarederror',
    'max_depth':2**3,
    'learning_rate':0.1,
    'tree_method':'gpu_hist',
    'seed':42
}

oof = np.zeros(len(train_df))
pred = np.zeros(len(test_df))

feature = [i for i in train_df.columns if i not in ['target', '일시', 'date']]
# `random_state` is only meaningful together with `shuffle=True`; passing it
# with `shuffle=False` raises ValueError on scikit-learn >= 0.24. The folds are
# the same contiguous blocks as before.
kf = KFold(n_splits=N_SPLITS, shuffle=False)
gkf = GroupKFold(n_splits=N_SPLITS)
best_iterations = []

for trn_idx, val_idx in kf.split(train_df):
# for trn_idx, val_idx in gkf.split(train_df, groups=train_df['house']):
    tt = xgb.DMatrix(train_df.loc[trn_idx, feature], train_df.loc[trn_idx, ['target']])
    vv = xgb.DMatrix(train_df.loc[val_idx, feature], train_df.loc[val_idx, ['target']])

    # Early stopping watches the last entry of `evals`, i.e. the validation fold.
    model = xgb.train(params, tt, num_boost_round=5000, evals=[(tt, 'train'), (vv, 'val')], early_stopping_rounds=200, verbose_eval=100)

    oof[val_idx] = model.predict(xgb.DMatrix(train_df.loc[val_idx, feature]))
    pred += model.predict(xgb.DMatrix(test_df[feature]))/N_SPLITS
    best_iterations.append(model.best_iteration)
# Hold-out SMAPE and RMSE on the log1p scale, then the per-fold best iterations and their mean.
smape_fast(test_df['target'], pred, False), rmse(test_df['target'], pred, False), best_iterations, np.mean(best_iterations)

[0]	train-rmse:0.272047	val-rmse:0.356897
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 200 rounds.
[100]	train-rmse:0.124895	val-rmse:0.21508
[200]	train-rmse:0.120044	val-rmse:0.215112
[300]	train-rmse:0.116839	val-rmse:0.215539
Stopping. Best iteration:
[135]	train-rmse:0.122627	val-rmse:0.214759

[0]	train-rmse:0.293754	val-rmse:0.278319
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 200 rounds.
[100]	train-rmse:0.131425	val-rmse:0.141662
[200]	train-rmse:0.125567	val-rmse:0.141293
[300]	train-rmse:0.121998	val-rmse:0.141372
Stopping. Best iteration:
[190]	train-rmse:0.126016	val-rmse:0.141195

[0]	train-rmse:0.298346	val-rmse:0.257881
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 200 rounds.
[100]	train-rmse:0.131992	val-rmse:0.137966

In [0]:
# predictions
train_df = pd.concat([train_df, test_df]).reset_index(drop=True)
params = {
    'objective':'reg:squarederror',
    'n_estimators':10000,
    'max_depth':2**3,
    'learning_rate':0.03,
    'n_jobs':-1,
    'seed':42
}

temp = pd.DataFrame(pd.Series(pd.date_range('2018-07-01', '2018-11-01')).dt.to_period('m').unique(), columns=['일시'])
temp = pd.merge(temp, weather1, how='left', on='일시')

test_df = pd.DataFrame()
for col in sub['meter_id'].apply(lambda x: x.split('X')[1]):
    temp['house'] = int(col)
    test_df = pd.concat([test_df, temp])
test_df = pd.merge(test_df, day_range, how='left', on='일시')
test_df['num_day'] = test_df['평일'] + test_df['주말_공휴일']
test_df = pd.merge(test_df, train_df[['house', 'target_mean', 'target_min', 'target_max', 'target_std', 'target_quan1', 'target_quan2', 'target_quan3']].drop_duplicates(), how='left', on='house')

test_df = test_df.reset_index(drop=True)
pred = np.zeros(len(test_df))

for seed in [42, 43, 44, 45, 46]:
    params['seed']=seed
    
    tt = xgb.DMatrix(train_df[feature], train_df[['target']])
    model = xgb.train(params, tt, num_boost_round=250, evals=[(tt, 'train')], verbose_eval=100)
    pred += model.predict(xgb.DMatrix(test_df[feature]))/5
pred_xgb = pred.copy()

[0]	train-rmse:4.88609
[100]	train-rmse:0.291863
[200]	train-rmse:0.073943
[249]	train-rmse:0.062123
[0]	train-rmse:4.88609
[100]	train-rmse:0.291863
[200]	train-rmse:0.073943
[249]	train-rmse:0.062123
[0]	train-rmse:4.88609
[100]	train-rmse:0.291863
[200]	train-rmse:0.073943
[249]	train-rmse:0.062123
[0]	train-rmse:4.88609
[100]	train-rmse:0.291863
[200]	train-rmse:0.073943
[249]	train-rmse:0.062123
[0]	train-rmse:4.88609
[100]	train-rmse:0.291863
[200]	train-rmse:0.073943
[249]	train-rmse:0.062123


In [40]:
# LightGBM K-fold cross-validation on `train_df`. The hold-out rows in `test_df`
# are scored with the average of the fold models' predictions.
N_SPLITS = 5

params = {
    'objective':'regression',
    'boosting_type':'gbdt',
    'metric':'rmse',
    'n_jobs':-1,
    'learning_rate':0.1,
    'num_leaves': 2**8,
    'max_depth':-1,
    'tree_learner':'serial',
    'colsample_bytree': 0.7,
    'subsample_freq':1,
    'subsample':0.7,
    'reg_alpha':0.1,
    'reg_lambda':0.1,
    'n_estimators':10000,
    'max_bin':255,
    'verbose':-1,
    'seed': 42,
    # Single source of truth for early stopping. The training call used to pass
    # early_stopping_rounds=200 as well, but LightGBM lets the value found in
    # `params` take precedence, so 100 was already the effective patience.
    'early_stopping_rounds':100
}

oof = np.zeros(len(train_df))
pred = np.zeros(len(test_df))

# `week` is excluded on top of the non-feature columns (noted as overfitting in the experiment log).
feature = [i for i in train_df.columns if i not in ['target', '일시', 'date', 'week']]
# `random_state` is only meaningful together with `shuffle=True`; passing it
# with `shuffle=False` raises ValueError on scikit-learn >= 0.24. The folds are
# the same contiguous blocks as before.
kf = KFold(n_splits=N_SPLITS, shuffle=False)
gkf = GroupKFold(n_splits=N_SPLITS)
best_iterations = []

for trn_idx, val_idx in kf.split(train_df):
# for trn_idx, val_idx in gkf.split(train_df, groups=train_df['house']):
    tt = lgb.Dataset(train_df.loc[trn_idx, feature], train_df.loc[trn_idx, ['target']])
    vv = lgb.Dataset(train_df.loc[val_idx, feature], train_df.loc[val_idx, ['target']])

    model = lgb.train(params, tt, valid_sets=[tt, vv], verbose_eval=0)

    # `oof` was allocated but never written; store the out-of-fold predictions
    # the same way the XGBoost CV cell does.
    oof[val_idx] = model.predict(train_df.loc[val_idx, feature])
    pred += model.predict(test_df[feature])/N_SPLITS
    best_iterations.append(model.best_iteration)

# Hold-out SMAPE and RMSE on the log1p scale, then the per-fold best iterations and their mean.
smape_fast(test_df['target'], pred, False), rmse(test_df['target'], pred, False), best_iterations, np.mean(best_iterations)



HH
HH
HH
HH
HH


(37.27816052839377, 0.1918552311279994, [71, 171, 387, 245, 185], 211.8)

In [0]:
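# Experiment log (the numbers are presumably hold-out SMAPE — TODO confirm):
# - `week` as a feature: overfits
# - all summary statistics: 41.9
# - prior-week summary statistics: 40.32

# Both of the two (original Korean note: "2가지 모두")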

In [55]:
# predictions
train_df = pd.concat([train_df, test_df]).reset_index(drop=True)
params = {
    'objective':'regression',
    'boosting_type':'gbdt',
    'metric':'rmse',
    'n_jobs':-1,
    'learning_rate':0.03,
    'num_leaves': 2**8,
    'max_depth':-1,
    'subsample':0.7,
    'reg_alpha':0.1,
    'reg_lambda':0.1,
    'n_estimators':100,
    'max_bin':255,
    'verbose':-1,
    'seed': 42,
    'early_stopping_rounds':100
}

temp = pd.DataFrame(pd.date_range('2018-07-01', '2018-07-02', freq='h'), columns=['일시']).loc[:23]
temp = pd.merge(temp, weather3, how='left', on='일시')

test_df = pd.DataFrame()
for col in sub['meter_id'].apply(lambda x: x.split('X')[1]):
    temp['house'] = int(col)
    test_df = pd.concat([test_df, temp])
test_df

test_df['date'] = test_df['일시'].dt.date
test_df['holiday'] = test_df['date'].isin(holidays).astype(int)
test_df['weekday'] = test_df['일시'].dt.weekday
test_df['hour'] = test_df['일시'].dt.hour
test_df['working_hour'] = test_df['hour'].map({0:0, 1:0, 2:0, 3:0, 4:0, 5:0, 6:0, 7:0, 8:0, 
                                                9:1, 10:1, 11:1, 12:1, 13:1, 14:1, 15:1, 16:1, 17:1, 
                                                18:1, 19:1, 20:1, 21:0, 22:0, 23:0, 24:0})
test_df['is_weekend'] = test_df['일시'].dt.weekday.map({0:0, 1:0, 2:0, 3:0, 4:0, 5:1, 6:1})
test_df['is_weekend_holiday'] = (test_df['is_weekend']+test_df['holiday']).map({0:0, 1:1, 2:1})
test_df = pd.merge(test_df, train_df[['house', 'target_mean', 'target_min', 'target_max', 'target_std', 'target_quan1', 'target_quan2', 'target_quan3']], how='left', on='house')
test_df = test_df.reset_index(drop=True)

pred = np.zeros(len(test_df))

for seed in [42, 43, 44, 45, 46]:
    params['seed']=seed
    
    tt = lgb.Dataset(train_df[feature], train_df[['target']])
    model = lgb.train(params, tt, valid_sets=[tt], verbose_eval=300)
    pred += model.predict(test_df[feature])/5
pred_lgb = pred.copy()



Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[100]	training's rmse: 0.117859
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[100]	training's rmse: 0.117859
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[100]	training's rmse: 0.117859
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[100]	training's rmse: 0.117859
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[100]	training's rmse: 0.117859


In [0]:
# Map the LightGBM forecast back from the log1p scale and store it as `pred`.
# Disabled alternative, an equal-weight blend: pred = pred_lgb*0.5 + pred_xgb*0.5
test_df = test_df.assign(pred=np.expm1(pred_lgb))

In [0]:
# Submission layout: one row per meter, one column per forecast timestamp.
# `mean` replaces `sum` as the aggregator. With one row per (house, timestamp)
# the two are identical, but `sum` silently multiplies the forecast whenever a
# pair is duplicated, for example by a many-to-many join on `house`.
sub_df = test_df.groupby(['house', '일시'])['pred'].mean().unstack().reset_index()

In [60]:
# Write the LightGBM hourly forecast to the Drive submission folder.
# Drive is already mounted by the first cell (the saved output of this cell
# reported "Drive already mounted"), so it is not mounted again. A separate
# variable is used so that `path` keeps pointing at the data directory and the
# loading cell can be re-run after this one.
submit_path = 'drive/My Drive/11dacon/submit/'

sub_df.to_csv(submit_path+'hour_prediction_lgb_add_feature.csv', index=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd

In [4]:
# Re-export a pickled frame from the submission folder as CSV.
from google.colab import drive
drive.mount('/content/drive')
path = 'drive/My Drive/11dacon/submit/'
# NOTE(review): `aaa.pkl` is not written by any visible cell — confirm where it
# comes from. Unpickling can execute arbitrary code, so only load trusted files.
a = pd.read_pickle(path+'aaa.pkl')
# NOTE(review): the output name is spelled 'predicion' (missing 't'); confirm
# nothing downstream expects this exact name before correcting it.
a.to_csv(path+'hour_predicion_lgb.csv', index=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
