In [1]:
# install
!pip install workalendar
from workalendar.asia import SouthKorea

# !pip3 install tsfresh
# from tsfresh.feature_extraction import MinimalFCParameters, EfficientFCParameters ,ComprehensiveFCParameters



In [2]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import os
import gc
import tqdm
import datetime
import random
from collections import defaultdict
from sklearn.neural_network import MLPRegressor

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import Lasso, Ridge
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold, TimeSeriesSplit, GroupKFold

# model
import xgboost as xgb
import lightgbm as lgb

# evaluation
from sklearn.metrics import mean_squared_error

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Mount Google Drive (Colab only) so the files under `path` are reachable.
from google.colab import drive
drive.mount('/content/drive')
# Directory prefix for the input files; later cells build file names as `path + '<file>'`.
path = 'drive/My Drive/11dacon/data/'

def seed_everything(seed=0):
    """Seed Python's `random` module and NumPy's global RNG with `seed`."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
from numba import jit
import math

def smape_fast(y_true, y_pred, exp=True):
    """Symmetric mean absolute percentage error, expressed in percent.

    Computes ``200 / n * sum(|a - b| / (a + b))`` over the element pairs of
    ``y_true`` and ``y_pred``. Pairs whose sum is exactly zero contribute
    nothing to the sum but still count towards ``n``.

    The previous implementation was wrapped in a bare ``@jit``. It only worked
    through Numba's object-mode fallback, because the notebook passes pandas
    Series and the body rebinds its arguments to different types. Newer Numba
    releases compile in nopython mode by default and reject such inputs. A
    vectorised NumPy version needs no compilation and accepts any 1-D
    array-like.

    Parameters
    ----------
    y_true, y_pred : 1-D array-like of numbers, same length
        Actual and predicted values.
    exp : bool, default True
        If True, both inputs are mapped through ``np.expm1`` first (i.e. they
        are treated as ``log1p``-transformed values).

    Returns
    -------
    float
        The SMAPE value; ``nan`` when the inputs are empty.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    actual = np.asarray(y_true, dtype=np.float64)
    forecast = np.asarray(y_pred, dtype=np.float64)
    if actual.shape != forecast.shape:
        raise ValueError(
            'y_true and y_pred must have the same shape, got %s and %s'
            % (actual.shape, forecast.shape)
        )
    if exp:
        actual = np.expm1(actual)
        forecast = np.expm1(forecast)

    n_obs = actual.shape[0] if actual.ndim else 0
    if n_obs == 0:
        # The metric is undefined without observations (the old code divided by zero here).
        return float('nan')

    denom = actual + forecast
    # Skip pairs with a zero denominator, exactly as the original loop did.
    usable = denom != 0
    ratio_sum = np.sum(np.abs(actual[usable] - forecast[usable]) / denom[usable])
    return float(ratio_sum * 200.0 / n_obs)

def rmse(y_true, y_pred, exp=True):
    """Root mean squared error between `y_true` and `y_pred`.

    With `exp=True` both inputs are first mapped through `np.expm1`, so the
    error is measured on the back-transformed scale; with `exp=False` the
    inputs are compared as given.
    """
    if exp:
        y_true, y_pred = np.expm1(y_true), np.expm1(y_pred)
    return np.sqrt(mean_squared_error(y_true, y_pred))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Raw inputs, all read from the Drive folder stored in `path`.
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')
sub = pd.read_csv(path + 'submission.csv')

# Holiday dates for 2018, 2017 and 2016 (in that order) as one flat Series;
# column 0 of each workalendar result is kept.
holidays = pd.concat(
    [pd.Series(np.array(SouthKorea().holidays(year))[:, 0]) for year in (2018, 2017, 2016)]
).reset_index(drop=True)

# Hourly weather table: keep seven columns by position and give them fixed names.
weather3 = (
    pd.read_csv(path + 'weather_hour.csv', encoding='cp949')
    .iloc[:, [1, 2, 3, 4, 5, 7, 8]]
)
weather3.columns = ['일시', '기온', '강수량', '풍속', '습도', '날씨', '전운량']

# Label-encode the weather column (missing -> 'no value'), zero-fill every
# remaining gap, then parse the timestamp column used as the merge key.
weather3['날씨'] = LabelEncoder().fit_transform(weather3['날씨'].fillna('no value'))
weather3 = weather3.fillna(0)
weather3['일시'] = pd.to_datetime(weather3['일시'])

In [0]:
def _iso_week(timestamps):
    """Return the ISO week number of a datetime Series as int64.

    `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0, so
    `dt.isocalendar().week` is used when available, with `dt.week` as the
    fallback for older pandas.
    """
    accessor = timestamps.dt
    if hasattr(accessor, 'isocalendar'):
        return accessor.isocalendar().week.astype('int64')
    return accessor.week


def _add_calendar_features(frame, house, holiday_dates):
    """Add the house id and all calendar columns derived from frame['Time'] (mutates `frame`)."""
    frame['house'] = house
    frame['Time'] = pd.to_datetime(frame['Time'])
    frame['month'] = frame['Time'].dt.month
    frame['date'] = frame['Time'].dt.date
    frame['holiday'] = frame['date'].isin(holiday_dates).astype(int)
    frame['week'] = _iso_week(frame['Time'])
    frame['weekday'] = frame['Time'].dt.weekday
    frame['hour'] = frame['Time'].dt.hour
    # 09:00-20:59 is flagged as working time.
    frame['working_hour'] = frame['hour'].between(9, 20).astype(int)
    # weekday 5/6 = Saturday/Sunday.
    frame['is_weekend'] = (frame['weekday'] >= 5).astype(int)
    frame['is_weekend_holiday'] = ((frame['is_weekend'] + frame['holiday']) > 0).astype(int)
    return frame


def _target_summary(target):
    """Return the whole-history summary statistics of one house's target, keyed by column name."""
    summary = {
        'target_mean': target.mean(),
        'target_min': target.min(),
        'target_max': target.max(),
        'target_std': target.std(),
    }
    # NOTE(review): .675 and .925 look like typos for .625 / .875; kept unchanged
    # so the feature values stay what the models were tuned on.
    for number, q in enumerate((.25, .5, .75, .125, .375, .675, .925), start=1):
        summary['target_quan%d' % number] = target.quantile(q)
    return summary


def _prior_period_lookups(history, period):
    """Return {column name: {period + 1: statistic}} for the target grouped by `period`.

    Shifting the group key by one means that mapping a row's own `period`
    through a lookup yields the statistic of the *previous* week/month.
    """
    grouped = history.groupby(period)['target']
    stats = {
        'prior_target_mean': grouped.mean(),
        'prior_target_min': grouped.min(),
        'prior_target_max': grouped.max(),
        'prior_target_std': grouped.std(),
        'prior_target_quan1': grouped.quantile(.25),
        'prior_target_quan2': grouped.quantile(.5),
        'prior_target_quan3': grouped.quantile(.75),
    }
    lookups = {}
    for name, series in stats.items():
        lookups[name] = dict(zip(series.index + 1, series.values))
    return lookups


def merge(train, test_out=False, holiday_dates=None, weather=None, forecast_day='2018-07-01'):
    """Reshape a wide per-house table into a long feature table joined with weather.

    `train` must have a 'Time' column first, followed by one column per house
    named like 'X<number>'. For each house the rows from 2018-01-01 onwards
    are kept and calendar features, whole-history target statistics and
    previous-week / previous-month target statistics are added.

    Fixes compared with the previous version:
    * the forecast rows took `is_weekend` from the first rows of the training
      frame (index-aligned) instead of from their own timestamps;
    * `Series.dt.week` no longer exists in pandas 2.x;
    * the 14 copy-pasted prior-week/month stanzas and the per-iteration
      `pd.concat` are replaced by helpers and one final concat.

    Parameters
    ----------
    train : pandas.DataFrame
        Wide table as described above.
    test_out : bool, default False
        False: return the historical rows (with 'target'); rows lacking a
        previous week or previous month are dropped. True: return 24 hourly
        rows of `forecast_day` per house (no 'target'), carrying the
        statistics computed from that house's history.
    holiday_dates : iterable of datetime.date, optional
        Dates flagged as holidays. Defaults to the notebook-level `holidays`.
    weather : pandas.DataFrame, optional
        Table with a datetime '일시' column, left-joined onto the result.
        Defaults to the notebook-level `weather3`.
    forecast_day : str, default '2018-07-01'
        First timestamp of the 24-hour forecast frame built when `test_out`.

    Returns
    -------
    pandas.DataFrame
        Long table keyed by '일시' (renamed from 'Time') and 'house'.

    Raises
    ------
    ValueError
        If `train` has no house columns after 'Time'.
    """
    if holiday_dates is None:
        holiday_dates = holidays
    if weather is None:
        weather = weather3

    frames = []
    for col in train.columns[1:]:
        house = int(col.replace('X', ''))

        history = train[['Time', col]].dropna().rename(columns={col: 'target'})
        # Filter before parsing, as the original did; assumes ISO-like 'Time' strings.
        history = history[history['Time'] >= '2018-01-01'].reset_index(drop=True)
        history = _add_calendar_features(history, house, holiday_dates)

        if test_out:
            out = pd.DataFrame({'Time': pd.date_range(forecast_day, periods=24, freq='h')})
            out = _add_calendar_features(out, house, holiday_dates)
        else:
            out = history

        for name, value in _target_summary(history['target']).items():
            out[name] = value

        for period, suffix in (('week', ''), ('month', '_month')):
            for name, lookup in _prior_period_lookups(history, period).items():
                out[name + suffix] = out[period].map(lookup)

        if not test_out:
            # Rows in the first observed week/month have no prior statistics.
            out = out.dropna()
        frames.append(out)

    if not frames:
        raise ValueError("`train` has no house columns after 'Time'")

    combined = pd.concat(frames).reset_index(drop=True).rename(columns={'Time': '일시'})
    return pd.merge(combined, weather, how='left', on='일시')

In [0]:
# train_df2 = merge(train)
# train_df = pd.concat([train_df, train_df2]).reset_index(drop=True)

# Long feature table built from the `test` file, with the target moved to log1p scale.
train_df = merge(test).assign(target=lambda frame: np.log1p(frame['target']))

# Rows from 2018-06-30 00:00 onwards form the hold-out set; everything earlier is used for fitting.
test_df = train_df.loc[train_df['일시'] >= '2018-06-30 00'].reset_index(drop=True)
train_df = train_df.loc[train_df['일시'] < '2018-06-30 00'].reset_index(drop=True)

In [0]:
# train_df['Time'] = train_df['Time'].astype(str)

In [0]:
# from tsfresh import extract_features
# extracted_features = extract_features(train_df.reset_index(drop=True).reset_index()[['index', 'target', 'Time']], column_id="index", column_sort="Time", n_jobs=16, chunksize=None)

In [0]:
# from tsfresh import select_features
# from tsfresh.utilities.dataframe_functions import impute

# impute(extracted_features)
# features_filtered = select_features(extracted_features, train_df.reset_index(drop=True).reset_index()['target'])

##### XGB

In [0]:
# XGBoost cross-validation: 5 contiguous (unshuffled) folds over `train_df`,
# each fold's model also scores the hold-out frame `test_df`.
# NOTE(review): `xgb.train` takes its round count from `num_boost_round`;
# 'n_estimators' in `params` is presumably ignored by the native API - confirm.
params = {
    'objective':'reg:squarederror',
    'n_estimators':10000,
    'max_depth':2**3,
    'learning_rate':0.03,
    'n_jobs':-1,
    'seed':42
}

oof = np.zeros(len(train_df))
pred = np.zeros(len(test_df))

feature = [i for i in train_df.columns if i not in ['target', '일시', 'date']]
# `random_state` has no effect without shuffling, and recent scikit-learn
# raises ValueError when it is combined with shuffle=False; the folds are unchanged.
kf = KFold(n_splits=5, shuffle=False)
gkf = GroupKFold(n_splits=5)
best_iterations = []

for trn_idx, val_idx in kf.split(train_df):
# for trn_idx, val_idx in gkf.split(train_df, groups=train_df['house']):
    tt = xgb.DMatrix(train_df.loc[trn_idx, feature], train_df.loc[trn_idx, ['target']])
    vv = xgb.DMatrix(train_df.loc[val_idx, feature], train_df.loc[val_idx, ['target']])
    
    model = xgb.train(params, tt, num_boost_round=5000, evals=[(tt, 'train'), (vv, 'val')], early_stopping_rounds=200, verbose_eval=0)
    
    oof[val_idx] = model.predict(xgb.DMatrix(train_df.loc[val_idx, feature]))
    # Average the hold-out predictions over the folds.
    pred += model.predict(xgb.DMatrix(test_df[feature]))/kf.get_n_splits()
    best_iterations.append(model.best_iteration)
# Hold-out SMAPE and RMSE (both on the log1p scale, exp=False) plus the per-fold best iterations.
smape_fast(test_df['target'], pred, False), rmse(test_df['target'], pred, False), best_iterations, np.mean(best_iterations)

In [0]:
# predictions
# Final XGBoost fit on all labelled rows (fit + hold-out), averaged over seeds.
#
# The previous version of this cell referenced `weather1` and `day_range`,
# which are not defined anywhere in this notebook, and built a monthly frame
# that lacks the columns in `feature`, so it could not run here. It also
# overwrote `train_df` / `test_df`, which the LightGBM CV cell below still
# needs in their fit / hold-out form. This version builds the forecast frame
# with `merge(test, test_out=True)`, as the LightGBM prediction cell does, and
# keeps its data under separate names.
params = {
    'objective':'reg:squarederror',
    'n_estimators':10000,
    'max_depth':2**3,
    'learning_rate':0.03,
    'n_jobs':-1,
    'seed':42
}

xgb_full_df = pd.concat([train_df, test_df]).reset_index(drop=True)
xgb_forecast_df = merge(test, test_out=True)

xgb_seeds = [42, 43, 44, 45, 46]
pred_xgb = np.zeros(len(xgb_forecast_df))

# The matrices do not depend on the seed, so build them once.
tt = xgb.DMatrix(xgb_full_df[feature], xgb_full_df[['target']])
xgb_forecast_matrix = xgb.DMatrix(xgb_forecast_df[feature])

for seed in xgb_seeds:
    # Per-seed copy so the shared `params` dict is not mutated.
    seed_params = dict(params, seed=seed)
    # NOTE(review): 250 rounds is the original hard-coded value - confirm it against the CV best iterations.
    model = xgb.train(seed_params, tt, num_boost_round=250, evals=[(tt, 'train')], verbose_eval=100)
    pred_xgb += model.predict(xgb_forecast_matrix)/len(xgb_seeds)

##### LGB

In [7]:
# LightGBM cross-validation: 5 contiguous (unshuffled) folds over `train_df`,
# each fold's model also scores the hold-out frame `test_df`.
params = {
    'objective':'regression',
    'boosting_type':'gbdt',
    'metric':'rmse',
    'n_jobs':-1,
    'learning_rate':0.03,
    'num_leaves': 2**8,
    'max_depth':-1,
    'tree_learner':'serial',
    'colsample_bytree': 0.7,
    'subsample_freq':1,
    'subsample':0.7,
    'reg_alpha':0.1,
    'reg_lambda':0.1,
    'n_estimators':10000,
    'max_bin':255,
    'verbose':-1,
    'seed': 42,
    'early_stopping_rounds':100
}

oof = np.zeros(len(train_df))
pred = np.zeros(len(test_df))

# 'month' and 'week' are excluded here (the notes cell below records `week` as overfitting).
feature = [i for i in train_df.columns if i not in ['target', '일시', 'date', 'month', 'week']]
# `random_state` has no effect without shuffling, and recent scikit-learn
# raises ValueError when it is combined with shuffle=False; the folds are unchanged.
kf = KFold(n_splits=5, shuffle=False)
gkf = GroupKFold(n_splits=5)
best_iterations = []

for trn_idx, val_idx in kf.split(train_df):
    tt = lgb.Dataset(train_df.loc[trn_idx, feature], train_df.loc[trn_idx, ['target']])
    vv = lgb.Dataset(train_df.loc[val_idx, feature], train_df.loc[val_idx, ['target']])
    
    # NOTE(review): early stopping is given both here (200) and in `params` (100) - confirm which one wins.
    model = lgb.train(params, tt, valid_sets=[tt, vv], early_stopping_rounds=200, verbose_eval=0)
    
    # Out-of-fold predictions: `oof` was allocated but never filled before.
    oof[val_idx] = model.predict(train_df.loc[val_idx, feature])
    # Average the hold-out predictions over the folds.
    pred += model.predict(test_df[feature])/kf.get_n_splits()
    best_iterations.append(model.best_iteration)
    print('HH')
    del model
    gc.collect()
    
# Hold-out SMAPE and RMSE (both on the log1p scale, exp=False) plus the per-fold best iterations.
smape_fast(test_df['target'], pred, False), rmse(test_df['target'], pred, False), best_iterations, np.mean(best_iterations)



HH
HH
HH
HH
HH


(37.08619982149266, 0.19718739171855024, [240, 879, 1169, 1399, 1462], 1029.8)

In [0]:
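# Experiment notes (scores are presumably the hold-out SMAPE reported above; lower is better):
# - `week` as a feature: overfits
# - all summary statistics: 41.9
# - prior-week summary statistics: 40.32
# - using both the train and test datasets: 37.x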

In [41]:
# predictions
# Final LightGBM fit on all labelled rows (fit + hold-out), averaged over seeds.
# NOTE: this cell is not idempotent - it rebinds `train_df` and `test_df`, so
# re-running it without re-running the cells above would append the unlabelled
# forecast frame to the training data.
train_df = pd.concat([train_df, test_df]).reset_index(drop=True)
test_df = merge(test, test_out=True)

pred = np.zeros(len(test_df))

# Work on a copy so the CV `params` dict stays untouched. Early stopping is
# dropped: only the training set is evaluated here, so there is no validation
# score to stop on, and the round count is fixed at 1050 (roughly the mean
# best iteration from the CV cell).
final_params = {key: value for key, value in params.items() if key != 'early_stopping_rounds'}
final_params['n_estimators'] = 1050

lgb_seeds = [42, 43, 44, 45, 46]
for seed in lgb_seeds:
    final_params['seed'] = seed
    
    tt = lgb.Dataset(train_df[feature], train_df[['target']])
    # `valid_sets=[tt]` is kept only so the training RMSE is logged every 300 rounds.
    model = lgb.train(final_params, tt, valid_sets=[tt], verbose_eval=300)
    pred += model.predict(test_df[feature])/len(lgb_seeds)
    del model
    gc.collect()
pred_lgb = pred.copy()



Training until validation scores don't improve for 100 rounds.
[300]	training's rmse: 0.122338
[600]	training's rmse: 0.117379
[900]	training's rmse: 0.114166
Did not meet early stopping. Best iteration is:
[1050]	training's rmse: 0.112878
Training until validation scores don't improve for 100 rounds.
[300]	training's rmse: 0.122389
[600]	training's rmse: 0.117264
[900]	training's rmse: 0.114041
Did not meet early stopping. Best iteration is:
[1050]	training's rmse: 0.112793
Training until validation scores don't improve for 100 rounds.
[300]	training's rmse: 0.122225
[600]	training's rmse: 0.117219
[900]	training's rmse: 0.114048
Did not meet early stopping. Best iteration is:
[1050]	training's rmse: 0.112842
Training until validation scores don't improve for 100 rounds.
[300]	training's rmse: 0.122347
[600]	training's rmse: 0.117404
[900]	training's rmse: 0.114182
Did not meet early stopping. Best iteration is:
[1050]	training's rmse: 0.112907
Training until validation scores don't i

##### submission

In [0]:
# pred = pred_lgb*0.5 + pred_xgb*0.5
# Undo the log1p transform so the LightGBM forecast is on the original target scale.
test_df = test_df.assign(pred=np.expm1(pred_lgb))

In [0]:
# Wide table: one row per house, one column per forecast timestamp.
sub_df = (
    test_df
    .groupby(['house', '일시'])['pred']
    .sum()
    .unstack()
    .reset_index()
)

In [51]:
# From here on `path` points at the submission folder instead of the data folder.
path = 'drive/My Drive/11dacon/submit/'

from google.colab import drive
drive.mount('/content/drive')

# Write the wide forecast table as CSV without the index column.
sub_df.to_csv(f'{path}hour_predicion_lgb.csv', index=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


TypeError: ignored

In [0]:
# Also keep a pickled copy of the forecast table in the folder `path` points at.
sub_df.to_pickle(f'{path}aaa.pkl')