In [1]:
import os
import sys
import random
import argparse
from pathlib import Path

import pandas as pd
import numpy as np
import torch
import pytorch_lightning as pl

from pytorch_forecasting.data import TimeSeriesDataSet, GroupNormalizer
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
from pytorch_forecasting.metrics import SMAPE
from pytorch_forecasting.models import TemporalFusionTransformer


## TFT 샘플 코드

코드 : <https://dacon.io/competitions/official/235736/codeshare/2897>

- `num` : 건물 번호
- `date_time` : 시간
- `전력사용량(kWh)` : 전력 사용량
- 기온
- 풍속
- 습도
- 강수량
- 일조
- 비전기냉방설비운영
- 태양광 보유

In [9]:
# Columns treated as categorical features.
CATE_COLS = [
    'num', 'mgrp', 'holiday', 'dow',
    'cluster', 'hot', 'nelec_cool_flag', 'solar_flag',
]

# Building numbers grouped by cluster id (clusters come from a k-means run).
# Together the four lists cover building numbers 1..60 exactly once.
CLUSTER = {
    0: [19, 20, 21, 49, 50, 51],
    1: [1, 5, 9, 34],
    2: [4, 10, 11, 12, 28, 29, 30, 36, 40, 41, 42, 59, 60],
    3: [
        2, 3, 6, 7, 8, 13, 14, 15, 16, 17, 18, 22, 23, 24, 25, 26, 27,
        31, 32, 33, 35, 37, 38, 39, 43, 44, 45, 46, 47, 48,
        52, 53, 54, 55, 56, 57, 58,
    ],
}

# Length of the training history fed to the model for a prediction (5 weeks).
ENCODER_LENGTH_IN_WEEKS = 5

# Learning-rate schedule determined by a cv run with train data less 1 trailing
# week as validation: three constant plateaus, 67 values in total.
# NOTE(review): NUM_EPOCHS below is 66 while this list has 67 entries -- confirm
# how the consumer indexes it.
LRS = (
    [0.05099279397234306] * 52
    + [0.005099279397234306] * 9
    + [0.0005099279397234307] * 6
)

# Number of epochs found in the cv run.
NUM_EPOCHS = 66

# Number of seeds to use.
NUM_SEEDS = 10

BATCH_SIZE = 128

# Hyper-parameters determined by cv runs with train data less 1 trailing week
# as validation.
PARAMS = dict(
    gradient_clip_val=0.9658579636307634,
    hidden_size=180,
    dropout=0.19610151695402608,
    hidden_continuous_size=90,
    attention_head_size=4,
    learning_rate=0.08,
)

In [4]:
# Root directory; every other path in this cell is derived from it.
DATA_ROOT = '/energy'
# Alias without the underscore, matching the CKPTROOT / CSVROOT naming, so that
# either spelling of the root resolves to the same location.
DATAROOT = DATA_ROOT

# Locations derived from the root.
CKPTROOT = DATA_ROOT + '/ckpts'
CSVROOT = DATA_ROOT + '/csvs'
SUBFN = DATA_ROOT + '/sub.csv'
LOGDIR = DATA_ROOT + '/logs'

In [2]:
def seed_all(seed):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random``, PyTorch (CPU and all CUDA devices) and NumPy
    with the same value, stores the value in the ``PYTHONHASHSEED``
    environment variable, and sets the cuDNN flags ``deterministic=True`` /
    ``benchmark=False``.

    Parameters
    ----------
    seed : int
        Value handed to every seeding function.
    """
    seeders = (
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    os.environ['PYTHONHASHSEED'] = str(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

In [5]:
def _data_prep(df):
    df['datetime'] = pd.to_datetime(df['datetime'])
    df['hour'] = df['datetime'].dt.hour
    df['dow'] = df['datetime'].dt.weekday
    df['date'] = df['datetime'].dt.date.astype('str')
    df['day'] = df['datetime'].dt.day
    df['month'] = df['datetime'].dt.month
    
    # FEATURE : SAT, SUN and special holidays flagged as 1 in 'holiday' flag
    df['holiday'] = df['dow'].isin([5, 6]).astype(int)
    special_days = ['2020-06-06', '2020-08-15', '2020-08-17']
    df.loc[df.date.isin(special_days), 'holiday'] = 1
    
    # FEATURE : 'hot' flag when the next day is holiday
    hot = df.groupby('date').first()['holiday'].shift(-1).fillna(0).astype(int)
    hot = hot.to_frame().reset_index().rename({'holiday' : 'hot'}, axis=1)
    df = df.merge(hot, on='date', how='left')
    
    # FEATURE : 'cumhol' - how many days left in 연휴
    h = (df.groupby('date').first()['holiday'] != 0).iloc[::-1]
    df1 = h.cumsum() - h.cumsum().where(~h).ffill().fillna(0).astype(int).iloc[::-1]
    df1 = df1.to_frame().reset_index().rename({'holiday' : 'cumhol'}, axis=1)
    df = df.merge(df1, on='date', how='left')
    
    return df

In [6]:
# read data, process date and assign cluster number

def _read_df():
    """Load the train and test CSVs and return them with cluster and date features.

    ``train.csv`` and ``test.csv`` are read from ``DATA_ROOT``, stacked into a
    single frame so that the cluster id and the date-derived features are
    computed over both at once, and then split back apart by row count.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)`` as independent copies. The test file has no
        ``target`` column, so ``target`` is NaN in the test part.
    """
    train_columns = ['num', 'datetime', 'target', 'temperature', 'windspeed', 'humidity',
                     'precipitation', 'insolation', 'nelec_cool_flag', 'solar_flag']
    test_columns = [c for c in train_columns if c != 'target']

    # skiprows=[0] drops the first line of each file (presumably the original
    # header row -- confirm); names= supplies the column names used here.
    train_df = pd.read_csv(DATA_ROOT + '/train.csv', skiprows=[0], names=train_columns)
    test_df = pd.read_csv(DATA_ROOT + '/test.csv', skiprows=[0], names=test_columns)

    # Remember where the train rows end so the stacked frame can be split again.
    _sz = train_df.shape[0]

    # ignore_index avoids duplicate row labels in the stacked frame.
    df = pd.concat([train_df, test_df], ignore_index=True)

    for k, nums in CLUSTER.items():
        df.loc[df.num.isin(nums), 'cluster'] = k

    # The positional split below relies on _data_prep keeping row order and count.
    df = _data_prep(df)

    return df.iloc[:_sz].copy(), df.iloc[_sz:].copy()

In [None]:
# add aggregate(mean) target feature for 'cluster', 'building', 'mgrp' per date
def add_feats(df):
    df.reset_index(drop=True, inplace=True)
    
    cols = ['target']
    stats = ['mean']
    
    # target null in test set to n