In [1]:
# Install pinned CUDA 11.7 builds of torch / torchvision / torchaudio, using the
# PyTorch wheel index as an extra package index.
!pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu117

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu117
Collecting torch==1.13.1+cu117
  Downloading https://download.pytorch.org/whl/cu117/torch-1.13.1%2Bcu117-cp310-cp310-linux_x86_64.whl (1801.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 GB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.14.1+cu117
  Downloading https://download.pytorch.org/whl/cu117/torchvision-0.14.1%2Bcu117-cp310-cp310-linux_x86_64.whl (24.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.3/24.3 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio==0.13.1
  Downloading https://download.pytorch.org/whl/cu117/torchaudio-0.13.1%2Bcu117-cp310-cp310-linux_x86_64.whl (4.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch, torchvision, torchaudio
  Attempting uninstall: torch
    F

In [2]:
# Install the pinned pytorch_lightning and pytorch_forecasting versions this notebook was run with.
!pip install pytorch_lightning==1.9.4
!pip install pytorch_forecasting==0.10.3

Collecting pytorch_lightning==1.9.4
  Downloading pytorch_lightning-1.9.4-py3-none-any.whl (827 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m827.8/827.8 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch_lightning==1.9.4)
  Downloading torchmetrics-1.1.0-py3-none-any.whl (761 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m761.3/761.3 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.6.0.post0 (from pytorch_lightning==1.9.4)
  Downloading lightning_utilities-0.9.0-py3-none-any.whl (23 kB)
Installing collected packages: lightning-utilities, torchmetrics, pytorch_lightning
Successfully installed lightning-utilities-0.9.0 pytorch_lightning-1.9.4 torchmetrics-1.1.0
Collecting pytorch_forecasting==0.10.3
  Downloading pytorch_forecasting-0.10.3-py3-none-any.whl (141 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m141.4/141.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00

In [3]:
# Mount Google Drive at /content/drive; the data paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [101]:
import sys
import os
import argparse
import shutil
import random
from pathlib import Path

import pandas as pd
import numpy as np
import torch
import pytorch_lightning as pl

from pytorch_forecasting.data import (
    TimeSeriesDataSet,
    GroupNormalizer
)
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import (
    ModelCheckpoint,
    EarlyStopping,
    LearningRateMonitor
)
from pytorch_forecasting.metrics import SMAPE
from pytorch_forecasting.models import TemporalFusionTransformer

import tensorflow as tf
import tensorboard as tb
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile


# names of the categorical feature columns
CATE_COLS = ['building_num', "mgrp", 'holiday', 'dow', 'cluster', 'hot']


# cluster id -> building numbers in that cluster
# (original note: "building cluster based on kmeans"; each cluster is labelled
# with a building type)
CLUSTER = {
    0: list(range(1, 16)),     # other buildings
    1: list(range(16, 24)),    # public
    2: list(range(24, 32)),    # university
    3: list(range(32, 37)),    # data center
    4: list(range(37, 45)),    # department store and outlet
    5: list(range(45, 53)),    # hospital
    6: list(range(53, 61)),    # commercial
    7: list(range(61, 69)),    # apartment
    8: list(range(69, 77)),    # research institute
    9: list(range(77, 85)),    # knowledge industry center
    10: list(range(85, 93)),   # discount mart
    11: list(range(93, 101)),  # hotel and resort
}

# length of training data for prediction (5 weeks)
ENCODER_LENGTH_IN_WEEKS = 5

# learning rate determined by a cv run with train data less 1 trailing week as validation:
# 67 values in three constant segments, each 10x smaller than the previous one
LRS = (
    [0.05099279397234306] * 52
    + [0.005099279397234306] * 9
    + [0.0005099279397234307] * 6
)

# number of epochs found in cv run
NUM_EPOCHS = 100

# number of seeds to use
NUM_SEEDS = 10

# mini-batch size for the training dataloader
BATCH_SIZE = 128

# hyper parameters determined by cv runs with train data less 1 trailing week as validation
PARAMS = dict(
    gradient_clip_val=0.5,
    hidden_size=128,
    dropout=0.3,
    hidden_continuous_size=16,
    attention_head_size=4,
    learning_rate=0.001,
)



In [102]:
# Only these paths need to be adjusted to match your own environment.
DATAROOT = '/content/drive/MyDrive/electric'
CKPTROOT = f"{DATAROOT}/ckpts"  # directory for model checkpoints
CSVROOT = f"{DATAROOT}/csvs"    # directory for prediction outputs
SUBFN = f"{DATAROOT}/sub.csv"   # final submission file path
LOGDIR = f"{DATAROOT}/logs"     # log directory handed to the TensorBoard logger

In [103]:
# Load every input table from DATAROOT so that changing DATAROOT is enough to
# relocate the data (the paths were previously hard-coded a second time here).
building_df = pd.read_csv(f'{DATAROOT}/building_info.csv', encoding='UTF8')
submission_df = pd.read_csv(f'{DATAROOT}/sample_submission.csv', encoding='UTF8')
# engineered feature tables used for training / prediction
train_df = pd.read_csv(f'{DATAROOT}/final_train.csv', encoding='UTF8')
test_df = pd.read_csv(f'{DATAROOT}/final_test.csv', encoding='UTF8')
# raw tables; below they are only used to derive the time_idx column
train_day = pd.read_csv(f'{DATAROOT}/train.csv', encoding='UTF8')
test_day = pd.read_csv(f'{DATAROOT}/test.csv', encoding='UTF8')

In [104]:
# Overrides the CATE_COLS list from the constants cell: only the building number
# and the holiday flag are passed as known categoricals in load_dataset().
CATE_COLS = ['건물번호', 'holiday']

In [105]:
# Derive a per-building hourly step counter (time_idx) from the raw tables and
# attach it to the engineered feature frames.
# NOTE: this cell trims train_day in place, so reload the csv files before re-running it.
train_day = train_day.sort_values(by=['건물번호', '일시'])
train_day['time_idx'] = train_day.groupby('건물번호').cumcount()
test_day = test_day.sort_values(by=['건물번호', '일시'])

# Drop the first 168 hourly rows (one week) of every building and restart the
# counter at 0. Presumably final_train.csv does not contain that first week
# (7-day shifted feature) -- TODO confirm against the feature-engineering step.
train_day = train_day[train_day.groupby('건물번호')['time_idx'].transform('max') >= 168]
train_day = train_day[train_day['time_idx'] >= 168]
train_day = train_day.reset_index(drop=True)
train_day['time_idx'] = train_day['time_idx'] - 168

# Test steps must continue right after each building's last training step.
# Restarting them at 0 would make them collide with training steps once
# forecast() concatenates encoder (train) and decoder (test) rows.
next_step = train_day.groupby('건물번호')['time_idx'].max() + 1
test_day['time_idx'] = (
    test_day.groupby('건물번호').cumcount()
    + test_day['건물번호'].map(next_step).fillna(0).astype(int)
)

# assign() aligns on the index exactly like the previous pd.concat(axis=1), but
# overwrites an existing time_idx column instead of appending a duplicate one.
train_df = train_df.assign(time_idx=train_day['time_idx'])
test_df = test_df.assign(time_idx=test_day['time_idx'])
train_df

Unnamed: 0,건물번호,기온(C),강수량(mm),풍속(m/s),습도(%),태양광용량(kW),hour,day,month,week,...,type_hour_mean,type_hour_std,holiday,sin_time,cos_time,THI,CDH,7_shifted_전력소비량,전력소비량(kWh),time_idx
0,1,19.0,1.889045,3.2,61.0,0,0,2,6,23,...,1616.129012,909.159339,0,0.000000,1.000000,48.2249,-38.9,1085.28,1124.16,0
1,1,18.8,1.889045,2.6,61.0,0,1,2,6,23,...,1603.843635,921.827876,0,0.258819,0.965926,47.8649,-45.1,1047.36,1059.36,1
2,1,18.5,1.889045,2.6,62.0,0,2,2,6,23,...,1575.108376,916.043890,0,0.500000,0.866025,47.4096,-51.4,974.88,987.36,2
3,1,18.1,1.889045,2.2,63.0,0,3,2,6,23,...,1553.335094,908.635022,0,0.707107,0.707107,46.7941,-58.0,953.76,977.76,3
4,1,17.2,1.889045,3.2,66.0,0,4,2,6,23,...,1543.278965,900.520543,0,0.866025,0.500000,45.6064,-64.9,986.40,1009.92,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187195,100,23.1,2.533115,0.9,86.0,0,19,2,8,34,...,1728.999353,821.346561,0,-0.965926,0.258819,63.6624,-19.6,1049.52,881.04,1867
187196,100,22.4,2.533115,1.3,86.0,0,20,2,8,34,...,1648.600324,786.021787,0,-0.866025,0.500000,62.4024,-20.2,874.32,798.96,1868
187197,100,21.3,2.533115,1.0,92.0,0,21,2,8,34,...,1506.193235,757.941653,0,-0.707107,0.707107,64.1976,-22.3,678.24,825.12,1869
187198,100,21.0,2.533115,0.3,94.0,0,22,2,8,34,...,1263.618397,643.779849,0,-0.500000,0.866025,65.0744,-25.1,632.64,640.08,1870


In [106]:
def seed_all(seed):
    """Seed all random number generators used in this notebook.

    Seeds Python's ``random``, NumPy, and torch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode with
    benchmarking disabled.

    Args:
        seed: integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # every seeding function takes the same single integer argument
    for seeder in (random.seed, torch.manual_seed, torch.cuda.manual_seed_all, np.random.seed):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [107]:
# Create the checkpoint, prediction and log directories if they do not exist yet.
# A plain loop is used because the calls are made only for their side effect.
for directory in (CKPTROOT, CSVROOT, LOGDIR):
    os.makedirs(directory, exist_ok=True)

[None, None, None]

In [108]:
from pytorch_forecasting.data import (
    TimeSeriesDataSet,
    GroupNormalizer
)

In [109]:
# Show the columns available in train_df (the feature lists in load_dataset() refer to these names).
train_df.columns

Index(['건물번호', '기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)', '태양광용량(kW)', 'hour',
       'day', 'month', 'week', 'month_day_mean', 'day_hour_mean',
       'day_hour_std', 'hour_mean', 'hour_std', 'type_day_hour_mean',
       'type_day_hour_std', 'type_hour_mean', 'type_hour_std', 'holiday',
       'sin_time', 'cos_time', 'THI', 'CDH', '7_shifted_전력소비량', '전력소비량(kWh)',
       'time_idx'],
      dtype='object')

In [110]:
def load_dataset(train_df, validate=False):
    """Build the TimeSeriesDataSet(s) used to train the Temporal Fusion Transformer.

    Args:
        train_df: frame holding ``time_idx``, the group column ``건물번호``, the
            target ``전력소비량(kWh)`` and every feature column listed below.
        validate: when True, the last ``max_prediction_length`` steps (by
            ``time_idx``) are held out of the training set and a validation
            dataset is derived from the full frame.

    Returns:
        ``(tr_ds, va_ds)``; ``va_ds`` is None unless ``validate`` is True.
    """
    max_encoder_length = 24 * 7 * ENCODER_LENGTH_IN_WEEKS  # hourly steps fed to the encoder
    max_prediction_length = 24 * 7                          # one week of hourly steps
    # last time_idx that stays in the training set when a validation week is held out
    training_cutoff = train_df['time_idx'].max() - max_prediction_length

    # Inputs known for past and future steps. Each name appears exactly once:
    # the list previously repeated 'sin_time', 'cos_time', 'THI', 'CDH' and
    # '7_shifted_전력소비량', which registered those features twice.
    # NOTE(review): 'day_hour_std', 'hour_mean', 'hour_std' and
    # 'type_day_hour_mean' exist in train_df but are not used -- confirm intended.
    known_reals = [
        "time_idx",
        '기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)', '태양광용량(kW)', 'hour',
        'day', 'month', 'week', 'month_day_mean', 'day_hour_mean',
        'type_day_hour_std', 'type_hour_mean', 'type_hour_std',
        'sin_time', 'cos_time', 'THI', 'CDH', '7_shifted_전력소비량',
    ]

    tr_ds = TimeSeriesDataSet(
        train_df[lambda x: x.time_idx <= training_cutoff] if validate else train_df,
        time_idx="time_idx",
        target="전력소비량(kWh)",
        group_ids=["건물번호"],
        min_encoder_length=1,
        max_encoder_length=max_encoder_length,
        min_prediction_length=1,
        max_prediction_length=max_prediction_length,

        # known inputs
        time_varying_known_categoricals=CATE_COLS,
        time_varying_known_reals=known_reals,
        target_normalizer=GroupNormalizer(groups=["건물번호"], transformation="softplus"),

        # unknown inputs (only available for past steps)
        time_varying_unknown_categoricals=[],
        time_varying_unknown_reals=[
            "전력소비량(kWh)",
        ],

        add_relative_time_idx=True,  # add as feature
        add_target_scales=True,  # add as feature
        add_encoder_length=True,  # add as feature

        allow_missing_timesteps=True
    )

    va_ds = None
    if validate:
        # reuse the training dataset's parameters on the full frame
        va_ds = TimeSeriesDataSet.from_dataset(
            tr_ds, train_df, predict=True, stop_randomization=True
        )

    return tr_ds, va_ds


In [111]:
# Store the categorical columns as strings in both frames.
for frame in (train_df, test_df):
    for column in ('건물번호', 'holiday'):
        frame[column] = frame[column].astype('str')

In [None]:
# Build the training dataset on the full frame; with validate=False, va_ds is None.
tr_ds, va_ds = load_dataset(train_df, validate=False)
tr_ds

In [113]:
# training
def fit(seed, tr_ds, va_loader=None, max_epochs=1, limit_train_batches=30, num_workers=12):
    """Train one Temporal Fusion Transformer and return the best checkpointed model.

    Args:
        seed: seed passed to ``seed_all`` and embedded in checkpoint file names.
        tr_ds: training ``TimeSeriesDataSet``.
        va_loader: optional validation dataloader. When given, training uses
            early stopping on ``val_loss`` and ``PARAMS['learning_rate']``;
            otherwise checkpoints are ranked by ``train_loss`` and the ``LRS``
            list is passed as the learning rate.
        max_epochs: forwarded to ``pl.Trainer``. The default (1) keeps the
            previously hard-coded value; note that ``NUM_EPOCHS`` is defined in
            the constants cell but was never wired in.
        limit_train_batches: forwarded to ``pl.Trainer`` (previously hard-coded 30).
        num_workers: worker count of the training dataloader (previously hard-coded 12).

    Returns:
        The ``TemporalFusionTransformer`` reloaded from the best checkpoint.

    Raises:
        RuntimeError: if training finished without writing any checkpoint.
    """
    seed_all(seed)  # doesn't really work as training is non-deterministic

    # Test against None explicitly in every branch: truth-testing a DataLoader
    # goes through __len__, so an empty loader would have been treated as "absent".
    has_validation = va_loader is not None

    # create dataloaders for model
    tr_loader = tr_ds.to_dataloader(
        train=True, batch_size=BATCH_SIZE, num_workers=num_workers
    )

    if has_validation:
        # stop training, when loss metric does not improve on validation set
        early_stopping_callback = EarlyStopping(
            monitor="val_loss",
            min_delta=1e-4,
            patience=10,
            verbose=True,
            mode="min"
        )
        lr_logger = LearningRateMonitor(logging_interval="epoch")  # log the learning rate
        callbacks = [lr_logger, early_stopping_callback]
    else:
        # gather up to 10 checkpoints with the best training loss
        checkpoint_callback = ModelCheckpoint(
            monitor='train_loss',
            dirpath=CKPTROOT,
            filename=f'seed={seed}'+'-{epoch:03d}-{train_loss:.2f}',
            save_top_k=10
        )
        callbacks = [checkpoint_callback]

    # create trainer
    trainer = pl.Trainer(
        max_epochs=max_epochs,
        devices=1,
        accelerator="gpu",
        gradient_clip_val=PARAMS['gradient_clip_val'],
        limit_train_batches=limit_train_batches,
        callbacks=callbacks,
        logger=TensorBoardLogger(LOGDIR)
    )

    # use pre-determined learning rate schedule for final submission
    learning_rate = PARAMS['learning_rate'] if has_validation else LRS

    # initialise model with pre-determined hyperparameters
    tft = TemporalFusionTransformer.from_dataset(
        tr_ds,
        learning_rate=learning_rate,
        hidden_size=PARAMS['hidden_size'],
        attention_head_size=PARAMS['attention_head_size'],
        dropout=PARAMS['dropout'],
        hidden_continuous_size=PARAMS['hidden_continuous_size'],
        output_size=1,
        loss=SMAPE(),  # SMAPE loss
        log_interval=10,  # log example every 10 batches
        logging_metrics=[SMAPE()],
        reduce_on_plateau_patience=4,  # reduce learning automatically
    )
    print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

    kwargs = {'train_dataloaders': tr_loader}
    if has_validation:
        kwargs['val_dataloaders'] = va_loader

    # fit network
    trainer.fit(
        tft,
        **kwargs
    )

    best_model_path = trainer.checkpoint_callback.best_model_path
    print(f'best_model_path={best_model_path}')
    if not best_model_path:
        # fail with a clear message instead of trying to load an empty path
        raise RuntimeError(f"training with seed={seed} did not produce a checkpoint")
    best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

    return best_tft


In [114]:
# Train one model per seed; fit() writes its checkpoints to CKPTROOT and the
# returned model is not kept here. The commented-out lists are alternative seed sets.
#seed=[17, 1218, 20230725, 1998, 32, 40, 800, 6651, 4931, 18011810]
seed=[724, 313, 9377, 9555, 126, 877, 7777, 1004, 725, 4598723]
#seed=[724]

for s in seed:
    fit(s, tr_ds)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
   | Name                               | Type                            | Params
----------------------------------------------------------------------------------------
0  | loss                               | SMAPE                           | 0     
1  | logging_metrics                    | ModuleList                      | 0     
2  | input_embeddings                   | MultiEmbedding                  | 2.1 K 
3  | presca

Number of parameters in network: 4031.6k


  rank_zero_warn(


Training: 0it [00:00, ?it/s]

OutOfMemoryError: ignored

In [None]:
# predict 1 week
def forecast(ckpt, train_df, test_df):
    """Predict one week with the model stored in ``ckpt`` and write a csv file.

    The last ``max_encoder_length`` training steps are used as encoder input and
    ``test_df`` as the decoder window. The result is written to
    ``CSVROOT/<checkpoint stem>.csv`` in the layout of ``sample_submission.csv``.

    Args:
        ckpt: path of a ``TemporalFusionTransformer`` checkpoint.
        train_df: training frame (with ``time_idx`` and the target column).
        test_df: test frame with the same feature columns; it is not modified.
    """
    # load model
    best_tft = TemporalFusionTransformer.load_from_checkpoint(ckpt)
    dataset_params = best_tft.dataset_parameters
    max_encoder_length = dataset_params['max_encoder_length']
    max_prediction_length = dataset_params['max_prediction_length']
    # Read the column names from the checkpoint instead of hard-coding them:
    # the previous 'building_num' / 'target' names do not exist in this data.
    target = dataset_params['target']
    group_col = dataset_params['group_ids'][0]

    assert max_encoder_length == 5*24*7 and max_prediction_length == 1*24*7

    # use 5 weeks of training data at the end
    encoder_data = train_df[lambda x: x.time_idx > x.time_idx.max() - max_encoder_length]

    # work on a copy so the caller's test_df is not mutated between checkpoints
    decoder_data = test_df.copy()

    # The target is unknown for the decoder rows; fill it with the last training
    # value as a placeholder so the combined frame has no missing target values.
    placeholder = train_df[target].iloc[-1]
    if target in decoder_data.columns:
        decoder_data[target] = decoder_data[target].fillna(placeholder)
    else:
        decoder_data[target] = placeholder

    # Decoder steps must follow the encoder steps of the same building. If the
    # test time_idx starts at or before the last training step (e.g. it was
    # numbered from 0), shift it so it continues at last_train_step + 1.
    last_train_step = train_df.groupby(group_col)['time_idx'].max()
    first_test_step = decoder_data.groupby(group_col)['time_idx'].min()
    group_keys = decoder_data[group_col]
    shift = group_keys.map(last_train_step) + 1 - group_keys.map(first_test_step)
    decoder_data['time_idx'] = (
        decoder_data['time_idx'] + shift.clip(lower=0).fillna(0).astype(int)
    )

    # combine encoder and decoder data. decoder data is to be predicted
    new_prediction_data = pd.concat([encoder_data, decoder_data], ignore_index=True)
    new_raw_predictions, new_x = best_tft.predict(new_prediction_data, mode="raw", return_x=True)

    # num_labels: mapping from the building-number category to its index in
    #             new_raw_predictions['prediction'], e.g. {'5': 4, '6': 6, ...}
    num_labels = dataset_params['categorical_encoders'][group_col].classes_

    # (n_buildings, steps, 1) -> (n_buildings, steps); squeeze only the last axis
    # so a single-building prediction keeps its building dimension
    preds = new_raw_predictions['prediction'].squeeze(-1)

    sub_df = pd.read_csv(DATAROOT+"/sample_submission.csv")

    # building number of every submission row, parsed once from 'num_date_time'
    sub_building = sub_df['num_date_time'].apply(lambda x: int(x.split('_')[0]))

    # get prediction for each building (num)
    for n, ix in num_labels.items():
        sub_df.loc[sub_building == int(n), 'answer'] = preds[ix].detach().cpu().numpy()

    # save prediction to a csv file
    outfn = CSVROOT+'/'+(Path(ckpt).stem + '.csv')
    print(outfn)
    sub_df.to_csv(outfn, index=False)

In [None]:
def ensemble(outfn, csv_dir=None):
    """Write the row-wise median of all prediction csv files as the submission.

    Args:
        outfn: path of the submission file to write.
        csv_dir: directory holding the per-checkpoint prediction files; defaults
            to ``CSVROOT``.

    Raises:
        FileNotFoundError: if ``csv_dir`` contains no prediction csv file.
    """
    csv_dir = Path(CSVROOT if csv_dir is None else csv_dir)
    out_path = Path(outfn).resolve()

    # Sorted for a deterministic order. The output file itself is skipped so a
    # stale submission stored in the same directory is never treated as a prediction.
    fns = sorted(p for p in csv_dir.glob("*.csv") if p.resolve() != out_path)
    if not fns:
        raise FileNotFoundError(f"no prediction csv files found in {csv_dir}")

    frames = [pd.read_csv(fn) for fn in fns]

    # one 'answer' column per prediction file, aligned on the row index
    answers = pd.concat([frame['answer'] for frame in frames], axis=1)

    # get median of all predictions
    df = pd.DataFrame({
        'num_date_time': frames[0]['num_date_time'],
        'answer': answers.median(axis=1),
    })

    # save to submission file
    df.to_csv(outfn, index=False)

# not used for final submission
def validate(seed, tr_ds, va_ds):
    """Train with a validation set and print the SMAPE of the best model.

    Args:
        seed: seed forwarded to ``fit``.
        tr_ds: training dataset.
        va_ds: validation dataset; converted to a non-training dataloader here.
    """
    va_loader = va_ds.to_dataloader(train=False, batch_size=BATCH_SIZE*10, num_workers=12)
    best_tft = fit(seed, tr_ds, va_loader)

    # collect the first element of every batch's y
    target_batches = []
    for _, y in iter(va_loader):
        target_batches.append(y[0])
    actuals = torch.cat(target_batches)

    predictions = best_tft.predict(va_loader)

    # unreduced SMAPE averaged over dim 1, then the overall mean
    unreduced_smape = SMAPE(reduction="none")
    smape_per_num = unreduced_smape(predictions, actuals).mean(1)
    print(smape_per_num)
    print(smape_per_num.mean())

In [None]:
import glob
print("### FORECAST ###")
# write one prediction csv per checkpoint file found in CKPTROOT
for p in glob.glob(CKPTROOT + "/*.ckpt"):
    forecast(p, train_df, test_df)

### FORECAST ###


  rank_zero_warn(


In [None]:
print("### ENSEMBLING ###")
# NOTE(review): CSVROOT has no trailing '/', so this path is '<...>/csvssubmit_batchlimit150.csv',
# i.e. a file next to the csvs directory rather than inside it. The read-back cell uses the
# same expression, so change both together (or use SUBFN) if this is not intended.
ensemble(CSVROOT + 'submit_batchlimit150.csv')

In [None]:
# Read the ensembled submission back for inspection.
# NOTE(review): same path expression as the ensemble() call (no '/' between CSVROOT and the
# file name); keep the two in sync.
a = pd.read_csv(CSVROOT + 'submit_batchlimit150.csv')
a

In [None]:
# NOTE(review): best_tft and new_prediction_data are only defined as locals inside forecast(),
# not at notebook level, which matches the NameError recorded for this cell. Load a checkpoint
# and rebuild the prediction frame before running it.
# NOTE(review): forecast() unpacks predict(..., return_x=True) as a 2-tuple, while this cell
# accesses .x / .output on the result -- confirm which form the installed version returns.
new_raw_predictions = best_tft.predict(new_prediction_data, mode="raw", return_x=True)

for idx in range(10):  # plot 10 examples
    best_tft.plot_prediction(new_raw_predictions.x, new_raw_predictions.output, idx=idx, show_future_observed=False)

NameError: ignored

In [None]:
# NOTE(review): neither best_tft nor raw_predictions is defined at notebook level in the cells
# shown; the previous cell names its result new_raw_predictions -- TODO confirm intended variable.
interpretation = best_tft.interpret_output(raw_predictions.output, reduction="sum")
best_tft.plot_interpretation(interpretation)