# [ LG에너지 솔루션_DX_Intensive_Course ] 시계열 데이터 분석을 위한 딥러닝

## 트랜스포머 기반의 시계열 데이터 분석

In [None]:
python src/main.py 
--output_dir path/to/experiments 
--comment "regression from Scratch" 
--name $1_fromScratch_Regression 
--records_file Regression_records.xls 
--data_dir path/to/Datasets/Regression/$1/ 
--data_class tsra 
--pattern TRAIN 
--val_pattern TEST 
--epochs 100 
--lr 0.001 
--optimizer RAdam  
--pos_encoding learnable 
--task regression

In [None]:
python src/main.py 
--output_dir experiments 
--comment "pretraining through imputation" 
--name $1_pretrained 
--records_file Imputation_records.xls 
--data_dir /path/to/$1/ 
--data_class tsra 
--pattern TRAIN 
--val_ratio 0.2 
--epochs 700 
--lr 0.001 
--optimizer RAdam 
--batch_size 32 
--pos_encoding learnable 
--d_model 128

In [None]:
python src/main.py 
--output_dir experiments 
--comment "finetune for regression" 
--name BeijingPM25Quality_finetuned 
--records_file Regression_records.xls 
--data_dir /path/to/Datasets/Regression/BeijingPM25Quality/ 
--data_class tsra 
--pattern TRAIN 
--val_pattern TEST  
--epochs 200 
--lr 0.001 
--optimizer RAdam 
--pos_encoding learnable 
--d_model 128 
--load_model path/to/BeijingPM25Quality_pretrained/checkpoints/model_best.pth 
--task regression 
--change_output 
--batch_size 128

## SETP 0. 환경 구축하기

In [52]:
!pip install -r ./mvts_transformer/requirements.txt

Collecting xlutils
  Downloading xlutils-2.0.0-py2.py3-none-any.whl (55 kB)
     |████████████████████████████████| 55 kB 351 kB/s             


Installing collected packages: xlutils
Successfully installed xlutils-2.0.0


In [103]:
import os
import sys
import json
import time
import random
import importlib
import numpy as np
import pandas as pd
import pickle
from easydict import EasyDict
import matplotlib.pyplot as plt

import torch
import torch.backends.cudnn as cudnn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import torch.nn as nn

# from trainer import Trainer
from mvts_transformer.src.options import Options
from mvts_transformer.src.running import setup, pipeline_factory, validate, check_progress, NEG_METRICS
from mvts_transformer.src.utils import utils
from mvts_transformer.src.datasets.data import data_factory, Normalizer
from mvts_transformer.src.datasets.datasplit import split_dataset
from mvts_transformer.src.models.ts_transformer import model_factory
from mvts_transformer.src.models.loss import get_loss_module
from mvts_transformer.src.optimizers import get_optimizer

import warnings
warnings.filterwarnings("ignore")

#check torch version & device
print ("Python version:[%s]."%(sys.version))
print ("PyTorch version:[%s]."%(torch.__version__))
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print ("device:[%s]."%(device)) # device에 cuda:0가 프린트 된다면 GPU를 사용하는 상태입니다

Python version:[3.6.9 (default, Dec  8 2021, 21:08:43) 
[GCC 8.4.0]].
PyTorch version:[1.7.1].
device:[cuda:0].


In [4]:
# set random seed 

def set_seed(random_seed):
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(random_seed)
    random.seed(random_seed)
    
random_seed = 42
set_seed(random_seed)

### SETP 1. 데이터 준비하기

금일 실습에서는 BeijingPM25 데이터를 활용하여 시계열 회귀를 진행합니다.
* 해당 데이터는 연구용 시계열 데이터로 실제 데이터가 아닌 합성 데이터들로 이루어짐
* 여러가지 데이터중 한가지 시계열 데이터를 가져와 학습 및 추론을 수행함
* 데이터셋 출처
    * https://webscope.sandbox.yahoo.com/catalog.php?datatype=s&did=70

In [68]:
args2 = EasyDict({
    # for dataloader 
    'output_dir': './mvts_transformer/output',
    'data_dir': './mvts_transformer/data/BeijingPM25Quality',
    'name': 'pretrained',
    'records_file': 'Imputation_records.xls',
    
    
    # System
    'print_interval': 1,
    'gpu': 0,
    'n_proc': -1,
    'num_workers': 0,
    
    # Dataset
    'limit_size': None,
    'data_class': 'tsra',
    'pattern': 'TRAIN',
    'val_ratio': 0.2,
    'epoch': 700,
    'lr': 0.001,
    'optimizer': 'RAdam',
    'batch_size': 32,
    'pos_encoding': 'learnable',
    'd_model': 128
    
#     'root_path':'./LG_ES_Transformer/data/yahoo_S5/A2Benchmark/',
#     'data_name':'synthetic_1.csv',
#     'num_features':1,
#     'make_plot':True,
#     'test_ratio':0.6,
#     'valid_ratio':0.1,
#     'normal':True,
#     'window_size':48,
#     'batch_size':128,
#     'slide_size':1,
#     'forecast':True,
#     'forecast_step':1,

#     # for training
#     'training':True,
#     'checkpoint':'best',
#     'cuda':True,
#     'n_feature':1,
#     'lr':0,
#     'epochs':30,
#     'step_size':5,
#     'gamma':1.0,
#     'version':0,

#     # for gradient clipping
#     'clip':False,
#     'max_norm':1.0,
    
#     # for early stopping
#     'early_stopping':True,
#     'patience':300,
#     'save_list':False,
#     'min_epoch':0,

#     # for Transformer
#     'feature_size':256,
#     'num_layers':1,
#     'dropout':0.1,
#     'nhead':8,
#     'activation':None,
#     'initrange':0.1,

#     # for model
#     'model':'STOC',
#     'trend_learning':False,
    
#     # for testing
#     'eval_plot':True,
#     'anomaly_plot':True,
#     'shuffle':False,
#     'pred_one':False,

#     # for save
#     'experiment_name':'STOC_test'
})

In [26]:
from mvts_transformer.src.options import Options

In [39]:
args = Options()

In [40]:
args = args.parser.parse_args([])

In [41]:
args.__dict__

{'config_filepath': None,
 'output_dir': './output',
 'data_dir': './data',
 'load_model': None,
 'resume': False,
 'change_output': False,
 'save_all': False,
 'experiment_name': '',
 'comment': '',
 'no_timestamp': False,
 'records_file': './records.xls',
 'console': False,
 'print_interval': 1,
 'gpu': '0',
 'n_proc': -1,
 'num_workers': 0,
 'seed': None,
 'limit_size': None,
 'test_only': None,
 'data_class': 'weld',
 'labels': None,
 'test_from': None,
 'test_ratio': 0,
 'val_ratio': 0.2,
 'pattern': None,
 'val_pattern': None,
 'test_pattern': None,
 'normalization': 'standardization',
 'norm_from': None,
 'subsample_factor': None,
 'task': 'imputation',
 'masking_ratio': 0.15,
 'mean_mask_length': 3,
 'mask_mode': 'separate',
 'mask_distribution': 'geometric',
 'exclude_feats': None,
 'mask_feats': '0, 1',
 'start_hint': 0.0,
 'end_hint': 0.0,
 'harden': False,
 'epochs': 400,
 'val_interval': 2,
 'optimizer': 'Adam',
 'lr': 0.001,
 'lr_step': '1000000',
 'lr_factor': '0.1',
 'b

In [69]:
args.__dict__.update(args2)

In [70]:
args.__dict__

{'config_filepath': None,
 'output_dir': './mvts_transformer/output',
 'data_dir': './mvts_transformer/data/BeijingPM25Quality',
 'load_model': None,
 'resume': False,
 'change_output': False,
 'save_all': False,
 'experiment_name': '',
 'comment': '',
 'no_timestamp': False,
 'records_file': 'Imputation_records.xls',
 'console': False,
 'print_interval': 1,
 'gpu': 0,
 'n_proc': -1,
 'num_workers': 0,
 'seed': None,
 'limit_size': None,
 'test_only': None,
 'data_class': 'tsra',
 'labels': None,
 'test_from': None,
 'test_ratio': 0,
 'val_ratio': 0.2,
 'pattern': 'TRAIN',
 'val_pattern': None,
 'test_pattern': None,
 'normalization': 'standardization',
 'norm_from': None,
 'subsample_factor': None,
 'task': 'imputation',
 'masking_ratio': 0.15,
 'mean_mask_length': 3,
 'mask_mode': 'separate',
 'mask_distribution': 'geometric',
 'exclude_feats': None,
 'mask_feats': '0, 1',
 'start_hint': 0.0,
 'end_hint': 0.0,
 'harden': False,
 'epochs': 400,
 'val_interval': 2,
 'optimizer': 'RAdam

In [59]:
!mkdir ./mvts_transformer/output

In [60]:
config = setup(args)

2023-06-04 11:19:41,160 | INFO : Stored configuration file in './mvts_transformer/output/_2023-06-04_11-19-41_Nbr'


In [65]:
data_class = data_factory[config['data_class']]
data_class

mvts_transformer.src.datasets.data.TSRegressionArchive

In [71]:
my_data = data_class(config['data_dir'], pattern=config['pattern'], n_proc=config['n_proc'], limit_size=config['limit_size'], config=config)

11942it [00:40, 294.65it/s]


In [73]:
feat_dim = my_data.feature_df.shape[1]

In [74]:
my_data.feature_df

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8
0,4.0,7.0,300.0,77.0,-0.7,1023.0,-18.8,0.0,4.4
0,4.0,7.0,300.0,77.0,-1.1,1023.2,-18.2,0.0,4.7
0,5.0,10.0,300.0,73.0,-1.1,1023.5,-18.2,0.0,5.6
0,11.0,11.0,300.0,72.0,-1.4,1024.5,-19.4,0.0,3.1
0,12.0,12.0,300.0,72.0,-2.0,1025.2,-19.5,0.0,2.0
...,...,...,...,...,...,...,...,...,...
11917,27.0,96.0,3300.0,9.0,-1.4,1026.3,-8.6,0.0,1.0
11917,34.0,99.0,3700.0,9.0,-2.5,1026.2,-8.4,0.0,1.3
11917,31.0,95.0,3100.0,9.0,-2.7,1025.8,-8.0,0.0,0.9
11917,40.0,99.0,4200.0,13.0,-3.5,1025.5,-7.6,0.0,0.4


In [75]:
validation_method = 'ShuffleSplit'
labels = None

In [76]:
test_data = my_data
test_indices = None  # will be converted to empty list in `split_dataset`, if also test_set_ratio == 0
val_data = my_data
val_indices = []

In [77]:
if config['val_pattern']:  # used if val data come from different files / file patterns
    val_data = data_class(config['data_dir'], pattern=config['val_pattern'], n_proc=-1, config=config)
    val_indices = val_data.all_IDs

In [80]:
config['val_pattern']

In [81]:
if config['val_ratio'] > 0:
    train_indices, val_indices, test_indices = split_dataset(data_indices=my_data.all_IDs,
                                                                validation_method=validation_method,
                                                                n_splits=1,
                                                                validation_ratio=config['val_ratio'],
                                                                test_set_ratio=config['test_ratio'],  # used only if test_indices not explicitly specified
                                                                test_indices=test_indices,
                                                                random_seed=1337,
                                                                labels=labels)
    train_indices = train_indices[0]  # `split_dataset` returns a list of indices *per fold/split*
    val_indices = val_indices[0]  # `split_dataset` returns a list of indices *per fold/split*
else:
    train_indices = my_data.all_IDs
    if test_indices is None:
        test_indices = []

In [82]:
train_indices

array([ 1967,  9729, 10238, ...,   860,  8381,  3223])

In [83]:
val_indices

array([  898,  9931, 11800, ...,  2083,  1906,  7427])

In [84]:
test_indices

[]

In [87]:
normalizer = None
if config['norm_from']:
    with open(config['norm_from'], 'rb') as f:
        norm_dict = pickle.load(f)
    normalizer = Normalizer(**norm_dict)
elif config['normalization'] is not None:
    normalizer = Normalizer(config['normalization'])
    my_data.feature_df.loc[train_indices] = normalizer.normalize(my_data.feature_df.loc[train_indices])
    if not config['normalization'].startswith('per_sample'):
        # get normalizing values from training set and store for future use
        norm_dict = normalizer.__dict__
        with open(os.path.join(config['output_dir'], 'normalization.pickle'), 'wb') as f:
            pickle.dump(norm_dict, f, pickle.HIGHEST_PROTOCOL)
if normalizer is not None:
    if len(val_indices):
        val_data.feature_df.loc[val_indices] = normalizer.normalize(val_data.feature_df.loc[val_indices])
    if len(test_indices):
        test_data.feature_df.loc[test_indices] = normalizer.normalize(test_data.feature_df.loc[test_indices])

In [88]:
model = model_factory(config, my_data)

In [89]:
model

TSTransformerEncoder(
  (project_inp): Linear(in_features=9, out_features=128, bias=True)
  (pos_enc): LearnablePositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerBatchNormEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=256, out_features=128, bias=True)
        (norm1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (norm2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerBatchNormEncoderLayer(
        (self_attn): Multi

In [91]:
if config['global_reg']:
    weight_decay = config['l2_reg']
    output_reg = None
else:
    weight_decay = 0
    output_reg = config['l2_reg']

In [92]:
optim_class = get_optimizer(config['optimizer'])
optimizer = optim_class(model.parameters(), lr=config['lr'], weight_decay=weight_decay)

In [93]:
start_epoch = 0
lr_step = 0  # current step index of `lr_step`
lr = config['lr']  # current learning step
# Load model and optimizer state
if args.load_model:
    model, optimizer, start_epoch = utils.load_model(model, config['load_model'], optimizer, config['resume'],
                                                        config['change_output'],
                                                        config['lr'],
                                                        config['lr_step'],
                                                        config['lr_factor'])
model.to(device)

loss_module = get_loss_module(config)

In [96]:
# Initialize data generators
dataset_class, collate_fn, runner_class = pipeline_factory(config)
val_dataset = dataset_class(val_data, val_indices)

val_loader = DataLoader(dataset=val_dataset,
                        batch_size=config['batch_size'],
                        shuffle=False,
                        num_workers=config['num_workers'],
                        pin_memory=True,
                        collate_fn=lambda x: collate_fn(x, max_len=model.max_len))

train_dataset = dataset_class(my_data, train_indices)

train_loader = DataLoader(dataset=train_dataset,
                            batch_size=config['batch_size'],
                            shuffle=True,
                            num_workers=config['num_workers'],
                            pin_memory=True,
                            collate_fn=lambda x: collate_fn(x, max_len=model.max_len))

trainer = runner_class(model, train_loader, device, loss_module, optimizer, l2_reg=output_reg,
                                print_interval=config['print_interval'], console=config['console'])
val_evaluator = runner_class(model, val_loader, device, loss_module,
                                    print_interval=config['print_interval'], console=config['console'])

In [104]:
tensorboard_writer = SummaryWriter(config['tensorboard_dir'])
best_value = 1e16 if config['key_metric'] in NEG_METRICS else -1e16  # initialize with +inf or -inf depending on key metric
metrics = []  # (for validation) list of lists: for each epoch, stores metrics like loss, ...
best_metrics = {}

    # Evaluate on validation before training
aggr_metrics_val, best_metrics, best_value = validate(val_evaluator, None, config, best_metrics,
                                                        best_value, epoch=0)
metrics_names, metrics_values = zip(*aggr_metrics_val.items())
metrics.append(list(metrics_values))



2023-06-04 11:52:48,106 | INFO : Evaluating on validation set ...


Evaluating Epoch 0   0.0% | batch:         0 of        75	|	loss: 1.07261e+07
Evaluating Epoch 0   1.3% | batch:         1 of        75	|	loss: 7.65159e+06
Evaluating Epoch 0   2.7% | batch:         2 of        75	|	loss: 6.279e+06
Evaluating Epoch 0   4.0% | batch:         3 of        75	|	loss: 6.53315e+06
Evaluating Epoch 0   5.3% | batch:         4 of        75	|	loss: 6.78696e+06
Evaluating Epoch 0   6.7% | batch:         5 of        75	|	loss: 1.03023e+07
Evaluating Epoch 0   8.0% | batch:         6 of        75	|	loss: 1.04736e+07
Evaluating Epoch 0   9.3% | batch:         7 of        75	|	loss: 6.40292e+06
Evaluating Epoch 0  10.7% | batch:         8 of        75	|	loss: 7.66732e+06
Evaluating Epoch 0  12.0% | batch:         9 of        75	|	loss: 6.49553e+06
Evaluating Epoch 0  13.3% | batch:        10 of        75	|	loss: 7.26929e+06
Evaluating Epoch 0  14.7% | batch:        11 of        75	|	loss: 8.29193e+06
Evaluating Epoch 0  16.0% | batch:        12 of        75	|	loss: 

2023-06-04 11:52:49,373 | INFO : Validation runtime: 0.0 hours, 0.0 minutes, 1.265991449356079 seconds

2023-06-04 11:52:49,374 | INFO : Avg val. time: 0.0 hours, 0.0 minutes, 1.2883978684743245 seconds
2023-06-04 11:52:49,375 | INFO : Avg batch val. time: 0.017178638246324325 seconds
2023-06-04 11:52:49,376 | INFO : Avg sample val. time: 0.0005404353475143978 seconds


Evaluating Epoch 0  88.0% | batch:        66 of        75	|	loss: 6.28423e+06
Evaluating Epoch 0  89.3% | batch:        67 of        75	|	loss: 8.20886e+06
Evaluating Epoch 0  90.7% | batch:        68 of        75	|	loss: 8.94368e+06
Evaluating Epoch 0  92.0% | batch:        69 of        75	|	loss: 6.77833e+06
Evaluating Epoch 0  93.3% | batch:        70 of        75	|	loss: 7.45283e+06
Evaluating Epoch 0  94.7% | batch:        71 of        75	|	loss: 8.92637e+06
Evaluating Epoch 0  96.0% | batch:        72 of        75	|	loss: 5.86746e+06
Evaluating Epoch 0  97.3% | batch:        73 of        75	|	loss: 6.88433e+06
Evaluating Epoch 0  98.7% | batch:        74 of        75	|	loss: 1.09129e+07



AttributeError: 'NoneType' object has no attribute 'add_scalar'

In [102]:
tensorboard_writer.add_scalar('{}/val'.format(k), v, epoch)

NameError: name 'k' is not defined