# PatchTSMixer workflow examples on the ETTh1 dataset

In [1]:
from transformers import PatchTSMixerConfig, Trainer, TrainingArguments
import torch
from torch.utils.data import random_split
from torch.utils.data import Dataset
import os
import numpy as np
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


## Data prep

Load the ETTh1 CSV and wrap it in sliding-window train/validation/test datasets that are used by every workflow in this notebook.

In [2]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
class ETTDataset(Dataset):
    """Sliding-window forecasting dataset over an ETT-style CSV file.

    The CSV is read with pandas. Its first column is skipped and every
    remaining column is treated as one channel. Rows are divided into
    train / val / test ranges using fixed boundaries of 12, 4 and 4 blocks of
    ``30 * 24`` rows. The val and test ranges start ``seq_len`` rows early so
    that their first window takes its context from the end of the preceding
    range.

    Each item is a dict holding two float32 tensors: ``"context_values"``
    (``seq_len`` consecutive rows) and ``"target_values"`` (the ``pred_len``
    rows that immediately follow).

    Args:
        root_path: Directory containing the CSV. The default is an absolute
            path; override it for your environment.
        data_file: CSV file name inside ``root_path``.
        seq_len: Number of rows in each context window.
        pred_len: Number of rows in each target window.
        split: One of ``'train'``, ``'val'`` or ``'test'``.
        scale: When true, standardise all rows with a ``StandardScaler``
            fitted on the training range only.
    """

    def __init__(self, root_path='/dccstor/dnn_forecasting/FM/data/ETDataset/ETT-small/', data_file='ETTh1.csv',
                 seq_len=128, pred_len=32,
                 split='train',
                 scale=True
                 ):
        self.seq_len = seq_len
        self.pred_len = pred_len

        assert split in ['train', 'test', 'val'], f"unknown split: {split!r}"
        type_map = {'train': 0, 'val': 1, 'test': 2}
        self.set_type = type_map[split]

        self.scale = scale

        self.root_path = root_path
        self.data_file = data_file
        self.__read_data__()

    def __read_data__(self):
        """Read the CSV, optionally standardise it, and keep this split's rows."""
        self.scaler = StandardScaler()
        df_raw = pd.read_csv(os.path.join(self.root_path, self.data_file))

        rows_per_block = 30 * 24
        train_end = 12 * rows_per_block
        val_end = train_end + 4 * rows_per_block
        test_end = train_end + 8 * rows_per_block
        # val/test start seq_len rows early; clamp so an oversized seq_len
        # cannot turn the start into a negative (wrap-around) slice index.
        border1s = [0, max(0, train_end - self.seq_len), max(0, val_end - self.seq_len)]
        border2s = [train_end, val_end, test_end]
        border1 = border1s[self.set_type]
        border2 = border2s[self.set_type]

        # Skip the first column (presumably a timestamp; confirm against the CSV).
        values = df_raw[df_raw.columns[1:]].values

        if self.scale:
            # Fit on the training range only, then transform every row.
            self.scaler.fit(values[border1s[0]:border2s[0]])
            data = self.scaler.transform(values)
        else:
            data = values

        self.data_x = data[border1:border2]
        self.data_y = data[border1:border2]

    def __getitem__(self, index):
        """Return the ``index``-th window as a dict of context/target tensors.

        Negative indices count from the end.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
                Raising here also lets plain ``for item in dataset`` loops
                terminate.
        """
        n_windows = len(self)
        if index < 0:
            index = index + n_windows
        if index < 0 or index >= n_windows:
            raise IndexError(f"index out of range for ETTDataset with {n_windows} windows")

        s_begin = index
        s_end = s_begin + self.seq_len
        r_begin = s_end
        r_end = r_begin + self.pred_len

        seq_x = self.data_x[s_begin:s_end]
        seq_y = self.data_y[r_begin:r_end]
        return {
            "context_values": torch.tensor(seq_x, dtype=torch.float32),
            "target_values": torch.tensor(seq_y, dtype=torch.float32),
        }

    def __len__(self):
        """Number of complete (context, target) windows; 0 if the split is too short."""
        return max(0, len(self.data_x) - self.seq_len - self.pred_len + 1)

    def inverse_transform(self, data):
        """Undo the standardisation; returns ``data`` unchanged when ``scale`` is false."""
        if not self.scale:
            return data
        return self.scaler.inverse_transform(data)


In [3]:
# Shared settings for the datasets and model configs in this notebook.
FORECAST_LEN = 96  # rows in each target window (dataset pred_len / config forecast_len)
n_features = 7  # number of channels, passed to the config as in_channels
SEQ_LEN = 512  # rows in each context window
seq_len = SEQ_LEN  # lowercase alias used by later cells
patch_len = 16
stride = patch_len  # patch stride is set equal to the patch length

In [4]:
# Build one dataset per split; all three share the same context/target lengths.
dset_train, dset_val, dset_test = (
    ETTDataset(split=split_name, seq_len=SEQ_LEN, pred_len=FORECAST_LEN)
    for split_name in ("train", "val", "test")
)

In [5]:
# Sanity check: shapes of the first validation window (context, target).
dd = dset_val[0]
tuple(dd[key].shape for key in ("context_values", "target_values"))

(torch.Size([512, 7]), torch.Size([96, 7]))

In [6]:
# Number of patches cut from one context window. Computed here for reference
# only; per the original note it does not need to be passed to the config.
# Taking `stride` into account keeps the value correct if stride != patch_len.
num_patches = (seq_len - patch_len) // stride + 1
num_patches

32

## 1. Directly train a `PatchTSMixer` forecasting model, and evaluate the test performance

In [8]:
from transformers import EarlyStoppingCallback, PatchTSMixerForForecasting

# Seed torch before the model is constructed so that its random weight
# initialisation is repeatable from run to run.
torch.manual_seed(0)

# Model configuration for direct forecasting on the 7-channel dataset.
forecast_config = PatchTSMixerConfig(
    in_channels=n_features,
    seq_len=SEQ_LEN,
    patch_len=patch_len,
    stride=stride,
    num_features=48,
    num_layers=2,
    dropout=0.5,
    mode="common_channel",
    revin=True,
    expansion_factor=3,
    head_dropout=0.7,
    forecast_len=FORECAST_LEN
)

forecast_model = PatchTSMixerForForecasting(forecast_config)

# Evaluate, log and checkpoint once per epoch so the best checkpoint
# (lowest eval_loss) can be restored when training ends.
forecast_args = TrainingArguments(
    output_dir='./dump/etth1/direct_forecast/checkpoint',
    overwrite_output_dir=True,
    learning_rate=0.0001,
    num_train_epochs=100,
    do_eval=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=1024,
    report_to='tensorboard',
    save_strategy='epoch',
    logging_strategy="epoch",
    save_total_limit=3,
    logging_dir='./dump/etth1/direct_forecast/logs',  # Make sure to specify a logging directory

    load_best_model_at_end=True,  # Load the best model when training ends
    metric_for_best_model="eval_loss",  # Metric to monitor for early stopping
    greater_is_better=False,  # For loss
)

# Create the early stopping callback
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=10,  # Number of epochs with no improvement after which to stop
    early_stopping_threshold=0.0001,  # Minimum improvement required to consider as improvement
)

forecast_trainer = Trainer(
    model=forecast_model,
    args=forecast_args,
    train_dataset=dset_train,
    eval_dataset=dset_val,
    callbacks=[early_stopping_callback]
)


In [9]:
# Train for up to 100 epochs (forecast_args); the early-stopping callback attached to the trainer may end the run sooner.
forecast_trainer.train()

Epoch,Training Loss,Validation Loss
1,0.5273,0.747703
2,0.4202,0.713125
3,0.3936,0.697876
4,0.3784,0.69284
5,0.3706,0.684963
6,0.3644,0.685008
7,0.3597,0.681701
8,0.3587,0.682566
9,0.3553,0.679382
10,0.354,0.680346


TrainOutput(global_step=5292, training_loss=0.36701976721339036, metrics={'train_runtime': 94.3389, 'train_samples_per_second': 8515.05, 'train_steps_per_second': 267.122, 'total_flos': 711526928007168.0, 'train_loss': 0.36701976721339036, 'epoch': 21.0})

In [10]:
# Evaluate the trained forecasting model on the held-out test split.
forecast_trainer.evaluate(dset_test)

{'eval_loss': 0.36459630727767944,
 'eval_runtime': 0.1609,
 'eval_samples_per_second': 17311.979,
 'eval_steps_per_second': 18.648,
 'epoch': 21.0}

## 2. Pretrain a `PatchTSMixer` model with HF trainer

In [33]:
from transformers import EarlyStoppingCallback, PatchTSMixerForMaskPretraining

# Seed torch before the model is constructed so that its random weight
# initialisation is repeatable from run to run.
torch.manual_seed(0)

# Same architecture settings as the direct-forecast model.
pretrain_config = PatchTSMixerConfig(
    in_channels=n_features,
    seq_len=SEQ_LEN,
    patch_len=patch_len,
    stride=stride,
    num_features=48,
    num_layers=2,
    dropout=0.5,
    mode="common_channel",
    revin=True,
    expansion_factor=3,
    head_dropout=0.7,
    forecast_len=FORECAST_LEN
)

pretrain_model = PatchTSMixerForMaskPretraining(pretrain_config)

pretrain_args = TrainingArguments(
    output_dir='./dump/etth1/pretrain/checkpoint',
    overwrite_output_dir=True,
    learning_rate=0.0001,
    num_train_epochs=100,
    do_eval=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=1024,
    report_to='tensorboard',
    save_strategy='epoch',
    logging_strategy="epoch",
    save_total_limit=3,
    logging_dir='./dump/etth1/pretrain/logs',  # Make sure to specify a logging directory

    load_best_model_at_end=True,  # Load the best model when training ends
    metric_for_best_model="eval_loss",  # Metric to monitor for early stopping
    greater_is_better=False,  # For loss
)

# Use a callback instance owned by this run rather than sharing the stateful
# one created for another trainer; the settings match the other workflows.
pretrain_early_stopping = EarlyStoppingCallback(
    early_stopping_patience=10,
    early_stopping_threshold=0.0001,
)

pretrain_trainer = Trainer(
    model=pretrain_model,
    args=pretrain_args,
    train_dataset=dset_train,
    eval_dataset=dset_val,
    callbacks=[pretrain_early_stopping]
)

In [34]:
# Run mask pretraining for up to 100 epochs (pretrain_args), evaluating on dset_val after every epoch.
pretrain_trainer.train()

Epoch,Training Loss,Validation Loss
1,1.0645,0.751602
2,0.8242,0.620294
3,0.7292,0.547623
4,0.6708,0.499021
5,0.6349,0.47168
6,0.6104,0.45047
7,0.594,0.43693
8,0.5807,0.427861
9,0.5729,0.417699
10,0.566,0.41022


TrainOutput(global_step=25200, training_loss=0.5301023770892431, metrics={'train_runtime': 432.8078, 'train_samples_per_second': 1856.02, 'train_steps_per_second': 58.224, 'total_flos': 852929082163200.0, 'train_loss': 0.5301023770892431, 'epoch': 100.0})

In [36]:
# Evaluate the pretrained model on the held-out test split.
pretrain_trainer.evaluate(dset_test)

{'eval_loss': 0.3986164629459381,
 'eval_runtime': 0.1289,
 'eval_samples_per_second': 21608.126,
 'eval_steps_per_second': 23.276,
 'epoch': 100.0}

### Save batch and output for pytests (optional)

In [37]:
# Get the evaluation dataloader (the trainer was built with eval_dataset=dset_val)
eval_dataloader = pretrain_trainer.get_eval_dataloader()
eval_dataloader

<accelerate.data_loader.DataLoaderShard at 0x14a5dabc02e0>

In [38]:
# Take only the first evaluation batch. next() fails loudly on an empty loader
# instead of silently leaving a stale X from an earlier cell.
X = next(iter(eval_dataloader))
print(X.keys())
print(X["context_values"].shape)
# Persist the batch so a test can replay exactly the same input.
torch.save(X, "./dump/etth1/pretrain/batch.pt")

dict_keys(['context_values'])
torch.Size([1024, 512, 7])


### Saving and loading the pretrained model

In [39]:
# Save the pretrained model to a directory; later cells reload it from this path with from_pretrained.
pretrain_trainer.save_model("./dump/etth1/pretrain/patchtsmixer_pretrained_etth1")

In [40]:
# Reload the saved checkpoint onto the same device the trainer uses instead of
# hard-coding "cuda", so the cell also runs on machines without a GPU.
loaded_model = PatchTSMixerForMaskPretraining.from_pretrained(
    './dump/etth1/pretrain/patchtsmixer_pretrained_etth1'
).to(pretrain_trainer.args.device)

In [41]:
torch.manual_seed(0)
loaded_model.eval()  # disable dropout so the dumped values are repeatable
with torch.no_grad():
    # Call the module itself rather than .forward() so module hooks run, and
    # move the batch to wherever the model lives. The batch already has shape
    # (batch, seq_len, n_features), so no reshape or hard-coded batch size is needed.
    d_out = loaded_model(X["context_values"].to(loaded_model.device))
print(d_out.prediction_logits[0, :7, :1, :1])

tensor([[[-0.3092]],

        [[ 0.6213]],

        [[-0.2071]],

        [[-0.2825]],

        [[-0.4129]],

        [[ 2.3598]],

        [[ 0.1951]]], device='cuda:0')


Copy the values above into a pytest and use the dumped model and batch to verify them (optional).

## 3. Use the pretrained model (step 2) to finetune for a forecasting task

In [42]:
from transformers import EarlyStoppingCallback, PatchTSMixerForForecasting

# The forecasting head is not part of the pretraining checkpoint and is newly
# initialised, so seed torch first to make that initialisation repeatable.
torch.manual_seed(0)

finetune_forecast_model = PatchTSMixerForForecasting.from_pretrained('./dump/etth1/pretrain/patchtsmixer_pretrained_etth1')

finetune_forecast_args = TrainingArguments(
    output_dir='./dump/etth1/finetune_forecast/checkpoint',
    overwrite_output_dir=True,
    learning_rate=0.0001,
    num_train_epochs=100,
    do_eval=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=1024,
    report_to='tensorboard',
    save_strategy='epoch',
    logging_strategy="epoch",
    save_total_limit=3,
    logging_dir='./dump/etth1/finetune_forecast/logs',  # Make sure to specify a logging directory

    load_best_model_at_end=True,  # Load the best model when training ends
    metric_for_best_model="eval_loss",  # Metric to monitor for early stopping
    greater_is_better=False,  # For loss
)

# Use a callback instance owned by this run rather than sharing the stateful
# one created for another trainer; the settings match the other workflows.
finetune_forecast_early_stopping = EarlyStoppingCallback(
    early_stopping_patience=10,
    early_stopping_threshold=0.0001,
)

finetune_forecast_trainer = Trainer(
    model=finetune_forecast_model,
    args=finetune_forecast_args,
    train_dataset=dset_train,
    eval_dataset=dset_val,
    callbacks=[finetune_forecast_early_stopping]
)

Some weights of PatchTSMixerForForecasting were not initialized from the model checkpoint at ./dump/etth1/pretrain/patchtsmixer_pretrained_etth1 and are newly initialized: ['head.head.base_forecast_block.1.bias', 'head.head.base_forecast_block.1.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
# Fine-tune the pretrained weights for forecasting (up to 100 epochs, evaluated on dset_val every epoch).
finetune_forecast_trainer.train()

Epoch,Training Loss,Validation Loss
1,0.5211,0.798352
2,0.4219,0.750673
3,0.3964,0.727025
4,0.3821,0.716788
5,0.3745,0.705748
6,0.3681,0.704222
7,0.363,0.697547
8,0.3611,0.697067
9,0.3564,0.693689
10,0.3538,0.692488


TrainOutput(global_step=5292, training_loss=0.36609856395764684, metrics={'train_runtime': 90.1336, 'train_samples_per_second': 8912.326, 'train_steps_per_second': 279.585, 'total_flos': 711526928007168.0, 'train_loss': 0.36609856395764684, 'epoch': 21.0})

In [44]:
# Evaluate the fine-tuned forecasting model on the held-out test split.
finetune_forecast_trainer.evaluate(dset_test)

{'eval_loss': 0.36578306555747986,
 'eval_runtime': 0.1696,
 'eval_samples_per_second': 16425.723,
 'eval_steps_per_second': 17.694,
 'epoch': 21.0}

### Save/Load and dump outputs for pytest (optional)

In [46]:
# Save the fine-tuned forecasting model to a directory so it can be reloaded with from_pretrained.
finetune_forecast_trainer.save_model("./dump/etth1/finetune_forecast/patchtsmixer_finetune_forecast_etth1")

In [47]:
# Get the evaluation dataloader (the trainer was built with eval_dataset=dset_val)
eval_dataloader = finetune_forecast_trainer.get_eval_dataloader()
eval_dataloader

<accelerate.data_loader.DataLoaderShard at 0x14a5ddd54640>

In [48]:
# Reload the saved checkpoint onto the same device the trainer uses instead of
# hard-coding "cuda:0", so the cell also runs on machines without a GPU.
loaded_model = PatchTSMixerForForecasting.from_pretrained(
    './dump/etth1/finetune_forecast/patchtsmixer_finetune_forecast_etth1'
).to(finetune_forecast_trainer.args.device)

In [49]:
# Take only the first evaluation batch. next() fails loudly on an empty loader
# instead of silently leaving a stale X from an earlier cell.
X = next(iter(eval_dataloader))
print(X.keys())
print(X["context_values"].shape, X["target_values"].shape)
# Persist the batch so a test can replay exactly the same input.
torch.save(X, "./dump/etth1/finetune_forecast/batch_forecast.pt")

dict_keys(['context_values', 'target_values'])
torch.Size([1024, 512, 7]) torch.Size([1024, 96, 7])


In [50]:
# Forward the saved batch through the reloaded model with gradients disabled
# and print the first forecast step of the first sample, for use in a test.
torch.manual_seed(0)
loaded_model.eval()
context_batch = X["context_values"]
with torch.no_grad():
    output_batch = loaded_model(context_batch)
first_step_forecast = output_batch.prediction_logits[0, :1, :7]
print(first_step_forecast)

tensor([[ 0.4179, -0.0815,  0.4532,  0.7133, -0.3059, -2.3659,  0.2807]],
       device='cuda:0')


Copy the values above into a pytest and use the dumped model and batch to verify them (optional).

## 4. Use pretrained model to finetune for a prediction task

In [67]:
import copy

from transformers import EarlyStoppingCallback, PatchTSMixerForForecasting

# The forecast channel indices can either be provided during pretraining, or
# set on a config that is passed again when loading the checkpoint. Work on a
# copy so pretrain_config itself is not mutated as a side effect of this cell.
prediction_config = copy.deepcopy(pretrain_config)
prediction_config.update({"forecast_channel_indices": [3, 5]})

# The forecasting head is newly initialised when loading from the pretraining
# checkpoint, so seed torch first to make that initialisation repeatable.
torch.manual_seed(0)

finetune_prediction_model = PatchTSMixerForForecasting.from_pretrained('./dump/etth1/pretrain/patchtsmixer_pretrained_etth1', config=prediction_config)


finetune_prediction_args = TrainingArguments(
    output_dir='./dump/etth1/finetune_prediction/checkpoint',
    overwrite_output_dir=True,
    learning_rate=0.0001,
    num_train_epochs=100,
    do_eval=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=1024,
    report_to='tensorboard',
    save_strategy='epoch',
    logging_strategy="epoch",
    save_total_limit=3,
    logging_dir='./dump/etth1/finetune_prediction/logs',  # Make sure to specify a logging directory

    load_best_model_at_end=True,  # Load the best model when training ends
    metric_for_best_model="eval_loss",  # Metric to monitor for early stopping
    greater_is_better=False,  # For loss
)

# Use a callback instance owned by this run rather than sharing the stateful
# one created for another trainer; the settings match the other workflows.
finetune_prediction_early_stopping = EarlyStoppingCallback(
    early_stopping_patience=10,
    early_stopping_threshold=0.0001,
)

finetune_prediction_trainer = Trainer(
    model=finetune_prediction_model,
    args=finetune_prediction_args,
    train_dataset=dset_train,
    eval_dataset=dset_val,
    callbacks=[finetune_prediction_early_stopping]
)

Some weights of the model checkpoint at ./dump/etth1/pretrain/patchtsmixer_pretrained_etth1 were not used when initializing PatchTSMixerForForecasting: ['head.head.base_pt_block.1.weight', 'head.head.base_pt_block.1.bias']
- This IS expected if you are initializing PatchTSMixerForForecasting from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing PatchTSMixerForForecasting from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of PatchTSMixerForForecasting were not initialized from the model checkpoint at ./dump/etth1/pretrain/patchtsmixer_pretrained_etth1 and are newly initialized: ['head.head.base_forecast_block.1.bias', 'head.head.base_forecast_block.1.weight']
You should probably TRAIN this mode

In [69]:
# Fine-tune the pretrained weights for the channel-subset prediction task (up to 100 epochs, evaluated on dset_val every epoch).
finetune_prediction_trainer.train()

Epoch,Training Loss,Validation Loss
1,0.4711,1.063311
2,0.3817,0.968746
3,0.3563,0.951072
4,0.3403,0.910686
5,0.3322,0.88985
6,0.3239,0.905374
7,0.3193,0.894543
8,0.3173,0.90042
9,0.3103,0.876824
10,0.3053,0.876611


TrainOutput(global_step=13104, training_loss=0.2912400167649191, metrics={'train_runtime': 221.7843, 'train_samples_per_second': 3621.988, 'train_steps_per_second': 113.624, 'total_flos': 1761876202684416.0, 'train_loss': 0.2912400167649191, 'epoch': 52.0})

In [70]:
# Evaluate the fine-tuned prediction model on the held-out test split.
finetune_prediction_trainer.evaluate(dset_test)

{'eval_loss': 0.14809785783290863,
 'eval_runtime': 0.171,
 'eval_samples_per_second': 16281.896,
 'eval_steps_per_second': 17.539,
 'epoch': 52.0}

In [71]:
# Evaluate the same model on the validation split, for comparison with the test-split loss.
finetune_prediction_trainer.evaluate(dset_val)

{'eval_loss': 0.8272998332977295,
 'eval_runtime': 0.1701,
 'eval_samples_per_second': 16376.718,
 'eval_steps_per_second': 17.641,
 'epoch': 52.0}

In [72]:
# Save the fine-tuned prediction model to a directory so it can be reloaded with from_pretrained.
finetune_prediction_trainer.save_model("./dump/etth1/finetune_prediction/patchtsmixer_finetune_prediction_etth1")

In [73]:
# Get the evaluation dataloader (the trainer was built with eval_dataset=dset_val)
eval_dataloader = finetune_prediction_trainer.get_eval_dataloader()
eval_dataloader

<accelerate.data_loader.DataLoaderShard at 0x14a5dddca670>

In [74]:
# Reload the saved checkpoint onto the same device the trainer uses instead of
# hard-coding "cuda:0", so the cell also runs on machines without a GPU.
loaded_model = PatchTSMixerForForecasting.from_pretrained(
    './dump/etth1/finetune_prediction/patchtsmixer_finetune_prediction_etth1'
).to(finetune_prediction_trainer.args.device)

In [75]:
# Take only the first evaluation batch. next() fails loudly on an empty loader
# instead of silently leaving a stale X from an earlier cell.
X = next(iter(eval_dataloader))
print(X.keys())
print(X["context_values"].shape, X["target_values"].shape)
# Persist the batch so a test can replay exactly the same input.
torch.save(X, "./dump/etth1/finetune_prediction/batch_prediction.pt")

dict_keys(['context_values', 'target_values'])
torch.Size([1024, 512, 7]) torch.Size([1024, 96, 7])


In [76]:
# Forward the saved batch through the reloaded model with gradients disabled,
# then show the output shape and the first forecast step of the first sample.
torch.manual_seed(0)
loaded_model.eval()
context_batch = X["context_values"]
with torch.no_grad():
    output_batch = loaded_model(context_batch)
prediction_logits = output_batch.prediction_logits
print(prediction_logits.shape)
print(prediction_logits[0, :1, :7])

torch.Size([1024, 96, 2])
tensor([[ 0.6744, -2.3873]], device='cuda:0')


Note that the output has only 2 channels, one for each index listed in `forecast_channel_indices` (`[3, 5]`).

## 5. Register model under Auto Classes and load the model

In [77]:
from transformers import AutoConfig, AutoModel, AutoModelForPreTraining

In [78]:
# NOTE(review): the AutoConfig registration below is left disabled; presumably
# the config class is already known to AutoConfig. Confirm before deleting.
# AutoConfig.register("patchtsmixer", PatchTSMixerConfig)
# Map the config class to the mask-pretraining model for AutoModelForPreTraining.
AutoModelForPreTraining.register(PatchTSMixerConfig, PatchTSMixerForMaskPretraining)

In [79]:
# Load the saved pretraining checkpoint through the Auto class instead of naming the model class directly.
auto_model = AutoModelForPreTraining.from_pretrained('./dump/etth1/pretrain/patchtsmixer_pretrained_etth1')

In [80]:
# Run the auto-loaded model on the first test window, reshaped to a batch of one.
first_test_context = dset_test[0]["context_values"]
auto_model(first_test_context.reshape(1, seq_len, n_features)).prediction_logits

tensor([[[[ 0.1099,  0.1911,  0.2393,  ...,  0.4792,  0.5710,  0.6027],
          [ 0.2099,  0.0878, -0.0625,  ...,  0.2933,  0.3861,  0.5001],
          [ 0.5203,  0.5468,  0.5345,  ..., -1.1861, -1.0121, -0.7806],
          ...,
          [ 0.8465,  0.8854,  0.8678,  ..., -2.5289, -2.1887, -1.7487],
          [-0.3460, -0.1029,  0.1233,  ...,  0.8395,  0.7848,  0.6829],
          [-0.2900, -0.4114, -0.5374,  ...,  0.3404,  0.3283,  0.3139]],

         [[-1.5066, -1.5974, -1.7086,  ..., -0.9671, -0.9586, -0.9059],
          [-0.7273, -0.7845, -0.8463,  ..., -1.9130, -1.5215, -1.1196],
          [-0.6896, -0.7026, -0.7588,  ..., -1.3015, -1.5143, -1.5829],
          ...,
          [ 1.0302,  1.0833,  1.0637,  ...,  1.5262,  1.1418,  0.7456],
          [-0.3708, -0.4422, -0.4905,  ...,  0.9482,  0.7597,  0.6537],
          [ 0.7262,  0.7794,  0.7970,  ...,  0.0542,  0.1150,  0.1876]],

         [[ 0.0588,  0.1476,  0.2144,  ...,  0.5340,  0.5515,  0.5221],
          [ 0.1438,  0.0114, -