# PatchTSMixer workflow examples on ETTH dataset

In [1]:
from transformers import PatchTSMixerConfig, Trainer, TrainingArguments
import torch
from torch.utils.data import random_split
from torch.utils.data import Dataset
import os
import numpy as np
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


## Data prep

Generate and prepare dummy data to test the pretraining.

In [2]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
class ETTDataset(Dataset):
    def __init__(self, root_path='/dccstor/dnn_forecasting/FM/data/ETDataset/ETT-small/', data_file='ETTh2.csv', 
                 seq_len=128, pred_len=32,
                 split='train',                  
                 scale=True
                 ):
        
        self.seq_len = seq_len
        self.pred_len = pred_len
        # init
        assert split in ['train', 'test', 'val']
        type_map = {'train': 0, 'val': 1, 'test': 2}
        self.set_type = type_map[split]
                
        self.scale = scale                        

        self.root_path = root_path
        self.data_file = data_file
        self.__read_data__()

    def __read_data__(self):
        self.scaler = StandardScaler()
        df_raw = pd.read_csv(os.path.join(self.root_path, self.data_file))

        border1s = [0, 12 * 30 * 24 - self.seq_len, 12 * 30 * 24 + 4 * 30 * 24 - self.seq_len]
        border2s = [12 * 30 * 24, 12 * 30 * 24 + 4 * 30 * 24, 12 * 30 * 24 + 8 * 30 * 24]
        border1 = border1s[self.set_type]
        border2 = border2s[self.set_type]
        
        cols_data = df_raw.columns[1:]
        df_data = df_raw[cols_data]

        if self.scale:
            train_data = df_data[border1s[0]:border2s[0]]
            self.scaler.fit(train_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values
        
        self.data_x = data[border1:border2]
        self.data_y = data[border1:border2]        

    def __getitem__(self, index):
        s_begin = index
        s_end = s_begin + self.seq_len
        r_begin = s_end
        r_end = r_begin + self.pred_len 

        seq_x = self.data_x[s_begin:s_end]
        seq_y = self.data_y[r_begin:r_end]        
        return {"context_values": torch.Tensor(seq_x), "target_values": torch.Tensor(seq_y)}

    def __len__(self):
        return len(self.data_x) - self.seq_len - self.pred_len + 1

    def inverse_transform(self, data):
        return self.scaler.inverse_transform(data)


In [3]:
FORECAST_LEN = 96
n_features = 7
SEQ_LEN = 512
seq_len = SEQ_LEN
patch_len = 16
stride = patch_len

In [4]:
dset_train = ETTDataset(split="train", seq_len=SEQ_LEN, pred_len=FORECAST_LEN)
dset_val = ETTDataset(split="val", seq_len=SEQ_LEN, pred_len=FORECAST_LEN)
dset_test = ETTDataset(split="test", seq_len=SEQ_LEN, pred_len=FORECAST_LEN)

In [5]:
dd=dset_val.__getitem__(0)
dd["context_values"].shape, dd["target_values"].shape

(torch.Size([512, 7]), torch.Size([96, 7]))

In [6]:
# num_patches should be (no need to specify)
num_patches = seq_len//patch_len
num_patches

32

## 1. Directly train a `PatchTSMixer` forecasting model, and evaluate the test performance

In [7]:
from transformers import PatchTSMixerForForecasting

forecast_config = PatchTSMixerConfig(
    in_channels=n_features,
    seq_len=SEQ_LEN,
    patch_len=patch_len,
    stride=stride,
    num_features=48,
    num_layers=2,
    dropout=0.5,
    mode="common_channel",
    revin=True,
    expansion_factor=3,
    head_dropout=0.7,
    forecast_len=FORECAST_LEN
)

forecast_model = PatchTSMixerForForecasting(forecast_config)

forecast_args = TrainingArguments(
        output_dir='./dump/etth2/direct_forecast/checkpoint',
        overwrite_output_dir=True,
        learning_rate=0.0001,
        num_train_epochs=100,
        do_eval=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=32,
        per_device_eval_batch_size=1024,
        report_to='tensorboard',
        save_strategy='epoch',
        logging_strategy="epoch",
        save_total_limit=3,
        logging_dir='./dump/etth2/direct_forecast/logs',  # Make sure to specify a logging directory
        
        load_best_model_at_end=True,  # Load the best model when training ends
        metric_for_best_model="eval_loss",  # Metric to monitor for early stopping
        greater_is_better=False,  # For loss
    )

from transformers import EarlyStoppingCallback

# Create the early stopping callback
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=10,  # Number of epochs with no improvement after which to stop
    early_stopping_threshold=0.0001,  # Minimum improvement required to consider as improvement
)

forecast_trainer = Trainer(
    model=forecast_model,
    args=forecast_args,
    train_dataset=dset_train,
    eval_dataset=dset_val,
    callbacks=[early_stopping_callback]
)


In [8]:
forecast_trainer.train()

Epoch,Training Loss,Validation Loss
1,0.5816,0.231726
2,0.4844,0.220617
3,0.4541,0.21554
4,0.4369,0.211976
5,0.4281,0.210721
6,0.4208,0.207517
7,0.4155,0.208131
8,0.4122,0.207082
9,0.4086,0.208413
10,0.4046,0.208888


TrainOutput(global_step=4536, training_loss=0.4216494383635344, metrics={'train_runtime': 47.2475, 'train_samples_per_second': 17001.967, 'train_steps_per_second': 533.362, 'total_flos': 609880224006144.0, 'train_loss': 0.4216494383635344, 'epoch': 18.0})

In [9]:
forecast_trainer.evaluate(dset_test)

{'eval_loss': 0.2684691846370697,
 'eval_runtime': 0.0813,
 'eval_samples_per_second': 34255.332,
 'eval_steps_per_second': 36.9,
 'epoch': 18.0}

## 2. Pretrain a `PatchTSMixer` model with HF trainer

In [10]:
from transformers import PatchTSMixerForMaskPretraining

pretrain_config = PatchTSMixerConfig(
    in_channels=n_features,
    seq_len=SEQ_LEN,
    patch_len=patch_len,
    stride=stride,
    num_features=48,
    num_layers=2,
    dropout=0.5,
    mode="common_channel",
    revin=True,
    expansion_factor=3,
    head_dropout=0.7,
    forecast_len=FORECAST_LEN
)

pretrain_model = PatchTSMixerForMaskPretraining(pretrain_config)

pretrain_args = TrainingArguments(
        output_dir='./dump/etth2/pretrain/checkpoint',
        overwrite_output_dir=True,
        learning_rate=0.0001,
        num_train_epochs=100,
        do_eval=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=32,
        per_device_eval_batch_size=1024,
        report_to='tensorboard',
        save_strategy='epoch',
        logging_strategy="epoch",
        save_total_limit=3,
        logging_dir='./dump/etth2/pretrain/logs',  # Make sure to specify a logging directory
        
        load_best_model_at_end=True,  # Load the best model when training ends
        metric_for_best_model="eval_loss",  # Metric to monitor for early stopping
        greater_is_better=False,  # For loss
    )

pretrain_trainer = Trainer(
    model=pretrain_model,
    args=pretrain_args,
    train_dataset=dset_train,
    eval_dataset=dset_val,
    callbacks=[early_stopping_callback]
)

In [11]:
pretrain_trainer.train()

Epoch,Training Loss,Validation Loss
1,1.0587,0.763012
2,0.8242,0.643187
3,0.7315,0.578778
4,0.6745,0.539403
5,0.6409,0.512444
6,0.6155,0.494741
7,0.5967,0.479723
8,0.5853,0.465993
9,0.5754,0.455753
10,0.5675,0.447083


TrainOutput(global_step=19656, training_loss=0.5365301802918151, metrics={'train_runtime': 206.0018, 'train_samples_per_second': 3899.48, 'train_steps_per_second': 122.329, 'total_flos': 665284684087296.0, 'train_loss': 0.5365301802918151, 'epoch': 78.0})

In [12]:
pretrain_trainer.evaluate(dset_test)

{'eval_loss': 0.41572117805480957,
 'eval_runtime': 0.0766,
 'eval_samples_per_second': 36356.648,
 'eval_steps_per_second': 39.163,
 'epoch': 78.0}

### Save batch and output for pytests (optional)

In [13]:
# get train dataloader
eval_dataloader = pretrain_trainer.get_eval_dataloader()
eval_dataloader

<accelerate.data_loader.DataLoaderShard at 0x152ada876f10>

In [14]:
for X in eval_dataloader:
    print(X.keys())
    print(X["context_values"].shape)
    torch.save(X, "./dump/etth2/pretrain/batch.pt")
    break

dict_keys(['context_values'])
torch.Size([1024, 512, 7])


### Saving and loading the pretrained model

In [15]:
pretrain_trainer.save_model("./dump/etth2/pretrain/patchtsmixer_pretrained_etth2")

In [16]:
loaded_model = PatchTSMixerForMaskPretraining.from_pretrained('./dump/etth2/pretrain/patchtsmixer_pretrained_etth2').to("cuda")

In [17]:
torch.manual_seed(0)
loaded_model.eval()
with torch.no_grad():
    d_out = loaded_model.forward(X['context_values'].reshape(1024,seq_len,n_features).to("cuda"))
print(d_out.prediction_logits[0, :7, :1, :1])

tensor([[[ 0.0342]],

        [[-0.0571]],

        [[ 0.7122]],

        [[ 0.3961]],

        [[-0.8683]],

        [[-0.3770]],

        [[ 1.0956]]], device='cuda:0')


Copy the above values in a pytest and use the dumped model to verify (optional)

## 3. Use the pretrained model (step 2) to finetune for a forecasting task

In [18]:
finetune_forecast_model = PatchTSMixerForForecasting.from_pretrained('./dump/etth2/pretrain/patchtsmixer_pretrained_etth2')

finetune_forecast_args = TrainingArguments(
        output_dir='./dump/etth2/finetune_forecast/checkpoint',
        overwrite_output_dir=True,
        learning_rate=0.0001,
        num_train_epochs=100,
        do_eval=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=32,
        per_device_eval_batch_size=1024,
        report_to='tensorboard',
        save_strategy='epoch',
        logging_strategy="epoch",
        save_total_limit=3,
        logging_dir='./dump/etth2/finetune_forecast/logs',  # Make sure to specify a logging directory
        
        load_best_model_at_end=True,  # Load the best model when training ends
        metric_for_best_model="eval_loss",  # Metric to monitor for early stopping
        greater_is_better=False,  # For loss
    )

finetune_forecast_trainer = Trainer(
    model=finetune_forecast_model,
    args=finetune_forecast_args,
    train_dataset=dset_train,
    eval_dataset=dset_val,
    callbacks=[early_stopping_callback]
)

Some weights of PatchTSMixerForForecasting were not initialized from the model checkpoint at ./dump/etth2/pretrain/patchtsmixer_pretrained_etth2 and are newly initialized: ['head.head.base_forecast_block.1.bias', 'head.head.base_forecast_block.1.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
finetune_forecast_trainer.train()

Epoch,Training Loss,Validation Loss
1,0.5633,0.24643
2,0.4799,0.233743
3,0.4492,0.228402
4,0.4317,0.225942
5,0.4218,0.222442
6,0.4134,0.218763
7,0.4072,0.219262
8,0.4025,0.219313
9,0.3984,0.21978
10,0.395,0.222942


TrainOutput(global_step=6048, training_loss=0.4037793338614166, metrics={'train_runtime': 69.529, 'train_samples_per_second': 11553.456, 'train_steps_per_second': 362.439, 'total_flos': 813173632008192.0, 'train_loss': 0.4037793338614166, 'epoch': 24.0})

In [20]:
finetune_forecast_trainer.evaluate(dset_test)

{'eval_loss': 0.27261653542518616,
 'eval_runtime': 0.1176,
 'eval_samples_per_second': 23682.364,
 'eval_steps_per_second': 25.511,
 'epoch': 24.0}

### Save/Load and dump outputs for pytest (optional)

In [21]:
finetune_forecast_trainer.save_model("./dump/etth2/finetune_forecast/patchtsmixer_finetune_forecast_etth2")

In [22]:
# get train dataloader
eval_dataloader = finetune_forecast_trainer.get_eval_dataloader()
eval_dataloader

<accelerate.data_loader.DataLoaderShard at 0x152ad9e0cfa0>

In [23]:
loaded_model = PatchTSMixerForForecasting.from_pretrained('./dump/etth2/finetune_forecast/patchtsmixer_finetune_forecast_etth2').to("cuda:0")

In [24]:
for X in eval_dataloader:
    print(X.keys())
    print(X["context_values"].shape, X["target_values"].shape)
    torch.save(X, "./dump/etth2/finetune_forecast/batch_forecast.pt")
    break

dict_keys(['context_values', 'target_values'])
torch.Size([1024, 512, 7]) torch.Size([1024, 96, 7])


In [25]:
torch.manual_seed(0)
loaded_model.eval()
with torch.no_grad():
    output_batch = loaded_model(X["context_values"])
print(output_batch.prediction_logits[0, :1, :7])

tensor([[-0.9324, -0.8006, -0.3696,  0.1158, -1.9465, -0.1188,  0.8556]],
       device='cuda:0')


Copy the above values in a pytest and use the dumped model to verify (optional)

## 4. Use pretrained model to finetune for a prediction task

In [26]:
# We can either provide the forecast channel indices during pretraining
# Or, we can update the config and pass it again
pretrain_config.update({"forecast_channel_indices": [3,5]})
finetune_prediction_model = PatchTSMixerForForecasting.from_pretrained('./dump/etth2/pretrain/patchtsmixer_pretrained_etth2', config=pretrain_config)


finetune_prediction_args = TrainingArguments(
        output_dir='./dump/etth2/finetune_prediction/checkpoint',
        overwrite_output_dir=True,
        learning_rate=0.0001,
        num_train_epochs=100,
        do_eval=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=32,
        per_device_eval_batch_size=1024,
        report_to='tensorboard',
        save_strategy='epoch',
        logging_strategy="epoch",
        save_total_limit=3,
        logging_dir='./dump/etth2/finetune_prediction/logs',  # Make sure to specify a logging directory
        
        load_best_model_at_end=True,  # Load the best model when training ends
        metric_for_best_model="eval_loss",  # Metric to monitor for early stopping
        greater_is_better=False,  # For loss
    )

finetune_prediction_trainer = Trainer(
    model=finetune_prediction_model,
    args=finetune_prediction_args,
    train_dataset=dset_train,
    eval_dataset=dset_val,
    callbacks=[early_stopping_callback]
)

Some weights of PatchTSMixerForForecasting were not initialized from the model checkpoint at ./dump/etth2/pretrain/patchtsmixer_pretrained_etth2 and are newly initialized: ['head.head.base_forecast_block.1.bias', 'head.head.base_forecast_block.1.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
finetune_prediction_trainer.train()

Epoch,Training Loss,Validation Loss
1,0.7007,0.265937
2,0.5834,0.252445
3,0.5422,0.247008
4,0.5129,0.246625
5,0.4934,0.241394
6,0.4817,0.238852
7,0.4756,0.237129
8,0.4678,0.237282
9,0.4602,0.238435
10,0.4528,0.238463


TrainOutput(global_step=15876, training_loss=0.40379803917755797, metrics={'train_runtime': 172.0215, 'train_samples_per_second': 4669.766, 'train_steps_per_second': 146.493, 'total_flos': 2134580784021504.0, 'train_loss': 0.40379803917755797, 'epoch': 63.0})

In [28]:
finetune_prediction_trainer.evaluate(dset_test)

{'eval_loss': 0.3648090660572052,
 'eval_runtime': 0.0856,
 'eval_samples_per_second': 32525.663,
 'eval_steps_per_second': 35.037,
 'epoch': 63.0}

In [29]:
finetune_prediction_trainer.evaluate(dset_val)

{'eval_loss': 0.21932737529277802,
 'eval_runtime': 0.0857,
 'eval_samples_per_second': 32503.309,
 'eval_steps_per_second': 35.013,
 'epoch': 63.0}

In [30]:
finetune_prediction_trainer.save_model("./dump/etth2/finetune_prediction/patchtsmixer_finetune_prediction_etth2")

In [31]:
# get train dataloader
eval_dataloader = finetune_prediction_trainer.get_eval_dataloader()
eval_dataloader

<accelerate.data_loader.DataLoaderShard at 0x152ad99804c0>

In [32]:
loaded_model = PatchTSMixerForForecasting.from_pretrained('./dump/etth2/finetune_prediction/patchtsmixer_finetune_prediction_etth2').to("cuda:0")

In [33]:
for X in eval_dataloader:
    print(X.keys())
    print(X["context_values"].shape, X["target_values"].shape)
    torch.save(X, "./dump/etth2/finetune_prediction/batch_prediction.pt")
    break

dict_keys(['context_values', 'target_values'])
torch.Size([1024, 512, 7]) torch.Size([1024, 96, 7])


In [34]:
torch.manual_seed(0)
loaded_model.eval()
with torch.no_grad():
    output_batch = loaded_model(X["context_values"])
print(output_batch.prediction_logits.shape)
print(output_batch.prediction_logits[0, :1, :7])

torch.Size([1024, 96, 2])
tensor([[-0.0119, -0.1193]], device='cuda:0')


Note that, the output has only 2 channels

## 5. Register model under Auto Classes and load the model

In [35]:
from transformers import AutoConfig, AutoModel, AutoModelForPreTraining

In [36]:
# AutoConfig.register("patchtsmixer", PatchTSMixerConfig)
AutoModelForPreTraining.register(PatchTSMixerConfig, PatchTSMixerForMaskPretraining)

In [37]:
auto_model = AutoModelForPreTraining.from_pretrained('./dump/etth2/pretrain/patchtsmixer_pretrained_etth2')

In [38]:
# auto_model(test_data.to('mps')).prediction_logits
auto_model(dset_test.__getitem__(0)['context_values'].reshape(1,seq_len,n_features)).prediction_logits

tensor([[[[ 0.5046,  0.5407,  0.6637,  ..., -0.5177, -0.3761, -0.3467],
          [-0.6514, -0.4129, -0.4830,  ...,  0.2448,  0.0976, -0.0072],
          [-0.3787, -0.4964, -0.5847,  ..., -0.3529, -0.4473, -0.3129],
          ...,
          [-0.0609, -0.1377, -0.2635,  ...,  0.2893,  0.2889,  0.2220],
          [ 0.4848,  0.7081,  0.8978,  ..., -0.3479, -0.1105, -0.3441],
          [-0.1160, -0.0881, -0.0425,  ...,  0.3956,  0.3633,  0.2666]],

         [[ 0.0827,  0.0317,  0.0927,  ...,  0.0183,  0.0377, -0.0851],
          [ 0.0046, -0.0397, -0.0711,  ...,  0.2167,  0.2533,  0.2675],
          [ 0.1585,  0.2033,  0.2134,  ..., -0.0770, -0.1297, -0.1355],
          ...,
          [ 0.1645,  0.2196,  0.2308,  ..., -0.0310, -0.0756, -0.0950],
          [ 0.0134,  0.0433,  0.0939,  ...,  0.3673,  0.8900,  0.4729],
          [ 0.0557,  0.0191, -0.0209,  ...,  0.3192,  0.3686,  0.3812]],

         [[ 0.3790,  0.3740,  0.4299,  ..., -0.7452, -0.5525, -0.4198],
          [-0.8011, -0.5474, -