# PatchTSMixer workflow examples on ETTM dataset

In [1]:
from transformers import PatchTSMixerConfig, Trainer, TrainingArguments
import torch
from torch.utils.data import random_split
from torch.utils.data import Dataset
import os
import numpy as np
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


## Data prep

Generate and prepare dummy data to test the pretraining.

In [2]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

class Dataset_ETT_minute(Dataset):
    def __init__(self, root_path='/dccstor/dnn_forecasting/FM/data/ETDataset/ETT-small/', split='train', size=None,
                 features='M', data_path='ETTm2.csv',
                 target='OT', scale=True, timeenc=0, freq='t',
                 use_time_features=False
                 ):
        # size [seq_len, label_len, pred_len]
        # info
        if size == None:
            self.seq_len = 24 * 4 * 4
            self.label_len = 24 * 4
            self.pred_len = 24 * 4
        else:
            self.seq_len = size[0]
            self.label_len = size[1]
            self.pred_len = size[2]
        # init
        assert split in ['train', 'test', 'val']
        type_map = {'train': 0, 'val': 1, 'test': 2}
        self.set_type = type_map[split]

        self.features = features
        self.target = target
        self.scale = scale
        self.timeenc = timeenc
        self.freq = freq
        self.use_time_features = use_time_features

        self.root_path = root_path
        self.data_path = data_path
        self.__read_data__()

    def __read_data__(self):
        self.scaler = StandardScaler()
        df_raw = pd.read_csv(os.path.join(self.root_path,
                                          self.data_path))

        border1s = [0, 12 * 30 * 24 * 4 - self.seq_len, 12 * 30 * 24 * 4 + 4 * 30 * 24 * 4 - self.seq_len]
        border2s = [12 * 30 * 24 * 4, 12 * 30 * 24 * 4 + 4 * 30 * 24 * 4, 12 * 30 * 24 * 4 + 8 * 30 * 24 * 4]
        border1 = border1s[self.set_type]
        border2 = border2s[self.set_type]

        if self.features == 'M' or self.features == 'MS':
            cols_data = df_raw.columns[1:]
            df_data = df_raw[cols_data]
        elif self.features == 'S':
            df_data = df_raw[[self.target]]

        if self.scale:
            train_data = df_data[border1s[0]:border2s[0]]
            self.scaler.fit(train_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values

        df_stamp = df_raw[['date']][border1:border2]
        df_stamp['date'] = pd.to_datetime(df_stamp.date)
        if self.timeenc == 0:
            df_stamp['month'] = df_stamp.date.apply(lambda row: row.month, 1)
            df_stamp['day'] = df_stamp.date.apply(lambda row: row.day, 1)
            df_stamp['weekday'] = df_stamp.date.apply(lambda row: row.weekday(), 1)
            df_stamp['hour'] = df_stamp.date.apply(lambda row: row.hour, 1)
            df_stamp['minute'] = df_stamp.date.apply(lambda row: row.minute, 1)
            df_stamp['minute'] = df_stamp.minute.map(lambda x: x // 15)
            data_stamp = df_stamp.drop(['date'], 1).values
        elif self.timeenc == 1:
            data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq)
            data_stamp = data_stamp.transpose(1, 0)

        self.data_x = data[border1:border2]
        self.data_y = data[border1:border2]
        self.data_stamp = data_stamp

    def __getitem__(self, index):
        s_begin = index
        s_end = s_begin + self.seq_len
        r_begin = s_end - self.label_len
        r_end = r_begin + self.label_len + self.pred_len

        seq_x = self.data_x[s_begin:s_end]
        seq_y = self.data_y[r_begin:r_end]
        seq_x_mark = self.data_stamp[s_begin:s_end]
        seq_y_mark = self.data_stamp[r_begin:r_end]

#         if self.use_time_features: return _torch(seq_x, seq_y, seq_x_mark, seq_y_mark)
#         else: return _torch(seq_x, seq_y)
        return {"context_values": torch.Tensor(seq_x), "target_values": torch.Tensor(seq_y)}

    def __len__(self):
        return len(self.data_x) - self.seq_len - self.pred_len + 1

    def inverse_transform(self, data):
        return self.scaler.inverse_transform(data)

In [3]:
FORECAST_LEN = 96
n_features = 7
SEQ_LEN = 512
seq_len = SEQ_LEN
patch_len = 16
stride = patch_len
SIZE = [SEQ_LEN, 0, FORECAST_LEN]

In [4]:
dset_train = Dataset_ETT_minute(split="train", size=SIZE)
dset_val = Dataset_ETT_minute(split="val", size=SIZE)
dset_test = Dataset_ETT_minute(split="test", size=SIZE)

  data_stamp = df_stamp.drop(['date'], 1).values
  data_stamp = df_stamp.drop(['date'], 1).values
  data_stamp = df_stamp.drop(['date'], 1).values


In [5]:
dd=dset_val.__getitem__(0)
dd["context_values"].shape, dd["target_values"].shape

(torch.Size([512, 7]), torch.Size([96, 7]))

In [6]:
# num_patches should be (no need to specify)
num_patches = seq_len//patch_len
num_patches

32

## 1. Directly train a `PatchTSMixer` forecasting model, and evaluate the test performance

In [7]:
from transformers import PatchTSMixerForForecasting

forecast_config = PatchTSMixerConfig(
    in_channels=n_features,
    seq_len=SEQ_LEN,
    patch_len=patch_len,
    stride=stride,
    num_features=48,
    num_layers=2,
    dropout=0.5,
    mode="common_channel",
    revin=True,
    expansion_factor=3,
    head_dropout=0.7,
    forecast_len=FORECAST_LEN
)

forecast_model = PatchTSMixerForForecasting(forecast_config)

forecast_args = TrainingArguments(
        output_dir='./dump/ettm2/direct_forecast/checkpoint',
        overwrite_output_dir=True,
        learning_rate=0.0001,
        num_train_epochs=100,
        do_eval=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=32,
        per_device_eval_batch_size=1024,
        report_to='tensorboard',
        save_strategy='epoch',
        logging_strategy="epoch",
        save_total_limit=3,
        logging_dir='./dump/ettm2/direct_forecast/logs',  # Make sure to specify a logging directory
        
        load_best_model_at_end=True,  # Load the best model when training ends
        metric_for_best_model="eval_loss",  # Metric to monitor for early stopping
        greater_is_better=False,  # For loss
    )

from transformers import EarlyStoppingCallback

# Create the early stopping callback
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=10,  # Number of epochs with no improvement after which to stop
    early_stopping_threshold=0.0001,  # Minimum improvement required to consider as improvement
)

forecast_trainer = Trainer(
    model=forecast_model,
    args=forecast_args,
    train_dataset=dset_train,
    eval_dataset=dset_val,
    callbacks=[early_stopping_callback]
)


In [8]:
forecast_trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2798,0.117871
2,0.2309,0.11662
3,0.2223,0.116322
4,0.2163,0.115694
5,0.2126,0.115662
6,0.2096,0.115555
7,0.2071,0.115166
8,0.2052,0.116261
9,0.2029,0.115753
10,0.2018,0.115477


TrainOutput(global_step=18054, training_loss=0.2100857529408303, metrics={'train_runtime': 193.1515, 'train_samples_per_second': 17578.43, 'train_steps_per_second': 549.827, 'total_flos': 2434564886347776.0, 'train_loss': 0.2100857529408303, 'epoch': 17.0})

In [9]:
forecast_trainer.evaluate(dset_test)

{'eval_loss': 0.1638096570968628,
 'eval_runtime': 0.5798,
 'eval_samples_per_second': 19706.517,
 'eval_steps_per_second': 20.698,
 'epoch': 17.0}

## 2. Pretrain a `PatchTSMixer` model with HF trainer

In [10]:
from transformers import PatchTSMixerForPretraining

pretrain_config = PatchTSMixerConfig(
    in_channels=n_features,
    seq_len=SEQ_LEN,
    patch_len=patch_len,
    stride=stride,
    num_features=48,
    num_layers=2,
    dropout=0.5,
    mode="common_channel",
    revin=True,
    expansion_factor=3,
    head_dropout=0.7,
    forecast_len=FORECAST_LEN
)

pretrain_model = PatchTSMixerForPretraining(pretrain_config)

pretrain_args = TrainingArguments(
        output_dir='./dump/ettm2/pretrain/checkpoint',
        overwrite_output_dir=True,
        learning_rate=0.0001,
        num_train_epochs=100,
        do_eval=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=32,
        per_device_eval_batch_size=1024,
        report_to='tensorboard',
        save_strategy='epoch',
        logging_strategy="epoch",
        save_total_limit=3,
        logging_dir='./dump/ettm2/pretrain/logs',  # Make sure to specify a logging directory
        
        load_best_model_at_end=True,  # Load the best model when training ends
        metric_for_best_model="eval_loss",  # Metric to monitor for early stopping
        greater_is_better=False,  # For loss
    )

pretrain_trainer = Trainer(
    model=pretrain_model,
    args=pretrain_args,
    train_dataset=dset_train,
    eval_dataset=dset_val,
    callbacks=[early_stopping_callback]
)

In [11]:
pretrain_trainer.train()

Epoch,Training Loss,Validation Loss
1,0.747,0.477664
2,0.5221,0.410036
3,0.4788,0.393636
4,0.4628,0.387728
5,0.4552,0.384844
6,0.4508,0.381542
7,0.4485,0.378573
8,0.4464,0.379075
9,0.4446,0.377123
10,0.4434,0.376751


TrainOutput(global_step=77526, training_loss=0.4298172266715864, metrics={'train_runtime': 842.6717, 'train_samples_per_second': 4029.208, 'train_steps_per_second': 126.028, 'total_flos': 2631698720587776.0, 'train_loss': 0.4298172266715864, 'epoch': 73.0})

In [12]:
pretrain_trainer.evaluate(dset_test)

{'eval_loss': 0.3604498505592346,
 'eval_runtime': 0.4458,
 'eval_samples_per_second': 25626.354,
 'eval_steps_per_second': 26.916,
 'epoch': 73.0}

### Save batch and output for pytests (optional)

In [13]:
# get train dataloader
eval_dataloader = pretrain_trainer.get_eval_dataloader()
eval_dataloader

<accelerate.data_loader.DataLoaderShard at 0x14dcae3d33a0>

In [14]:
for X in eval_dataloader:
    print(X.keys())
    print(X["context_values"].shape)
    torch.save(X, "./dump/ettm2/pretrain/batch.pt")
    break

dict_keys(['context_values'])
torch.Size([1024, 512, 7])


### Saving and loading the pretrained model

In [15]:
pretrain_trainer.save_model("./dump/ettm2/pretrain/patchtsmixer_pretrained_ettm2")

In [16]:
loaded_model = PatchTSMixerForPretraining.from_pretrained('./dump/ettm2/pretrain/patchtsmixer_pretrained_ettm2').to("cuda")

In [17]:
torch.manual_seed(0)
loaded_model.eval()
with torch.no_grad():
    d_out = loaded_model.forward(X['context_values'].reshape(1024,seq_len,n_features).to("cuda"))
print(d_out.prediction_logits[0, :7, :1, :1])

tensor([[[1.1514]],

        [[1.0473]],

        [[0.9348]],

        [[0.8902]],

        [[0.6631]],

        [[0.8166]],

        [[1.6246]]], device='cuda:0')


Copy the above values in a pytest and use the dumped model to verify (optional)

## 3. Use the pretrained model (step 2) to finetune for a forecasting task

In [18]:
finetune_forecast_model = PatchTSMixerForForecasting.from_pretrained('./dump/ettm2/pretrain/patchtsmixer_pretrained_ettm2')

finetune_forecast_args = TrainingArguments(
        output_dir='./dump/ettm2/finetune_forecast/checkpoint',
        overwrite_output_dir=True,
        learning_rate=0.0001,
        num_train_epochs=100,
        do_eval=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=32,
        per_device_eval_batch_size=1024,
        report_to='tensorboard',
        save_strategy='epoch',
        logging_strategy="epoch",
        save_total_limit=3,
        logging_dir='./dump/ettm2/finetune_forecast/logs',  # Make sure to specify a logging directory
        
        load_best_model_at_end=True,  # Load the best model when training ends
        metric_for_best_model="eval_loss",  # Metric to monitor for early stopping
        greater_is_better=False,  # For loss
    )

finetune_forecast_trainer = Trainer(
    model=finetune_forecast_model,
    args=finetune_forecast_args,
    train_dataset=dset_train,
    eval_dataset=dset_val,
    callbacks=[early_stopping_callback]
)

Some weights of PatchTSMixerForForecasting were not initialized from the model checkpoint at ./dump/ettm2/pretrain/patchtsmixer_pretrained_ettm2 and are newly initialized: ['head.head.base_forecast_block.1.bias', 'head.head.base_forecast_block.1.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
finetune_forecast_trainer.train()

Epoch,Training Loss,Validation Loss
1,0.284,0.119586
2,0.2332,0.115957
3,0.2217,0.115903
4,0.2154,0.114983
5,0.2106,0.114495
6,0.2064,0.114195
7,0.2035,0.11395
8,0.1996,0.114402
9,0.1957,0.115237
10,0.1932,0.113769


TrainOutput(global_step=21240, training_loss=0.20082706228712408, metrics={'train_runtime': 229.4165, 'train_samples_per_second': 14799.717, 'train_steps_per_second': 462.913, 'total_flos': 2864193983938560.0, 'train_loss': 0.20082706228712408, 'epoch': 20.0})

In [20]:
finetune_forecast_trainer.evaluate(dset_test)

{'eval_loss': 0.16320058703422546,
 'eval_runtime': 0.4929,
 'eval_samples_per_second': 23178.925,
 'eval_steps_per_second': 24.345,
 'epoch': 20.0}

### Save/Load and dump outputs for pytest (optional)

In [21]:
finetune_forecast_trainer.save_model("./dump/ettm2/finetune_forecast/patchtsmixer_finetune_forecast_ettm2")

In [22]:
# get train dataloader
eval_dataloader = finetune_forecast_trainer.get_eval_dataloader()
eval_dataloader

<accelerate.data_loader.DataLoaderShard at 0x14dcae39dd90>

In [23]:
loaded_model = PatchTSMixerForForecasting.from_pretrained('./dump/ettm2/finetune_forecast/patchtsmixer_finetune_forecast_ettm2').to("cuda:0")

In [24]:
for X in eval_dataloader:
    print(X.keys())
    print(X["context_values"].shape, X["target_values"].shape)
    torch.save(X, "./dump/ettm2/finetune_forecast/batch_forecast.pt")
    break

dict_keys(['context_values', 'target_values'])
torch.Size([1024, 512, 7]) torch.Size([1024, 96, 7])


In [25]:
torch.manual_seed(0)
loaded_model.eval()
with torch.no_grad():
    output_batch = loaded_model(X["context_values"])
print(output_batch.prediction_logits[0, :1, :7])

tensor([[-0.9803, -0.8590, -0.4304,  0.0600, -2.0267, -0.1246,  0.8570]],
       device='cuda:0')


Copy the above values in a pytest and use the dumped model to verify (optional)

## 4. Use pretrained model to finetune for a prediction task

In [26]:
# We can either provide the forecast channel indices during pretraining
# Or, we can update the config and pass it again
pretrain_config.update({"forecast_channel_indices": [3,5]})
finetune_prediction_model = PatchTSMixerForForecasting.from_pretrained('./dump/ettm2/pretrain/patchtsmixer_pretrained_ettm2', config=pretrain_config)


finetune_prediction_args = TrainingArguments(
        output_dir='./dump/ettm2/finetune_prediction/checkpoint',
        overwrite_output_dir=True,
        learning_rate=0.0001,
        num_train_epochs=100,
        do_eval=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=32,
        per_device_eval_batch_size=1024,
        report_to='tensorboard',
        save_strategy='epoch',
        logging_strategy="epoch",
        save_total_limit=3,
        logging_dir='./dump/ettm2/finetune_prediction/logs',  # Make sure to specify a logging directory
        
        load_best_model_at_end=True,  # Load the best model when training ends
        metric_for_best_model="eval_loss",  # Metric to monitor for early stopping
        greater_is_better=False,  # For loss
    )

finetune_prediction_trainer = Trainer(
    model=finetune_prediction_model,
    args=finetune_prediction_args,
    train_dataset=dset_train,
    eval_dataset=dset_val,
    callbacks=[early_stopping_callback]
)

Some weights of PatchTSMixerForForecasting were not initialized from the model checkpoint at ./dump/ettm2/pretrain/patchtsmixer_pretrained_ettm2 and are newly initialized: ['head.head.base_forecast_block.1.bias', 'head.head.base_forecast_block.1.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
finetune_prediction_trainer.train()

Epoch,Training Loss,Validation Loss
1,0.3428,0.133379
2,0.2784,0.128137
3,0.2615,0.127083
4,0.2462,0.126059
5,0.2364,0.125614
6,0.2299,0.124916
7,0.2241,0.124585
8,0.2204,0.12536
9,0.2169,0.125464
10,0.2153,0.124929


TrainOutput(global_step=18054, training_loss=0.230753371125772, metrics={'train_runtime': 201.3603, 'train_samples_per_second': 16861.811, 'train_steps_per_second': 527.413, 'total_flos': 2434564886347776.0, 'train_loss': 0.230753371125772, 'epoch': 17.0})

In [28]:
finetune_prediction_trainer.evaluate(dset_test)

{'eval_loss': 0.20171673595905304,
 'eval_runtime': 0.5134,
 'eval_samples_per_second': 22251.699,
 'eval_steps_per_second': 23.372,
 'epoch': 17.0}

In [29]:
finetune_prediction_trainer.evaluate(dset_val)

{'eval_loss': 0.12458471208810806,
 'eval_runtime': 0.5215,
 'eval_samples_per_second': 21909.806,
 'eval_steps_per_second': 23.012,
 'epoch': 17.0}

In [30]:
finetune_prediction_trainer.save_model("./dump/ettm2/finetune_prediction/patchtsmixer_finetune_prediction_ettm2")

In [31]:
# get train dataloader
eval_dataloader = finetune_prediction_trainer.get_eval_dataloader()
eval_dataloader

<accelerate.data_loader.DataLoaderShard at 0x14dcadef0f70>

In [32]:
loaded_model = PatchTSMixerForForecasting.from_pretrained('./dump/ettm2/finetune_prediction/patchtsmixer_finetune_prediction_ettm2').to("cuda:0")

In [33]:
for X in eval_dataloader:
    print(X.keys())
    print(X["context_values"].shape, X["target_values"].shape)
    torch.save(X, "./dump/ettm2/finetune_prediction/batch_prediction.pt")
    break

dict_keys(['context_values', 'target_values'])
torch.Size([1024, 512, 7]) torch.Size([1024, 96, 7])


In [34]:
torch.manual_seed(0)
loaded_model.eval()
with torch.no_grad():
    output_batch = loaded_model(X["context_values"])
print(output_batch.prediction_logits.shape)
print(output_batch.prediction_logits[0, :1, :7])

torch.Size([1024, 96, 2])
tensor([[ 0.0500, -0.1313]], device='cuda:0')


Note that, the output has only 2 channels

## 5. Register model under Auto Classes and load the model

In [35]:
from transformers import AutoConfig, AutoModel, AutoModelForPreTraining

In [36]:
# AutoConfig.register("patchtsmixer", PatchTSMixerConfig)
AutoModelForPreTraining.register(PatchTSMixerConfig, PatchTSMixerForPretraining)

In [37]:
auto_model = AutoModelForPreTraining.from_pretrained('./dump/ettm2/pretrain/patchtsmixer_pretrained_ettm2')

In [38]:
# auto_model(test_data.to('mps')).prediction_logits
auto_model(dset_test.__getitem__(0)['context_values'].reshape(1,seq_len,n_features)).prediction_logits

tensor([[[[ 1.3256e+00,  1.3209e+00,  1.3326e+00,  ...,  1.0987e+00,
            1.1460e+00,  1.1975e+00],
          [ 1.3033e+00,  1.3028e+00,  1.2580e+00,  ...,  9.0834e-01,
            8.8601e-01,  8.6099e-01],
          [ 5.1945e-01,  5.4233e-01,  5.7474e-01,  ...,  6.1828e-01,
            6.4622e-01,  6.5892e-01],
          ...,
          [ 6.9123e-01,  7.5057e-01,  7.9863e-01,  ...,  8.5491e-01,
            8.3613e-01,  7.7443e-01],
          [ 1.0362e+00,  1.0745e+00,  1.0930e+00,  ...,  1.0928e-01,
            9.6637e-02,  7.7504e-02],
          [ 6.7881e-01,  6.9865e-01,  7.1106e-01,  ...,  6.5421e-01,
            6.3993e-01,  6.2467e-01]],

         [[ 9.9452e-02,  1.0485e-01,  1.0052e-01,  ...,  1.3974e-01,
            1.2174e-01,  9.8005e-02],
          [ 1.1278e-01,  1.1193e-01,  1.0288e-01,  ...,  1.7026e-01,
            1.5399e-01,  1.3636e-01],
          [ 1.3699e-01,  1.3539e-01,  1.2659e-01,  ...,  1.7387e-01,
            1.5944e-01,  1.3994e-01],
          ...,
     