# PatchTSMixer workflow examples on ETTM dataset

In [1]:
from transformers import PatchTSMixerConfig, Trainer, TrainingArguments
import torch
from torch.utils.data import random_split
from torch.utils.data import Dataset
import os
import numpy as np
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


## Data prep

Load and prepare the ETTm1 data (train/validation/test splits) used by the pretraining, forecasting and fine-tuning examples below.

In [2]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

class Dataset_ETT_minute(Dataset):
    """Sliding-window dataset over an ETT CSV sampled four times per hour.

    The file is cut into fixed train / val / test row ranges of 12, 4 and 4
    blocks of ``30 * 24 * 4`` rows. The val and test ranges start ``seq_len``
    rows early so that their first window has a full context. Each item is a
    dict holding ``context_values`` (``seq_len`` rows) and ``target_values``
    (``label_len + pred_len`` rows, starting ``label_len`` rows before the end
    of the context) as float tensors.

    Args:
        root_path: Directory that contains the CSV file.
        split: One of ``'train'``, ``'val'`` or ``'test'``.
        size: ``[seq_len, label_len, pred_len]``; defaults to
            ``[384, 96, 96]`` when ``None``.
        features: ``'M'`` or ``'MS'`` keeps every column after the first one,
            ``'S'`` keeps only the ``target`` column.
        data_path: CSV file name inside ``root_path``.
        target: Column used when ``features == 'S'``.
        scale: Standardise the data with statistics fitted on the train rows.
        timeenc: ``0`` builds month/day/weekday/hour/quarter-hour stamps,
            ``1`` delegates to ``time_features``.
        freq: Frequency string forwarded to ``time_features``.
        use_time_features: Stored on the instance; ``__getitem__`` does not
            return time stamps.
    """

    # Rows in one 30-day block: 30 days * 24 hours * 4 samples per hour.
    _ROWS_PER_MONTH = 30 * 24 * 4

    def __init__(self, root_path='/dccstor/dnn_forecasting/FM/data/ETDataset/ETT-small/', split='train', size=None,
                 features='M', data_path='ETTm1.csv',
                 target='OT', scale=True, timeenc=0, freq='t',
                 use_time_features=False
                 ):
        # size is [seq_len, label_len, pred_len]
        if size is None:
            self.seq_len = 24 * 4 * 4
            self.label_len = 24 * 4
            self.pred_len = 24 * 4
        else:
            self.seq_len, self.label_len, self.pred_len = size[0], size[1], size[2]

        assert split in ['train', 'test', 'val'], f"unknown split: {split!r}"
        type_map = {'train': 0, 'val': 1, 'test': 2}
        self.set_type = type_map[split]

        self.features = features
        self.target = target
        self.scale = scale
        self.timeenc = timeenc
        self.freq = freq
        self.use_time_features = use_time_features

        self.root_path = root_path
        self.data_path = data_path
        self.__read_data__()

    def __read_data__(self):
        """Load the CSV, optionally scale it, and keep the selected split's rows."""
        self.scaler = StandardScaler()
        df_raw = pd.read_csv(os.path.join(self.root_path, self.data_path))

        month = self._ROWS_PER_MONTH
        train_end = 12 * month
        val_end = train_end + 4 * month
        test_end = train_end + 8 * month
        # val/test begin seq_len rows early so the first window has context.
        border1s = [0, train_end - self.seq_len, val_end - self.seq_len]
        border2s = [train_end, val_end, test_end]
        border1 = border1s[self.set_type]
        border2 = border2s[self.set_type]

        if self.features in ('M', 'MS'):
            df_data = df_raw[df_raw.columns[1:]]
        elif self.features == 'S':
            df_data = df_raw[[self.target]]
        else:
            raise ValueError(
                f"features must be 'M', 'MS' or 'S', got {self.features!r}"
            )

        if self.scale:
            # Fit on the train rows only, then transform the whole file.
            self.scaler.fit(df_data[border1s[0]:border2s[0]].values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values

        dates = pd.to_datetime(df_raw['date'].iloc[border1:border2])
        if self.timeenc == 0:
            # Built directly from the .dt accessors; this avoids the positional
            # `axis` argument to DataFrame.drop that newer pandas rejects.
            data_stamp = pd.DataFrame({
                'month': dates.dt.month,
                'day': dates.dt.day,
                'weekday': dates.dt.weekday,
                'hour': dates.dt.hour,
                'minute': dates.dt.minute // 15,
            }).values
        elif self.timeenc == 1:
            # NOTE(review): `time_features` is neither defined nor imported in
            # the visible part of this file - confirm it is in scope.
            data_stamp = time_features(pd.to_datetime(dates.values), freq=self.freq)
            data_stamp = data_stamp.transpose(1, 0)
        else:
            raise ValueError(f"timeenc must be 0 or 1, got {self.timeenc!r}")

        self.data_x = data[border1:border2]
        self.data_y = data[border1:border2]
        self.data_stamp = data_stamp

    def __getitem__(self, index):
        """Return the ``index``-th window as a dict of float tensors.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
                Raising here lets plain ``for item in dataset`` terminate.
        """
        n_windows = len(self)
        if index < 0:
            index += n_windows
        if not 0 <= index < n_windows:
            raise IndexError(f"index out of range for {n_windows} windows")

        s_begin = index
        s_end = s_begin + self.seq_len
        r_begin = s_end - self.label_len
        r_end = r_begin + self.label_len + self.pred_len

        seq_x = self.data_x[s_begin:s_end]
        seq_y = self.data_y[r_begin:r_end]
        return {"context_values": torch.Tensor(seq_x), "target_values": torch.Tensor(seq_y)}

    def __len__(self):
        """Number of complete windows; 0 when the split is too short for one."""
        return max(0, len(self.data_x) - self.seq_len - self.pred_len + 1)

    def inverse_transform(self, data):
        """Undo the standardisation applied by the fitted scaler."""
        return self.scaler.inverse_transform(data)

In [3]:
# Window geometry shared by the datasets and every model config below.
SEQ_LEN = 512       # rows in each context window
FORECAST_LEN = 96   # rows to predict
n_features = 7      # passed to the model configs as in_channels
seq_len = SEQ_LEN   # lower-case alias used by later cells

# Patches do not overlap: the stride equals the patch length.
patch_len = 16
stride = patch_len

# Dataset `size` argument: [seq_len, label_len, pred_len].
SIZE = [SEQ_LEN, 0, FORECAST_LEN]

In [4]:
# Build the three splits (in train, val, test order) with the same window sizes.
dset_train, dset_val, dset_test = (
    Dataset_ETT_minute(split=split_name, size=SIZE)
    for split_name in ("train", "val", "test")
)

  data_stamp = df_stamp.drop(['date'], 1).values
  data_stamp = df_stamp.drop(['date'], 1).values
  data_stamp = df_stamp.drop(['date'], 1).values


In [5]:
# Peek at the first validation window to check the tensor shapes.
dd = dset_val[0]
dd["context_values"].shape, dd["target_values"].shape

(torch.Size([512, 7]), torch.Size([96, 7]))

In [6]:
# Number of non-overlapping patches per context window. Shown for reference
# only: it is not passed to PatchTSMixerConfig in the cells below.
num_patches = seq_len//patch_len
num_patches

32

## 1. Directly train a `PatchTSMixer` forecasting model, and evaluate the test performance

In [7]:
from transformers import PatchTSMixerForForecasting

# --- Model -----------------------------------------------------------------
forecast_config = PatchTSMixerConfig(
    # window and patching
    in_channels=n_features,
    seq_len=SEQ_LEN,
    forecast_len=FORECAST_LEN,
    patch_len=patch_len,
    stride=stride,
    # architecture
    mode="common_channel",
    num_features=48,
    num_layers=2,
    expansion_factor=3,
    revin=True,
    # dropout
    dropout=0.5,
    head_dropout=0.7,
)
forecast_model = PatchTSMixerForForecasting(forecast_config)

# --- Training arguments ------------------------------------------------------
forecast_args = TrainingArguments(
    # where checkpoints and TensorBoard logs are written
    output_dir="./dump/ettm1/direct_forecast/checkpoint",
    overwrite_output_dir=True,
    logging_dir="./dump/ettm1/direct_forecast/logs",
    report_to="tensorboard",
    # optimisation
    learning_rate=0.0001,
    num_train_epochs=100,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=1024,
    # evaluate, log and checkpoint once per epoch
    do_eval=True,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    # reload the checkpoint with the lowest validation loss when training ends
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

from transformers import EarlyStoppingCallback

# --- Early stopping ----------------------------------------------------------
# Stop after 10 evaluations without an improvement of at least 0.0001.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=10,
    early_stopping_threshold=0.0001,
)

# --- Trainer -----------------------------------------------------------------
forecast_trainer = Trainer(
    model=forecast_model,
    args=forecast_args,
    train_dataset=dset_train,
    eval_dataset=dset_val,
    callbacks=[early_stopping_callback],
)


In [8]:
# Train the direct-forecast model; the early-stopping callback may end the run
# before the 100-epoch cap is reached.
forecast_trainer.train()

Epoch,Training Loss,Validation Loss
1,0.3377,0.405758
2,0.2839,0.395581
3,0.2769,0.388832
4,0.2735,0.389827
5,0.2713,0.387664
6,0.2697,0.382318
7,0.268,0.381695
8,0.2667,0.380462
9,0.2653,0.380906
10,0.2643,0.380674


TrainOutput(global_step=56286, training_loss=0.2560363618803063, metrics={'train_runtime': 947.3042, 'train_samples_per_second': 3584.171, 'train_steps_per_second': 112.108, 'total_flos': 7590114057437184.0, 'train_loss': 0.2560363618803063, 'epoch': 53.0})

In [9]:
# Score the trained model on the test split (the trainer's own eval set is the
# validation split).
forecast_trainer.evaluate(dset_test)

{'eval_loss': 0.2869480848312378,
 'eval_runtime': 0.731,
 'eval_samples_per_second': 15628.272,
 'eval_steps_per_second': 16.415,
 'epoch': 53.0}

## 2. Pretrain a `PatchTSMixer` model with HF trainer

In [10]:
from transformers import PatchTSMixerForPretraining

# --- Model: same hyper-parameters as the direct-forecast config --------------
pretrain_config = PatchTSMixerConfig(
    # window and patching
    in_channels=n_features,
    seq_len=SEQ_LEN,
    forecast_len=FORECAST_LEN,
    patch_len=patch_len,
    stride=stride,
    # architecture
    mode="common_channel",
    num_features=48,
    num_layers=2,
    expansion_factor=3,
    revin=True,
    # dropout
    dropout=0.5,
    head_dropout=0.7,
)
pretrain_model = PatchTSMixerForPretraining(pretrain_config)

# --- Training arguments ------------------------------------------------------
pretrain_args = TrainingArguments(
    # where checkpoints and TensorBoard logs are written
    output_dir="./dump/ettm1/pretrain/checkpoint",
    overwrite_output_dir=True,
    logging_dir="./dump/ettm1/pretrain/logs",
    report_to="tensorboard",
    # optimisation
    learning_rate=0.0001,
    num_train_epochs=100,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=1024,
    # evaluate, log and checkpoint once per epoch
    do_eval=True,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    # reload the checkpoint with the lowest validation loss when training ends
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# --- Trainer (reuses the early-stopping callback created earlier) ------------
pretrain_trainer = Trainer(
    model=pretrain_model,
    args=pretrain_args,
    train_dataset=dset_train,
    eval_dataset=dset_val,
    callbacks=[early_stopping_callback],
)

In [11]:
# Run pretraining on the training split; validation loss is computed after
# every epoch.
pretrain_trainer.train()

Epoch,Training Loss,Validation Loss
1,0.7454,0.423507
2,0.5107,0.356104
3,0.4652,0.339474
4,0.4484,0.333207
5,0.4409,0.33055
6,0.4361,0.326977
7,0.4331,0.323584
8,0.4302,0.321314
9,0.4284,0.317115
10,0.4252,0.31451


TrainOutput(global_step=106200, training_loss=0.3983063304491636, metrics={'train_runtime': 1777.9815, 'train_samples_per_second': 1909.637, 'train_steps_per_second': 59.731, 'total_flos': 3605066740531200.0, 'train_loss': 0.3983063304491636, 'epoch': 100.0})

In [12]:
# Report the pretraining loss on the test split.
pretrain_trainer.evaluate(dset_test)

{'eval_loss': 0.2903543710708618,
 'eval_runtime': 0.5352,
 'eval_samples_per_second': 21347.697,
 'eval_steps_per_second': 22.422,
 'epoch': 100.0}

### Save batch and output for pytests (optional)

In [13]:
# Get the evaluation dataloader (no dataset is passed, so the trainer's own
# eval dataset, the validation split, is used).
eval_dataloader = pretrain_trainer.get_eval_dataloader()
eval_dataloader

<accelerate.data_loader.DataLoaderShard at 0x14c048dd9dc0>

In [14]:
# Take only the first evaluation batch and save it as a fixture for unit tests.
# `X` stays bound after the loop and is reused by the inference cell below.
for X in eval_dataloader:
    print(X.keys())
    print(X["context_values"].shape)
    # NOTE(review): the tensors are saved on whatever device the dataloader put
    # them on - confirm the test that loads this file uses a suitable map_location.
    torch.save(X, "./dump/ettm1/pretrain/batch.pt")
    break  # one batch is enough

dict_keys(['context_values'])
torch.Size([1024, 512, 7])


### Saving and loading the pretrained model

In [15]:
# Write the pretrained model to disk; later cells reload it from this path
# with `from_pretrained`.
pretrain_trainer.save_model("./dump/ettm1/pretrain/patchtsmixer_pretrained_ettm1")

In [16]:
# Reload the saved pretraining model. Use the GPU when one is present and fall
# back to the CPU otherwise, so the cell also runs on CPU-only machines.
loaded_model = PatchTSMixerForPretraining.from_pretrained('./dump/ettm1/pretrain/patchtsmixer_pretrained_ettm1').to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

In [17]:
# Run the saved batch through the reloaded model and print a small slice of the
# output so it can be pinned in a unit test.
torch.manual_seed(0)
loaded_model.eval()  # inference mode for the forward pass
with torch.no_grad():
    # Call the module itself rather than `.forward` so registered hooks run.
    # The batch is used with its own shape (no reshape to a hard-coded 1024
    # rows) and is moved to whichever device the model lives on.
    d_out = loaded_model(X['context_values'].to(loaded_model.device))
print(d_out.prediction_logits[0, :7, :1, :1])

tensor([[[-0.8842]],

        [[-0.3338]],

        [[-0.8968]],

        [[-0.3681]],

        [[ 0.8460]],

        [[ 1.0702]],

        [[ 1.5765]]], device='cuda:0')


Copy the values above into a pytest and use the dumped model and batch to verify them (optional).

## 3. Use the pretrained model (step 2) to finetune for a forecasting task

In [18]:
# Start from the pretrained checkpoint. The forecasting head weights are not in
# that checkpoint, so they are newly initialised and trained here.
finetune_forecast_model = PatchTSMixerForForecasting.from_pretrained(
    "./dump/ettm1/pretrain/patchtsmixer_pretrained_ettm1"
)

# --- Training arguments ------------------------------------------------------
finetune_forecast_args = TrainingArguments(
    # where checkpoints and TensorBoard logs are written
    output_dir="./dump/ettm1/finetune_forecast/checkpoint",
    overwrite_output_dir=True,
    logging_dir="./dump/ettm1/finetune_forecast/logs",
    report_to="tensorboard",
    # optimisation
    learning_rate=0.0001,
    num_train_epochs=100,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=1024,
    # evaluate, log and checkpoint once per epoch
    do_eval=True,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    # reload the checkpoint with the lowest validation loss when training ends
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# --- Trainer (reuses the early-stopping callback created earlier) ------------
finetune_forecast_trainer = Trainer(
    model=finetune_forecast_model,
    args=finetune_forecast_args,
    train_dataset=dset_train,
    eval_dataset=dset_val,
    callbacks=[early_stopping_callback],
)

Some weights of PatchTSMixerForForecasting were not initialized from the model checkpoint at ./dump/ettm1/pretrain/patchtsmixer_pretrained_ettm1 and are newly initialized: ['head.head.base_forecast_block.1.weight', 'head.head.base_forecast_block.1.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Fine-tune the pretrained model on the forecasting task.
finetune_forecast_trainer.train()

Epoch,Training Loss,Validation Loss
1,0.3264,0.410178
2,0.2824,0.39829
3,0.2736,0.393878
4,0.2689,0.390991
5,0.2663,0.387785
6,0.2637,0.379523
7,0.2621,0.378832
8,0.2611,0.377267
9,0.2593,0.378713
10,0.2579,0.378195


TrainOutput(global_step=56286, training_loss=0.24735123676662907, metrics={'train_runtime': 946.8288, 'train_samples_per_second': 3585.97, 'train_steps_per_second': 112.164, 'total_flos': 7590114057437184.0, 'train_loss': 0.24735123676662907, 'epoch': 53.0})

In [20]:
# Score the fine-tuned forecaster on the test split.
finetune_forecast_trainer.evaluate(dset_test)

{'eval_loss': 0.28315261006355286,
 'eval_runtime': 0.7417,
 'eval_samples_per_second': 15403.62,
 'eval_steps_per_second': 16.179,
 'epoch': 53.0}

### Save/Load and dump outputs for pytest (optional)

In [21]:
# Write the fine-tuned forecaster to disk; it is reloaded from this path below.
finetune_forecast_trainer.save_model("./dump/ettm1/finetune_forecast/patchtsmixer_finetune_forecast_ettm1")

In [22]:
# Get the evaluation dataloader (no dataset is passed, so the trainer's own
# eval dataset, the validation split, is used).
eval_dataloader = finetune_forecast_trainer.get_eval_dataloader()
eval_dataloader

<accelerate.data_loader.DataLoaderShard at 0x14c049361ee0>

In [23]:
# Reload the fine-tuned forecaster. Use the first GPU when CUDA is available
# and fall back to the CPU otherwise, so the cell also runs on CPU-only machines.
loaded_model = PatchTSMixerForForecasting.from_pretrained('./dump/ettm1/finetune_forecast/patchtsmixer_finetune_forecast_ettm1').to(
    "cuda:0" if torch.cuda.is_available() else "cpu"
)

In [24]:
# Take only the first evaluation batch and save it as a fixture for unit tests.
# `X` stays bound after the loop and is reused by the inference cell below.
for X in eval_dataloader:
    print(X.keys())
    print(X["context_values"].shape, X["target_values"].shape)
    # NOTE(review): the tensors are saved on whatever device the dataloader put
    # them on - confirm the test that loads this file uses a suitable map_location.
    torch.save(X, "./dump/ettm1/finetune_forecast/batch_forecast.pt")
    break  # one batch is enough

dict_keys(['context_values', 'target_values'])
torch.Size([1024, 512, 7]) torch.Size([1024, 96, 7])


In [25]:
# Run the saved batch through the reloaded forecaster and print the first
# forecast step of the first sample (at most 7 channels) as a test reference.
torch.manual_seed(0)
loaded_model.eval()  # inference mode for the forward pass
with torch.no_grad():  # gradients are not needed for a reference dump
    output_batch = loaded_model(X["context_values"])
print(output_batch.prediction_logits[0, :1, :7])

tensor([[ 0.5633,  0.0606,  0.6069,  0.8175,  0.0398, -2.1611,  0.4113]],
       device='cuda:0')


Copy the values above into a pytest and use the dumped model and batch to verify them (optional).

## 4. Use pretrained model to finetune for a prediction task

In [26]:
# The forecast channel indices can be set on the config before pretraining, or,
# as done here, added afterwards and passed in again when loading the checkpoint.
# Note that `update` changes `pretrain_config` in place.
pretrain_config.update({"forecast_channel_indices": [3, 5]})
finetune_prediction_model = PatchTSMixerForForecasting.from_pretrained(
    "./dump/ettm1/pretrain/patchtsmixer_pretrained_ettm1",
    config=pretrain_config,
)

# --- Training arguments ------------------------------------------------------
finetune_prediction_args = TrainingArguments(
    # where checkpoints and TensorBoard logs are written
    output_dir="./dump/ettm1/finetune_prediction/checkpoint",
    overwrite_output_dir=True,
    logging_dir="./dump/ettm1/finetune_prediction/logs",
    report_to="tensorboard",
    # optimisation
    learning_rate=0.0001,
    num_train_epochs=100,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=1024,
    # evaluate, log and checkpoint once per epoch
    do_eval=True,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    # reload the checkpoint with the lowest validation loss when training ends
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# --- Trainer (reuses the early-stopping callback created earlier) ------------
finetune_prediction_trainer = Trainer(
    model=finetune_prediction_model,
    args=finetune_prediction_args,
    train_dataset=dset_train,
    eval_dataset=dset_val,
    callbacks=[early_stopping_callback],
)

Some weights of PatchTSMixerForForecasting were not initialized from the model checkpoint at ./dump/ettm1/pretrain/patchtsmixer_pretrained_ettm1 and are newly initialized: ['head.head.base_forecast_block.1.weight', 'head.head.base_forecast_block.1.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Fine-tune the pretrained model for the two selected forecast channels.
finetune_prediction_trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2354,0.37238
2,0.2042,0.358782
3,0.1967,0.355695
4,0.1924,0.348001
5,0.1902,0.351776
6,0.1877,0.346249
7,0.1858,0.343175
8,0.1847,0.343602
9,0.1839,0.345523
10,0.1826,0.347197


TrainOutput(global_step=30798, training_loss=0.1831576220206885, metrics={'train_runtime': 526.8925, 'train_samples_per_second': 6444.009, 'train_steps_per_second': 201.559, 'total_flos': 4153081276710912.0, 'train_loss': 0.1831576220206885, 'epoch': 29.0})

In [28]:
# Score the fine-tuned prediction model on the test split.
finetune_prediction_trainer.evaluate(dset_test)

{'eval_loss': 0.12233199924230576,
 'eval_runtime': 0.7232,
 'eval_samples_per_second': 15798.143,
 'eval_steps_per_second': 16.593,
 'epoch': 29.0}

In [29]:
# Score the same model on the validation split for comparison with the test loss.
finetune_prediction_trainer.evaluate(dset_val)

{'eval_loss': 0.34136372804641724,
 'eval_runtime': 0.7282,
 'eval_samples_per_second': 15690.168,
 'eval_steps_per_second': 16.48,
 'epoch': 29.0}

In [30]:
# Write the fine-tuned prediction model to disk; it is reloaded from this path below.
finetune_prediction_trainer.save_model("./dump/ettm1/finetune_prediction/patchtsmixer_finetune_prediction_ettm1")

In [31]:
# Get the evaluation dataloader (no dataset is passed, so the trainer's own
# eval dataset, the validation split, is used).
eval_dataloader = finetune_prediction_trainer.get_eval_dataloader()
eval_dataloader

<accelerate.data_loader.DataLoaderShard at 0x14c049217280>

In [32]:
# Reload the fine-tuned prediction model. Use the first GPU when CUDA is
# available and fall back to the CPU otherwise, so the cell also runs on
# CPU-only machines.
loaded_model = PatchTSMixerForForecasting.from_pretrained('./dump/ettm1/finetune_prediction/patchtsmixer_finetune_prediction_ettm1').to(
    "cuda:0" if torch.cuda.is_available() else "cpu"
)

In [33]:
# Take only the first evaluation batch and save it as a fixture for unit tests.
# `X` stays bound after the loop and is reused by the inference cell below.
for X in eval_dataloader:
    print(X.keys())
    print(X["context_values"].shape, X["target_values"].shape)
    # NOTE(review): the tensors are saved on whatever device the dataloader put
    # them on - confirm the test that loads this file uses a suitable map_location.
    torch.save(X, "./dump/ettm1/finetune_prediction/batch_prediction.pt")
    break  # one batch is enough

dict_keys(['context_values', 'target_values'])
torch.Size([1024, 512, 7]) torch.Size([1024, 96, 7])


In [34]:
# Run the saved batch through the reloaded prediction model, then print the
# output shape and the first forecast step of the first sample as a test reference.
torch.manual_seed(0)
loaded_model.eval()  # inference mode for the forward pass
with torch.no_grad():  # gradients are not needed for a reference dump
    output_batch = loaded_model(X["context_values"])
print(output_batch.prediction_logits.shape)
# The slice asks for up to 7 channels; only as many as the model outputs are printed.
print(output_batch.prediction_logits[0, :1, :7])

torch.Size([1024, 96, 2])
tensor([[ 0.8108, -2.2082]], device='cuda:0')


Note that the output has only 2 channels, one for each of the `forecast_channel_indices` ([3, 5]) set in the config above.

## 5. Register model under Auto Classes and load the model

In [35]:
from transformers import AutoConfig, AutoModel, AutoModelForPreTraining

In [36]:
# Map PatchTSMixerConfig to PatchTSMixerForPretraining so that
# AutoModelForPreTraining can resolve the saved checkpoint.
# NOTE(review): the AutoConfig registration is left disabled - presumably the
# config type is already registered in this transformers build; confirm.
# AutoConfig.register("patchtsmixer", PatchTSMixerConfig)
AutoModelForPreTraining.register(PatchTSMixerConfig, PatchTSMixerForPretraining)

In [37]:
# Load the pretrained checkpoint through the Auto class registered above.
auto_model = AutoModelForPreTraining.from_pretrained('./dump/ettm1/pretrain/patchtsmixer_pretrained_ettm1')

In [38]:
# Run the first test window through the auto-loaded model as a batch of one
# and display its prediction logits.
first_context = dset_test[0]['context_values']
auto_model(first_context.reshape(1, seq_len, n_features)).prediction_logits

tensor([[[[ 0.1527,  0.2205,  0.2780,  ...,  0.3576,  0.3416,  0.3166],
          [ 0.2882,  0.3150,  0.3378,  ...,  0.7706,  0.7765,  0.7621],
          [ 0.8379,  0.8387,  0.8289,  ...,  0.7195,  0.6866,  0.6433],
          ...,
          [-1.4859, -1.4634, -1.4249,  ..., -0.9950, -1.0377, -1.0746],
          [-0.4655, -0.3501, -0.1981,  ...,  0.3212,  0.3111,  0.3068],
          [ 0.2615,  0.2717,  0.2818,  ...,  0.3389,  0.3420,  0.3430]],

         [[-0.8495, -0.8907, -0.9494,  ..., -1.0969, -1.0805, -1.0625],
          [-1.2213, -1.2126, -1.1874,  ...,  0.4999,  0.5721,  0.6187],
          [ 0.7422,  0.7223,  0.7173,  ...,  0.7334,  0.7324,  0.7170],
          ...,
          [-0.3005, -0.3168, -0.3259,  ..., -0.6412, -0.6928, -0.7043],
          [-0.8660, -0.9424, -1.0336,  ..., -1.4231, -1.3920, -1.3620],
          [-0.7871, -0.7731, -0.7394,  ...,  0.0594,  0.0977,  0.1063]],

         [[ 0.2057,  0.2584,  0.2959,  ...,  0.3389,  0.3248,  0.2995],
          [ 0.3061,  0.3338,  