# Throughput testing

During the development of new models, it is often useful to test the throughput of the data pipeline. To enable this, ConfigILM provides a `ThroughputTest_DataModule` and a corresponding `ThroughputTestDataset`. The dataset does not load any actual data; instead, it generates a single dummy sample during initialization and returns that same sample for every call to `__getitem__()`. The fake length of the dataset can be set with the `num_samples` parameter.

To run the throughput test, we first create the model and then pass the respective DataModule to the trainer. For more details on creating the model, see [the page on VQA model creation](vqa.ipynb). The code here is almost identical to the code on that page, with some parts shortened.

In [None]:
# import packages
import pytorch_lightning as pl
import torch
import torch.nn.functional as F
from torch import optim

from configilm import ConfigILM

class LitVQAEncoder(pl.LightningModule):
    """
    Pytorch lightning wrapper around a ConfigILM model.

    Wrapping the pytorch module lets pytorch lightning drive the training
    automatically and, among other things, removes the need to manage data on
    different devices (e.g. GPU and CPU) by hand.

    Training, validation and test all use binary cross-entropy on the raw
    outputs (logits) of the wrapped model. Validation and test results are
    collected per batch and their losses averaged at the end of the epoch.
    """

    def __init__(
        self,
        config: ConfigILM.ILMConfiguration,
        lr: float = 1e-3,
    ):
        """
        :param config: configuration the wrapped ConfigILM model is built from
        :param lr: learning rate handed to the optimizer
        """
        super().__init__()
        self.config = config
        self.lr = lr
        self.model = ConfigILM.ConfigILM(config)
        # per-batch results; re-created at the start of every val/test epoch
        self.val_output_list = []
        self.test_output_list = []

    # ===================== HELPERS =====================

    def _disassemble_batch(self, batch):
        """
        Split a batch into the model input and the labels.

        :param batch: triple of (images, questions, labels)
        :return: ``((images, questions), labels)`` with the questions converted
            to a transposed integer tensor on the device of this module
        """
        images, raw_questions, labels = batch
        # transposing tensor, needed for Huggingface-Dataloader combination
        rows = [entry.tolist() for entry in raw_questions]
        questions = torch.tensor(rows, device=self.device).T.int()
        return (images, questions), labels

    def _run_batch(self, batch):
        """
        Push one batch through the wrapped model and compute its loss.

        :return: triple of (loss, model outputs, labels)
        """
        inputs, targets = self._disassemble_batch(batch)
        logits = self.model(inputs)
        loss = F.binary_cross_entropy_with_logits(logits, targets)
        return loss, logits, targets

    @staticmethod
    def _mean_loss(records):
        """Average the ``"loss"`` entries of the collected per-batch records."""
        losses = [record["loss"] for record in records]
        return torch.stack(losses).mean()

    # ================ MANDATORY FUNCTIONS ================

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss of a single batch."""
        loss, _, _ = self._run_batch(batch)
        self.log("train/loss", loss)
        return {"loss": loss}

    def configure_optimizers(self):
        """Create the AdamW optimizer over all parameters of this module."""
        return optim.AdamW(self.parameters(), lr=self.lr, weight_decay=0.01)

    # ============== NON-MANDATORY-FUNCTION ===============

    def on_validation_epoch_start(self):
        """Start every validation epoch with an empty result list."""
        super().on_validation_epoch_start()
        self.val_output_list = []

    def validation_step(self, batch, batch_idx):
        """Evaluate one validation batch and remember its result."""
        loss, logits, targets = self._run_batch(batch)
        record = {"loss": loss, "outputs": logits, "labels": targets}
        self.val_output_list.append(record)

    def on_validation_epoch_end(self):
        """Log the mean loss over all validation batches of this epoch."""
        self.log("val/loss", self._mean_loss(self.val_output_list))

    def on_test_epoch_start(self):
        """Start every test epoch with an empty result list."""
        super().on_test_epoch_start()
        self.test_output_list = []

    def test_step(self, batch, batch_idx):
        """Evaluate one test batch and remember its result."""
        loss, logits, targets = self._run_batch(batch)
        record = {"loss": loss, "outputs": logits, "labels": targets}
        self.test_output_list.append(record)

    def on_test_epoch_end(self):
        """Log the mean loss over all test batches of this epoch."""
        self.log("test/loss", self._mean_loss(self.test_output_list))

    def forward(self, batch):
        """Delegate to the wrapped model."""
        # because we are a wrapper, we call the inner function manually
        return self.model(batch)

# Trainer that is later used for both fitting and testing the model.
trainer = pl.Trainer(
    max_epochs=4,  # a few epochs are enough for a throughput measurement
    accelerator="auto",  # let pytorch lightning pick the accelerator
    log_every_n_steps=1,
    logger=False,  # no logger is attached to this trainer
)

In [0]:
from configilm.ConfigILM import ILMConfiguration, ILMType
# --- image encoder (timm) ---
image_model_name = "resnet18"
number_of_channels = 12
image_size = 120

# --- text encoder (Huggingface) ---
text_model_name = "prajjwal1/bert-tiny"
seq_len = 32

# --- task and optimization ---
classes = 25
lr = 5e-4

# Assemble the configuration of a VQA classification network from the
# settings above and wrap the resulting model in the lightning module.
model_config = ILMConfiguration(
    network_type=ILMType.VQA_CLASSIFICATION,
    timm_model_name=image_model_name,
    channels=number_of_channels,
    image_size=image_size,
    hf_model_name=text_model_name,
    max_sequence_length=seq_len,
    classes=classes,
)
model = LitVQAEncoder(config=model_config, lr=lr)

Next, we create the `ThroughputTestDataModule`, which is used in place of a DataModule with real data.

In [ ]:
from configilm.extra.DataModules import ThroughputTest_DataModule
# Dummy DataModule; image size and sequence length reuse the values that were
# used for the model configuration.
dm = ThroughputTest_DataModule.ThroughputTestDataModule(
    data_dirs={},  # parameter is ignored but required for compatibility with other DataModules in ConfigILM
    img_size=(number_of_channels, image_size, image_size),  # channels first, then the two spatial dimensions
    seq_length=seq_len,
    num_samples=32*16,  # number of "samples" in this dataset -> each sample is the same one
    batch_size=32,  # 32 * 16 = 512 samples correspond to 16 full batches of this size
)

Now we train the model using this fake DataModule.

In [ ]:
# train the wrapped model on the dummy samples provided by the DataModule
trainer.fit(model, dm)

We can also evaluate the model using the same fake DataModule.

In [ ]:
# run the test loop of the model with the same dummy DataModule
trainer.test(model, dm)