In [2]:
import lightning as L
from lightning.pytorch.loggers import CSVLogger
from key_utils import KeyDataModule, KeyClf, label2id, id2label
from transformers import VivitImageProcessor, VivitForVideoClassification
import torch

# Shared processor: loaded once at import time and reused for every clip.
image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")


def preprocess(frames):
    """Convert the frames of one clip into a ViViT ``pixel_values`` tensor.

    Args:
        frames: Iterable of frames that together form a single clip. The
            frames are handed to ``image_processor`` as one list, so they are
            treated as one video.

    Returns:
        The ``pixel_values`` entry for that clip, with the leading batch axis
        added by the processor stripped off.

    Raises:
        ValueError: If the processor cannot turn the frames into a tensor.
            The original error is chained and the message lists the number of
            frames and each frame's shape, because the processor's own
            "activate padding" hint does not say which clip was malformed.
    """
    # Materialise once so the same sequence can be both processed and,
    # on failure, inspected (``frames`` may be a one-shot iterable).
    frame_list = list(frames)
    try:
        out = image_processor(frame_list, return_tensors="pt")
    except ValueError as err:
        shapes = [getattr(frame, "shape", None) for frame in frame_list]
        raise ValueError(
            f"image_processor could not build a tensor from {len(frame_list)} "
            f"frame(s) with shapes {shapes}: {err}"
        ) from err
    # The processor returns a batch of videos; exactly one clip was passed in.
    return out["pixel_values"][0]

class ViVitKeyClf(KeyClf):
    """``KeyClf`` variant whose backbone is a pretrained ViViT video classifier.

    The pretrained checkpoint is loaded with the project's ``id2label`` /
    ``label2id`` mappings and a custom clip length. Both differ from the
    checkpoint's configuration, so ``ignore_mismatched_sizes=True`` is
    required: the classification head and the position embeddings are
    re-initialised instead of being loaded.
    """

    def __init__(
        self,
        weights,
        num_classes=len(id2label),
        learning_rate=0.01,
        *,
        num_frames=5,
        checkpoint="google/vivit-b-16x2-kinetics400",
    ):
        """Build the classifier.

        Args:
            weights: Forwarded unchanged to ``KeyClf.__init__``.
            num_classes: Forwarded unchanged to ``KeyClf.__init__``.
                NOTE(review): the ViViT head size is derived from
                ``id2label``, not from this value - confirm they always agree.
            learning_rate: Forwarded unchanged to ``KeyClf.__init__``.
            num_frames: Number of frames per clip the model is configured for.
            checkpoint: Hugging Face model id or path of the pretrained
                ViViT weights to start from.
        """
        super().__init__(weights, num_classes, learning_rate)
        self.model = VivitForVideoClassification.from_pretrained(
            checkpoint,
            id2label=id2label,
            label2id=label2id,
            ignore_mismatched_sizes=True,
            num_frames=num_frames,
        )

    def common_step(self, batch):
        """Run one forward pass and compute the loss and predicted class ids.

        Args:
            batch: A ``(videos, targets)`` pair. ``videos`` is passed to the
                model as ``pixel_values``; ``targets`` is cast to ``long``
                before being given to ``self.loss_fn``.

        Returns:
            A ``(loss, pred_ids)`` tuple where ``pred_ids`` is the argmax of
            the logits along dimension 1.
        """
        videos, targets = batch
        logits = self.model(pixel_values=videos).logits
        # ``self.loss_fn`` is not defined in this class; presumably it is
        # provided by ``KeyClf`` - verify against the base class.
        loss = self.loss_fn(logits, targets.long())
        pred_ids = torch.argmax(logits, dim=1)
        return loss, pred_ids

# Data: labelled frame sequences, converted to ViViT inputs by ``preprocess``.
dm = KeyDataModule(
    labels_dir='datasets/labels',
    videos_dir='datasets/raw_frames',
    color_channel_last=True,
    preprocess=preprocess,
    num_workers=0,
    batch_size=2,
)

# Model: the weights passed to the classifier come from the data module.
module = ViVitKeyClf(weights=dm.train_weights)

# Metrics are written as CSV under logs/vivit/.
logger = CSVLogger("logs", name="vivit")

# NOTE: pass ``deterministic=True`` to the Trainer for reproducible runs.
# NOTE(review): ``devices=[0, 1]`` requires two GPUs to be visible - confirm
# this matches the machine the notebook is executed on.
trainer = L.Trainer(
    devices=[0, 1],
    accelerator="gpu",
    fast_dev_run=False,
    logger=logger,
    max_epochs=100,
)

# Train, then evaluate the same in-memory module on the test split.
trainer.fit(module, dm)
trainer.test(module, dm)

Some weights of VivitForVideoClassification were not initialized from the model checkpoint at google/vivit-b-16x2-kinetics400 and are newly initialized because the shapes did not match:
- vivit.embeddings.position_embeddings: found shape torch.Size([1, 3137, 768]) in the checkpoint and torch.Size([1, 393, 768]) in the model instantiated
- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([31, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([31]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name     | Type                        | Params | Mode 
-----------------------------------------------------------------
0 | loss_fn  | CrossEntropyLoss            | 0      | train
1 | accu

Train: 77589; Val: 7168;
Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/haily/.pyenv/versions/3.10.4/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


                                                                           

/Users/haily/.pyenv/versions/3.10.4/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 0:   0%|          | 36/38795 [00:26<7:50:35,  1.37it/s, v_num=5]

ValueError: Unable to create tensor, you should probably activate padding with 'padding=True' to have batched tensors with the same length.