REF: https://pytorchvideo.org/docs/tutorial_torchhub_inference

In [None]:
import torch
import json
from typing import Dict
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import CenterCropVideo, NormalizeVideo

from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import ApplyTransformToKey, ShortSideScale, UniformTemporalSubsample, UniformCropVideo

Load Pretrained model

In [None]:
# Run on the GPU when torch reports one as available, otherwise on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Fetch the pretrained network from the PyTorchVideo hub repository.
model_name = "slowfast_r50"
model = torch.hub.load(
    "facebookresearch/pytorchvideo",
    model=model_name,
    pretrained=True,
)

# Place the model on the chosen device and switch it to evaluation mode.
model.to(device)
model.eval()

Get label

In [None]:
# uncomment to download data
# !wget https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json

In [None]:
# Read the class-name file. The encoding is given explicitly so decoding
# does not depend on the platform's default locale encoding.
with open("kinetics_classnames.json", "r", encoding="utf-8") as f:
    class_names = json.load(f)

# Invert the loaded mapping so each value (the class id) points to its key
# (the class name), with any literal double-quote characters removed.
id_to_name = {v: str(k).replace('"', "") for k, v in class_names.items()}
print(id_to_name)

Input transforms

The transform parameters depend on the selected model:

- [SlowFast](https://pytorch.org/hub/facebookresearch_pytorchvideo_slowfast/)
- [X3D](https://pytorch.org/hub/facebookresearch_pytorchvideo_x3d/)
- [Slow](https://pytorch.org/hub/facebookresearch_pytorchvideo_resnet/)

In [None]:
# Preprocessing settings used with the SlowFast model.

# Size given to ShortSideScale and to CenterCropVideo respectively.
side_size, crop_size = 256, 256

# Per-channel statistics given to NormalizeVideo (same value for all three).
mean = [0.45] * 3
std = [0.225] * 3

# Temporal settings: frames kept per clip, and the values used to derive
# the clip duration in seconds.
num_frames, sampling_rate, frames_per_second = 32, 2, 30

# Ratio between the number of fast-pathway and slow-pathway frames
# (read by PackPathway).
alpha = 4.0

class PackPathway(torch.nn.Module):
    """
    Transform that converts a tensor of video frames into a list of two
    tensors: ``[slow_pathway, fast_pathway]``.

    The fast pathway is the input itself. The slow pathway keeps
    ``frames.shape[1] // alpha`` evenly spaced entries along dim 1, where
    ``alpha`` is the module-level constant defined alongside the other
    SlowFast settings.
    """
    def __init__(self):
        super().__init__()

    def forward(self, frames: torch.Tensor):
        """
        Build the two-pathway input from ``frames``.

        Args:
            frames: tensor that is sub-sampled along dim 1 (presumably the
                temporal dim of a (C, T, H, W) clip; confirm against the
                upstream transforms).

        Returns:
            A list ``[slow_pathway, fast_pathway]``.
        """
        fast_pathway = frames
        num_fast_frames = frames.shape[1]
        # perform temporal sampling from the fast pathway; the step count
        # passed to linspace must be an integer. The indices are created on
        # the same device as ``frames`` because index_select requires the
        # input and the index tensor to live on the same device.
        slow_indices = torch.linspace(
            0,
            num_fast_frames - 1,
            int(num_fast_frames // alpha),
            device=frames.device,
        ).long()
        slow_pathway = torch.index_select(frames, 1, slow_indices)
        frame_list = [slow_pathway, fast_pathway]
        return frame_list

# Preprocessing steps, listed in the order they are applied to the clip.
_video_steps = [
    UniformTemporalSubsample(num_frames),
    Lambda(lambda x: x / 255.0),   # divide pixel values by 255
    NormalizeVideo(mean, std),
    ShortSideScale(size=side_size),
    CenterCropVideo(crop_size),
    PackPathway(),                 # the PackPathway class defined in this notebook
]

# Apply the composed steps to the entry stored under the "video" key.
transform = ApplyTransformToKey(key="video", transform=Compose(_video_steps))

# The length of the input clip (in seconds) is also specific to the model.
clip_duration = num_frames * sampling_rate / frames_per_second

Load video example

In [None]:
# Download the example video file, uncomment to download
# !wget https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4

In [None]:
# Path of the example video.
video_path = "archery.mp4"

# Clip window in seconds. The start should correspond to where the action
# occurs in the video; the end follows from the model's clip duration.
start_sec = 0
end_sec = start_sec + clip_duration

# Open the video and fetch the clip between start_sec and end_sec.
encoded_video = EncodedVideo.from_path(video_path)
video_data = encoded_video.get_clip(start_sec=start_sec, end_sec=end_sec)

# Run the preprocessing transform on the clip.
video_data = transform(video_data)

# Move every tensor under "video" to the device and give it a new leading
# dimension of size 1.
inputs = [pathway.to(device).unsqueeze(0) for pathway in video_data["video"]]


Get Model Prediction

In [None]:
# Pass the input clip through the model. Gradients are not needed here, so
# autograd tracking is disabled to avoid building a graph for the forward pass.
with torch.no_grad():
    logits = model(inputs)
print(logits.shape)

Inspect the top-5 predictions

In [None]:
# Normalise the logits with a softmax over dim 1.
post_act = torch.nn.Softmax(dim=1)
prediction = post_act(logits)

# Indices of the five largest scores in each row.
pred_classes = torch.topk(prediction, k=5).indices

# Translate the first row's class ids into label names.
pred_class_names = [id_to_name[class_id] for class_id in pred_classes[0].tolist()]
print(pred_class_names)