<a href="https://colab.research.google.com/github/karthik111/video_anomaly_detection/blob/master/notebooks/facebookresearch_pytorchvideo_x3d.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# X3D

*Author: FAIR PyTorchVideo*

**X3D networks pretrained on the Kinetics 400 dataset**


### Example Usage

#### Imports

Load the model:

In [4]:
pip install -U git+https://github.com/facebookresearch/fvcore.git

Collecting git+https://github.com/facebookresearch/fvcore.git
  Cloning https://github.com/facebookresearch/fvcore.git to /tmp/pip-req-build-k87gvghm
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/fvcore.git /tmp/pip-req-build-k87gvghm
  Resolved https://github.com/facebookresearch/fvcore.git to commit e8d19df2ffdaf0a78c8d88d6a2522c36b0cacb07
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting yacs>=0.1.6 (from fvcore==0.1.6)
  Downloading yacs-0.1.8-py3-none-any.whl (14 kB)
Collecting iopath>=0.1.7 (from fvcore==0.1.6)
  Downloading iopath-0.1.10.tar.gz (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m834.7 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting portalocker (from iopath>=0.1.7->fvcore==0.1.6)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Building wheels for collected packages: fvcore, iopath
  Building wheel for fv

In [24]:
import torch
# Choose the `x3d_s` model
model_name = 'x3d_s'
model = torch.hub.load('facebookresearch/pytorchvideo', model_name, pretrained=True)

Using cache found in /root/.cache/torch/hub/facebookresearch_pytorchvideo_main


Import remaining functions:

In [18]:
#model_name = "i3d_r50"
#model = torch.hub.load("facebookresearch/pytorchvideo", model=model_name, pretrained=True)

Using cache found in /root/.cache/torch/hub/facebookresearch_pytorchvideo_main
Downloading: "https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/I3D_8x8_R50.pyth" to /root/.cache/torch/hub/checkpoints/I3D_8x8_R50.pyth
100%|██████████| 214M/214M [00:00<00:00, 229MB/s]


In [9]:
!pip install av

Collecting av
  Downloading av-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (32.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: av
Successfully installed av-11.0.0


In [10]:
import json
import urllib
from pytorchvideo.data.encoded_video import EncodedVideo

from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample
)



#### Setup

Set the model to eval mode and move to desired device.

In [11]:
# Set to GPU or CPU
device = "cpu"
model = model.eval()
model = model.to(device)

Download the id to label mapping for the Kinetics 400 dataset on which the torch hub models were trained. This will be used to get the category label names from the predicted class ids.

In [12]:
json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
json_filename = "kinetics_classnames.json"
try: urllib.URLopener().retrieve(json_url, json_filename)
except: urllib.request.urlretrieve(json_url, json_filename)

In [13]:
with open(json_filename, "r") as f:
    kinetics_classnames = json.load(f)

# Create an id to label name mapping
kinetics_id_to_classname = {}
for k, v in kinetics_classnames.items():
    kinetics_id_to_classname[v] = str(k).replace('"', "")

#### Define input transform

In [14]:
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
frames_per_second = 30
model_transform_params  = {
    "x3d_xs": {
        "side_size": 182,
        "crop_size": 182,
        "num_frames": 4,
        "sampling_rate": 12,
    },
    "x3d_s": {
        "side_size": 182,
        "crop_size": 182,
        "num_frames": 13,
        "sampling_rate": 6,
    },
    "x3d_m": {
        "side_size": 256,
        "crop_size": 256,
        "num_frames": 16,
        "sampling_rate": 5,
    }
}

# Get transform parameters based on model
transform_params = model_transform_params[model_name]

# Note that this transform is specific to the slow_R50 model.
transform =  ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            UniformTemporalSubsample(transform_params["num_frames"]),
            Lambda(lambda x: x/255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(size=transform_params["side_size"]),
            CenterCropVideo(
                crop_size=(transform_params["crop_size"], transform_params["crop_size"])
            )
        ]
    ),
)

# The duration of the input clip is also specific to the model.
clip_duration = (transform_params["num_frames"] * transform_params["sampling_rate"])/frames_per_second

#### Run Inference

Download an example video.

In [15]:
url_link = "https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4"
video_path = 'archery.mp4'
try: urllib.URLopener().retrieve(url_link, video_path)
except: urllib.request.urlretrieve(url_link, video_path)

Load the video and transform it to the input format required by the model.

In [16]:
# Select the duration of the clip to load by specifying the start and end duration
# The start_sec should correspond to where the action occurs in the video
start_sec = 0
end_sec = start_sec + clip_duration

# Initialize an EncodedVideo helper class and load the video
video = EncodedVideo.from_path(video_path)

# Load the desired clip
video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

# Apply a transform to normalize the video input
video_data = transform(video_data)

# Move the inputs to the desired device
inputs = video_data["video"]
inputs = inputs.to(device)

#### Get Predictions

In [21]:
# Pass the input clip through the model
preds = model(inputs[None, ...])

# Get the predicted classes
post_act = torch.nn.Softmax(dim=1)
preds = post_act(preds)
pred_classes = preds.topk(k=5).indices[0]

# Map the predicted classes to the label names
pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
print("Top 5 predicted labels: %s" % ", ".join(pred_class_names))

Top 5 predicted labels: archery, golf driving, opening bottle, golf chipping, throwing axe


### Model Description
X3D model architectures are based on [1] pretrained on the Kinetics dataset.

| arch | depth | frame length x sample rate | top 1 | top 5 | Flops (G) | Params (M) |
| --------------- | ----------- | ----------- | ----------- | ----------- | ----------- |  ----------- | ----------- |
| X3D      | XS    | 4x12                       | 69.12 | 88.63 | 0.91      | 3.79     |
| X3D      | S     | 13x6                       | 73.33 | 91.27 | 2.96      | 3.79     |
| X3D      | M     | 16x5                       | 75.94 | 92.72 | 6.72      | 3.79     |


### References
[1] Christoph Feichtenhofer, "X3D: Expanding Architectures for
    Efficient Video Recognition." https://arxiv.org/abs/2004.04730

In [75]:
import torch

# Define a hook function to capture the output of a specific layer
def hook_fn(module, input, output):
    # Store the output tensor in a global variable
    global output_from_5th_layer
    global input_to_6th_layer
    output_from_5th_layer = output
    input_to_6th_layer = input

# Assuming 'model' is your defined network
#model = model.double()

# Register the hook to the desired layer (5th layer in this case)
target_layer = 5  # Index of the 6th layer
hook = model.blocks[target_layer].register_forward_hook(hook_fn)

# Set the model to evaluation mode
model.eval()

# Pass the input through the network
output = model(inputs[None, ...])

# The output_from_5th_layer variable now contains the features from the 5th layer
# You can use this tensor for further analysis or processing

# Remove the hook to avoid potential memory leaks
hook.remove()

In [35]:
output_from_5th_layer.shape

torch.Size([1, 192, 13, 6, 6])

In [82]:
input_to_6th_layer[0].shape

torch.Size([1, 192, 13, 6, 6])

In [28]:
192*13*6*6

89856

In [36]:
for name, layer in model._modules.items():
  print(f"Name: {name}, Layer: {layer}")

Name: blocks, Layer: ModuleList(
  (0): ResNetBasicStem(
    (conv): Conv2plus1d(
      (conv_t): Conv3d(3, 24, kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), bias=False)
      (conv_xy): Conv3d(24, 24, kernel_size=(5, 1, 1), stride=(1, 1, 1), padding=(2, 0, 0), groups=24, bias=False)
    )
    (norm): BatchNorm3d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (activation): ReLU()
  )
  (1): ResStage(
    (res_blocks): ModuleList(
      (0): ResBlock(
        (branch1_conv): Conv3d(24, 24, kernel_size=(1, 1, 1), stride=(1, 2, 2), bias=False)
        (branch2): BottleneckBlock(
          (conv_a): Conv3d(24, 54, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
          (norm_a): BatchNorm3d(54, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act_a): ReLU()
          (conv_b): Conv3d(54, 54, kernel_size=(3, 3, 3), stride=(1, 2, 2), padding=(1, 1, 1), groups=54, bias=False)
          (norm_b): Sequential(
            (0

In [85]:
block5 = model.blocks[5]


In [86]:
modules = block5.named_modules()

In [87]:
for name, module in modules:
  print(f"Name: {name}, Module: {module}")
  if name == 'proj':
    print(f'adding hook to Name: {name}, Module: {module}')
    module.register_forward_hook(hook_fn)

Name: , Module: ResNetBasicHead(
  (pool): ProjectedPool(
    (pre_conv): Conv3d(192, 432, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
    (pre_norm): BatchNorm3d(432, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (pre_act): ReLU()
    (pool): AvgPool3d(kernel_size=(13, 5, 5), stride=1, padding=0)
    (post_conv): Conv3d(432, 2048, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
    (post_act): ReLU()
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (proj): Linear(in_features=2048, out_features=400, bias=True)
  (output_pool): AdaptiveAvgPool3d(output_size=1)
)
Name: pool, Module: ProjectedPool(
  (pre_conv): Conv3d(192, 432, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
  (pre_norm): BatchNorm3d(432, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pre_act): ReLU()
  (pool): AvgPool3d(kernel_size=(13, 5, 5), stride=1, padding=0)
  (post_conv): Conv3d(432, 2048, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
  (post_act

In [99]:
# Set the model to evaluation mode
model.eval()

# Pass the input through the network
output = model(inputs[None, ...])

# The output_from_5th_layer variable now contains the features from the 5th layer
# You can use this tensor for further analysis or processing

# Remove the hook to avoid potential memory leaks
hook.remove()

In [100]:
output_from_5th_layer.shape

torch.Size([1, 1, 2, 2, 400])

In [105]:
input_to_6th_layer[0].shape

torch.Size([1, 1, 2, 2, 2048])

In [104]:
import torch.nn as nn
# Define AdaptiveAvgPool3d layer with output size [1, 1, 1]
adaptive_pool = nn.AdaptiveAvgPool3d(output_size=(2048))

# Apply adaptive pooling to the input tensor
output = adaptive_pool(input_to_6th_layer[0])

# Output shape after adaptive average pooling
print(output.shape)

torch.Size([1, 1, 2048, 2048, 2048])
