In [25]:
# Auto-reload module to access .py files easily
%load_ext autoreload
%autoreload 2

import os
import sys
import numpy as np
import torch
import torchvision

torchvision.disable_beta_transforms_warning()

import torchvision.transforms.v2 as tv_transforms
    
# Make the project's ../src directory importable so that local modules (such as the
# `transforms` module imported right after this) can be found. The path is appended
# only when it is not already present, so re-running the cell does not duplicate it.
src_path = os.path.abspath("../src/")
if src_path not in sys.path:
    sys.path.append(src_path)

import transforms as my_transforms

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Problem under investigation: different batch sizes currently yield different feature
# extraction results, which is not expected.
#   1. First, check whether the video input tensors are equal across batch sizes.
#   2. Second, check whether the extracted features are equal across batch sizes.
#
# Single example video used by every check in this notebook.
video_ex = dict(
    id="A.Beautiful.Mind.2001__#00-01-45_00-02-50_label_A",
    path="https://huggingface.co/datasets/jherng/xd-violence/resolve/main/data/video/1-1004/A.Beautiful.Mind.2001__%2300-01-45_00-02-50_label_A.mp4",
)

## 1. First, investigate whether the video input tensors are equal across different batch sizes.

In [3]:
# Batched decoding pipeline (batch_size=4). It is applied to the {"id", "path"} example
# dict and iterated in the next cell, where every yielded batch exposes its frames
# under batch["inputs"].
# NOTE(review): num_clips=-1 presumably means "sample every clip in the video" --
# confirm against my_transforms.TemporalClipSample.
video2batched = tv_transforms.Compose(
    [
        my_transforms.AdaptDataFormat(id_key="id", path_key="path"),
        my_transforms.VideoReaderInit(io_backend="http"),
        my_transforms.TemporalClipSample(clip_len=32, sampling_rate=2, num_clips=-1),
        my_transforms.ClipBatching(batch_size=4),
        my_transforms.BatchDecodeIter(),
    ]
)

In [4]:
# Decode the example video batch by batch (batch_size=4 pipeline) and stitch the
# batches back together along the leading (clip) axis, so the result can be compared
# against the other decoding strategies.
# `batched_clips` is bound directly to the final tensor (never to a temporary list),
# and no unused loop index is created.
batched_clips = torch.cat(
    [batch["inputs"] for batch in video2batched(video_ex)],
    dim=0,
)
batched_clips.shape

torch.Size([24, 32, 3, 346, 640])

In [5]:
# Whole-video pipeline: same reader setup and clip sampling as the batched pipeline,
# but it ends in VideoDecode instead of ClipBatching + BatchDecodeIter. Its result is
# indexed directly with ["inputs"] in the next cell (no iteration over batches).
video2full = tv_transforms.Compose(
    [
        my_transforms.AdaptDataFormat(id_key="id", path_key="path"),
        my_transforms.VideoReaderInit(io_backend="http"),
        my_transforms.TemporalClipSample(clip_len=32, sampling_rate=2, num_clips=-1),
        my_transforms.VideoDecode(),
    ]
)

In [6]:
# Decode all sampled clips of the example video in one go (no batching) and keep
# only the frame tensor for comparison.
decoded_full = video2full(video_ex)
full_clips = decoded_full["inputs"]
full_clips.shape

torch.Size([24, 32, 3, 346, 640])

In [7]:
# Batch-size-4 decoding vs. whole-video decoding; tolerances are torch.allclose's
# defaults, written out explicitly.
torch.allclose(batched_clips, full_clips, rtol=1e-05, atol=1e-08)

True

In [10]:
# Second batched decoding pipeline: identical to `video2batched` except that clips are
# grouped with batch_size=8 instead of 4, to test whether the batch size changes the
# decoded tensors.
video2batched2 = tv_transforms.Compose(
    [
        my_transforms.AdaptDataFormat(id_key="id", path_key="path"),
        my_transforms.VideoReaderInit(io_backend="http"),
        my_transforms.TemporalClipSample(clip_len=32, sampling_rate=2, num_clips=-1),
        my_transforms.ClipBatching(batch_size=8),
        my_transforms.BatchDecodeIter(),
    ]
)

In [11]:
# Decode the example video with the batch_size=8 pipeline and concatenate the batches
# along the leading (clip) axis.
# `batched_clips2` is bound directly to the final tensor (never to a temporary list),
# and no unused loop index is created.
batched_clips2 = torch.cat(
    [batch["inputs"] for batch in video2batched2(video_ex)],
    dim=0,
)
batched_clips2.shape

torch.Size([24, 32, 3, 346, 640])

In [12]:
# Batch-size-8 decoding vs. batch-size-4 decoding; tolerances are torch.allclose's
# defaults, written out explicitly.
torch.allclose(batched_clips2, batched_clips, rtol=1e-05, atol=1e-08)

True

In [13]:
# Preprocessing applied to decoded clips before feature extraction. As the shapes
# printed in the following cells show, it turns (24, 32, 3, 346, 640) clips into
# (24, 5, 3, 32, 224, 224): five 224x224 crops per clip, with the channel axis moved
# in front of the frame axis.
# The mean/std values are the commonly used ImageNet channel statistics.
clip_pipe = tv_transforms.Compose(
    [
        my_transforms.Resize(size=256),
        my_transforms.FiveCrop(size=224),
        my_transforms.ToDType(dtype=torch.float32, scale=True),
        my_transforms.Normalize(
            mean=(0.485, 0.456, 0.406),
            std=(0.229, 0.224, 0.225),
        ),
        my_transforms.ConvertTCHWToCTHW(lead_dims=2),
        my_transforms.PackInputs(preserved_meta=[]),
    ]
)

In [15]:
# Decode with the batch_size=4 pipeline, preprocess every batch with `clip_pipe`, and
# concatenate the preprocessed batches along the leading (clip) axis.
# NOTE: this rebinds `batched_clips`, which previously held the raw decoded clips.
# The name is bound directly to the final tensor and no unused loop index is created.
batched_clips = torch.cat(
    [clip_pipe(batch)["inputs"] for batch in video2batched(video_ex)],
    dim=0,
)
batched_clips.shape

torch.Size([24, 5, 3, 32, 224, 224])

In [16]:
# Decode with the batch_size=8 pipeline, preprocess every batch with `clip_pipe`, and
# concatenate the preprocessed batches along the leading (clip) axis.
# NOTE: this rebinds `batched_clips2`, which previously held the raw decoded clips.
# The name is bound directly to the final tensor and no unused loop index is created.
batched_clips2 = torch.cat(
    [clip_pipe(batch)["inputs"] for batch in video2batched2(video_ex)],
    dim=0,
)
batched_clips2.shape

torch.Size([24, 5, 3, 32, 224, 224])

In [22]:
# Whole-video path: decode everything at once, run the same `clip_pipe` preprocessing,
# and keep only the resulting tensor.
# NOTE: this rebinds `full_clips`, which previously held the raw decoded clips.
full_clips = clip_pipe(video2full(video_ex))["inputs"]

In [17]:
# Preprocessed clips: batch size 4 vs. batch size 8 (torch.allclose default tolerances,
# written out explicitly).
torch.allclose(batched_clips, batched_clips2, rtol=1e-05, atol=1e-08)

True

In [23]:
# Preprocessed clips: batch size 4 vs. whole video (torch.allclose default tolerances,
# written out explicitly).
torch.allclose(batched_clips, full_clips, rtol=1e-05, atol=1e-08)

True

In [24]:
# Preprocessed clips: batch size 8 vs. whole video (torch.allclose default tolerances,
# written out explicitly).
torch.allclose(batched_clips2, full_clips, rtol=1e-05, atol=1e-08)

True

The preprocessing pipeline is fine: different batch sizes, or decoding the full video at once, all yield the same preprocessed clip tensors.

## 2. Second, investigate whether the feature extraction results are equal across different batch sizes.

### I3D ResNet50

In [53]:
# Load the saved I3D features of the example video. The three files differ only in
# their suffix (_bs2 / _bs4 / _full) -- presumably extraction with batch size 2,
# batch size 4, and the whole video at once; confirm against the extraction script.
#
# The default root below is a machine-specific absolute path. Set the
# FEATURE_OUTPUT_DIR environment variable to the `data/outputs` directory on your
# machine to run this cell elsewhere.
feat_root = os.environ.get(
    "FEATURE_OUTPUT_DIR",
    "C:/Users/Jia Herng/Documents/Jia Herng's Docs/Final Year Project/inappropriate-video-detection/feature-extractor/data/outputs",
)
feat_stem = f"{feat_root}/i3d_rgb/1-1004/A.Beautiful.Mind.2001__#00-01-45_00-02-50_label_A"

# Each name is bound once, directly to its array (never to a path string first).
feat_bs2, feat_bs4, feat_full = (
    np.load(f"{feat_stem}_{variant}.npy") for variant in ("bs2", "bs4", "full")
)

In [54]:
# Shapes of the three I3D feature arrays, in the order bs2, bs4, full.
tuple(feats.shape for feats in (feat_bs2, feat_bs4, feat_full))

((24, 5, 2048), (24, 5, 2048), (24, 5, 2048))

In [55]:
# I3D features: bs2 vs. bs4, using np.allclose's default tolerances (written out
# explicitly so the strictness of the check is visible).
np.allclose(feat_bs2, feat_bs4, rtol=1e-05, atol=1e-08)

False

In [56]:
# I3D features: bs2 vs. full, using np.allclose's default tolerances (written out
# explicitly so the strictness of the check is visible).
np.allclose(feat_bs2, feat_full, rtol=1e-05, atol=1e-08)

False

In [57]:
# I3D features: bs4 vs. full, using np.allclose's default tolerances (written out
# explicitly so the strictness of the check is visible).
np.allclose(feat_bs4, feat_full, rtol=1e-05, atol=1e-08)

False

In [58]:
# I3D bs2 features averaged over axes 1 and -1, leaving one value per entry along
# axis 0, to eyeball how far the variants differ.
feat_bs2.mean(axis=(1, -1))

array([0.19795631, 0.21573675, 0.21764307, 0.19901808, 0.17502332,
       0.15444714, 0.17563944, 0.16240397, 0.16609856, 0.14471725,
       0.14453971, 0.1539778 , 0.16903806, 0.16243568, 0.1976554 ,
       0.17051291, 0.13260107, 0.17505392, 0.17537233, 0.19241813,
       0.17840326, 0.16508189, 0.19380756, 0.15410993], dtype=float32)

In [59]:
# I3D bs4 features averaged over axes 1 and -1, leaving one value per entry along
# axis 0, to eyeball how far the variants differ.
feat_bs4.mean(axis=(1, -1))

array([0.19796138, 0.21573432, 0.21764112, 0.19902506, 0.1750286 ,
       0.15444554, 0.17565452, 0.16239896, 0.1660938 , 0.1447154 ,
       0.14454141, 0.1539816 , 0.16903618, 0.16243584, 0.19766326,
       0.17051908, 0.1326073 , 0.17505176, 0.1753808 , 0.19242367,
       0.17840007, 0.16507807, 0.19379964, 0.1541156 ], dtype=float32)

In [60]:
# I3D full-video features averaged over axes 1 and -1, leaving one value per entry
# along axis 0, to eyeball how far the variants differ.
feat_full.mean(axis=(1, -1))

array([0.19797131, 0.21573214, 0.21764302, 0.199029  , 0.17502555,
       0.15444723, 0.17565158, 0.16239482, 0.16609915, 0.1447101 ,
       0.1445435 , 0.153988  , 0.16903202, 0.16242996, 0.19766541,
       0.17051853, 0.13261503, 0.17504542, 0.1753847 , 0.1924298 ,
       0.17839812, 0.16507958, 0.19380033, 0.15411422], dtype=float32)

### Video Swin Transformer

In [44]:
# Load the saved Video Swin Transformer features of the example video. The three files
# differ only in their suffix (_bs2 / _bs4 / _full) -- presumably extraction with
# batch size 2, batch size 4, and the whole video at once; confirm against the
# extraction script.
# NOTE: this rebinds feat_bs2 / feat_bs4 / feat_full, which held the I3D features.
#
# The default root below is a machine-specific absolute path. Set the
# FEATURE_OUTPUT_DIR environment variable to the `data/outputs` directory on your
# machine to run this cell elsewhere.
feat_root = os.environ.get(
    "FEATURE_OUTPUT_DIR",
    "C:/Users/Jia Herng/Documents/Jia Herng's Docs/Final Year Project/inappropriate-video-detection/feature-extractor/data/outputs",
)
feat_stem = f"{feat_root}/swin_rgb/1-1004/A.Beautiful.Mind.2001__#00-01-45_00-02-50_label_A"

# Each name is bound once, directly to its array (never to a path string first).
feat_bs2, feat_bs4, feat_full = (
    np.load(f"{feat_stem}_{variant}.npy") for variant in ("bs2", "bs4", "full")
)

In [45]:
# Shapes of the three Swin feature arrays, in the order bs2, bs4, full.
tuple(feats.shape for feats in (feat_bs2, feat_bs4, feat_full))

((24, 5, 768), (24, 5, 768), (24, 5, 768))

In [46]:
# Swin features: bs2 vs. bs4, using np.allclose's default tolerances (written out
# explicitly so the strictness of the check is visible).
np.allclose(feat_bs2, feat_bs4, rtol=1e-05, atol=1e-08)

False

In [47]:
# Swin features: bs2 vs. full, using np.allclose's default tolerances (written out
# explicitly so the strictness of the check is visible).
np.allclose(feat_bs2, feat_full, rtol=1e-05, atol=1e-08)

False

In [48]:
# Swin features: bs4 vs. full, using np.allclose's default tolerances (written out
# explicitly so the strictness of the check is visible).
np.allclose(feat_bs4, feat_full, rtol=1e-05, atol=1e-08)

False

In [49]:
# Swin bs2 features averaged over axes 1 and -1, leaving one value per entry along
# axis 0, to eyeball how far the variants differ.
feat_bs2.mean(axis=(1, -1))

array([-6.0272525e-04, -8.3369197e-04, -1.1103182e-03, -2.3060443e-03,
        5.6516490e-04,  1.6528585e-03,  9.5496030e-04,  1.0848107e-03,
        4.3409329e-04,  1.4218927e-03,  1.9406013e-03,  1.8147645e-03,
        1.7271914e-03,  1.6759173e-03, -6.1772135e-03, -5.9339139e-03,
        4.7014232e-04, -1.9616617e-03, -9.9494867e-04, -1.4083986e-03,
       -9.2717784e-04, -1.6547306e-03, -3.1557010e-04,  8.8795396e-06],
      dtype=float32)

In [50]:
# Swin bs4 features averaged over axes 1 and -1, leaving one value per entry along
# axis 0, to eyeball how far the variants differ.
feat_bs4.mean(axis=(1, -1))

array([-6.0272490e-04, -8.3369302e-04, -1.1103185e-03, -2.3060446e-03,
        5.6516513e-04,  1.6528572e-03,  9.5496158e-04,  1.0848098e-03,
        4.3409417e-04,  1.4218924e-03,  1.9406017e-03,  1.8147627e-03,
        1.7271928e-03,  1.6759166e-03, -6.1772121e-03, -5.9339129e-03,
        4.7014357e-04, -1.9616624e-03, -9.9494832e-04, -1.4083990e-03,
       -9.2717691e-04, -1.6547312e-03, -3.1556984e-04,  8.8777706e-06],
      dtype=float32)

In [51]:
# Swin full-video features averaged over axes 1 and -1, leaving one value per entry
# along axis 0, to eyeball how far the variants differ.
feat_full.mean(axis=(1, -1))

array([-6.0272438e-04, -8.3369290e-04, -1.1103185e-03, -2.3060450e-03,
        5.6516536e-04,  1.6528580e-03,  9.5496140e-04,  1.0848118e-03,
        4.3409373e-04,  1.4218917e-03,  1.9406022e-03,  1.8147638e-03,
        1.7271922e-03,  1.6759170e-03, -6.1772135e-03, -5.9339125e-03,
        4.7014226e-04, -1.9616617e-03, -9.9494832e-04, -1.4084001e-03,
       -9.2717673e-04, -1.6547305e-03, -3.1556922e-04,  8.8783290e-06],
      dtype=float32)

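Could batch normalization at inference time be the cause (e.g. does it divide by the batch size)?
- No, it does not explain the slight differences in the computed feature values. The batch mean and batch variance are estimated at training time and are used in the same way at test time, independent of the inference batch size.
- Hypothesis: the differences come from inherent numerical non-determinism in the network's computation, so we cannot expect it to produce exactly the same features even for the same input. For example, floating-point results can depend on the order in which values are accumulated, which may change with the batch size. The per-clip means above agree to roughly four or more significant digits, and discrepancies of that size can exceed the default tolerances of `np.allclose` (`rtol=1e-5`, `atol=1e-8`).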