In [1]:
import importlib
import itertools
from typing import Callable, Literal, Set, Tuple
from urllib.parse import urlsplit

import datasets
import torch
import torch.nn as nn
import torchvision
torchvision.disable_beta_transforms_warning()
import torchvision.transforms.v2 as tv_transforms
import torchvision.transforms.v2.functional as F

In [2]:
# Auto-reload imported modules before each cell runs, so edits to the local .py files are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

import os
import sys

# Put the sibling ../src directory on the import path (only once, so re-running
# this cell does not keep appending duplicates).
src_path = os.path.abspath("../src/")
if src_path not in sys.path:
    sys.path.append(src_path)

import transforms as custom_transforms

In [3]:
def init_hf_dataset(
    hf_dataset_name: str, progress: Set[str]
) -> Tuple[datasets.IterableDataset, Callable[[str], str]]:
    """
    Initialize HuggingFace dataset (both train and test splits) and filter out videos that have already been processed.
    Note: Currently only supports streaming huggingface datasets but not non-streaming huggingface dataset.

    Args:
        hf_dataset_name: Name of the dataset on the HuggingFace hub.
            Currently only "jherng/xd-violence" is supported.
        progress: Keys of the videos that were already processed. Each key has
            the form "<relative_dir>/<id>", where <relative_dir> is what
            `extract_relative_dir` returns for the example's "path" and <id>
            is the example's "id".

    Returns:
        A tuple `(combined_ds, extract_relative_dir)`:
        - combined_ds: the train split followed by the test split, streamed,
          without the annotation columns and without the examples whose key
          is in `progress`.
        - extract_relative_dir: the helper that maps an example's full file
          URL to its relative directory (used to build the progress keys).

    Raises:
        ValueError: If `hf_dataset_name` is not a supported dataset.
    """

    # Fail fast on unsupported datasets before any loading is attempted.
    if hf_dataset_name != "jherng/xd-violence":
        raise ValueError(
            f"Dataset {hf_dataset_name} not supported. Currently only supports ['jherng/xd-violence']."
        )

    def extract_relative_dir(full_filepath: str) -> str:
        """Return the directory part of the video URL relative to the dataset's video root.

        Assumes there's always a subdir in the path at 2nd last position,
        e.g., "1-1004" from
        https://huggingface.co/datasets/.../1-1004/A.Beautiful.Mind.2001__%2300-01-45_00-02-50_label_A.mp4
        """
        data_url = "/datasets/jherng/xd-violence/resolve/main/data/video"
        path_after_root = urlsplit(full_filepath).path.split(data_url)[-1]
        # Drop the trailing file name, keep only the directory components.
        return "/".join(path_after_root.lstrip("/").split("/")[:-1])

    # Columns that are not needed for preprocessing.
    unused_columns = ("binary_target", "multilabel_targets", "frame_annotations")

    # Load both splits the same way; order (train, then test) is preserved.
    split_datasets = [
        datasets.load_dataset(
            hf_dataset_name, name="video", split=split_name, streaming=True
        ).map(remove_columns=list(unused_columns))
        for split_name in ("train", "test")
    ]

    # Concatenate train and test datasets
    combined_ds = datasets.concatenate_datasets(split_datasets)

    def is_unprocessed(example) -> bool:
        """True when the example's "<relative_dir>/<id>" key is not in `progress`."""
        key = "/".join([extract_relative_dir(example["path"]), example["id"]])
        return key not in progress

    # Filter out videos that have already been processed
    combined_ds = combined_ds.filter(is_unprocessed)

    return combined_ds, extract_relative_dir

In [4]:
# Build the streamed train+test dataset; an empty progress set means no video
# is skipped. The helper returned alongside the dataset is not needed here.
hf_dataset, _ = init_hf_dataset(
    hf_dataset_name="jherng/xd-violence",
    progress=set(),
)

hf_dataset

<datasets.iterable_dataset.IterableDataset at 0x279c9cd7b20>

In [20]:
# The module name contains hyphens, so it cannot be loaded with a plain
# `import` statement; resolve it by its dotted name instead.
i3d = importlib.import_module(
    name="models.i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb"
)

In [21]:
# Stage 1: turn one dataset example into batches of sampled clips.
video2clips = i3d.build_video2clips_pipeline(
    batch_size=5,
    io_backend="http",
    id_key="id",
    path_key="path",
    num_clips=-1,
)

# Stage 2: per-batch preprocessing of the clips.
clip_preprocessing = i3d.build_clip_pipeline(crop_type="5-crop")

# Show both pipelines, one after the other.
print(video2clips, clip_preprocessing, sep="\n")

Compose(
      AdaptDataFormat(id_key=id, path_key=path)
      VideoReaderInit(io_backend=http)
      TemporalClipSample(clip_len=32, num_clips=-1, sampling_rate=2)
      ClipsBatching(batch_size=5)
)
Compose(
      VideoDecode()
      Resize(size=256)
      FiveCrop(size=224)
      ToDType(dtype=torch.float32, scale=True)
      Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
      ConvertTCHWToCTHW(lead_dims=2)
      PackInputs(preserved_meta=['id', 'filename', 'batch_id'])
)


In [22]:
# Smoke-test both pipelines on the first few streamed videos.
NUM_PREVIEW_VIDEOS = 10

# islice stops after NUM_PREVIEW_VIDEOS examples, so no extra example is pulled
# from the streaming dataset just to hit a `break`.
for i, video_ex in enumerate(itertools.islice(hf_dataset, NUM_PREVIEW_VIDEOS)):
    print(i)
    # One video example yields several batches of clips.
    batches = video2clips(video_ex)
    for batch_idx, batch in enumerate(batches):
        batch = clip_preprocessing(batch)
        # Print the batch metadata and the size of the preprocessed input tensor.
        print(batch_idx, batch["meta"], batch["inputs"].size())
    print()

0
0 {'id': 'A.Beautiful.Mind.2001__#00-01-45_00-02-50_label_A', 'filename': 'https://huggingface.co/datasets/jherng/xd-violence/resolve/main/data/video/1-1004/A.Beautiful.Mind.2001__%2300-01-45_00-02-50_label_A.mp4', 'batch_id': 0} torch.Size([5, 5, 3, 32, 224, 224])
1 {'id': 'A.Beautiful.Mind.2001__#00-01-45_00-02-50_label_A', 'filename': 'https://huggingface.co/datasets/jherng/xd-violence/resolve/main/data/video/1-1004/A.Beautiful.Mind.2001__%2300-01-45_00-02-50_label_A.mp4', 'batch_id': 1} torch.Size([5, 5, 3, 32, 224, 224])
2 {'id': 'A.Beautiful.Mind.2001__#00-01-45_00-02-50_label_A', 'filename': 'https://huggingface.co/datasets/jherng/xd-violence/resolve/main/data/video/1-1004/A.Beautiful.Mind.2001__%2300-01-45_00-02-50_label_A.mp4', 'batch_id': 2} torch.Size([5, 5, 3, 32, 224, 224])
3 {'id': 'A.Beautiful.Mind.2001__#00-01-45_00-02-50_label_A', 'filename': 'https://huggingface.co/datasets/jherng/xd-violence/resolve/main/data/video/1-1004/A.Beautiful.Mind.2001__%2300-01-45_00-02-50