In [1]:
!pip -q install decord==0.6.0
!apt-get -qq update
!apt-get -qq install -y unrar

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/13.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m7.9/13.6 MB[0m [31m236.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m13.6/13.6 MB[0m [31m230.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m130.5 MB/s[0m eta [36m0:00:00[0m
[?25hW: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


In [2]:
%%bash
set -u  # don't exit on error; we handle errors ourselves

ROOT="/content/ucf101"
mkdir -p "$ROOT"
cd "$ROOT"

echo "Installing tools..."
apt-get -qq update
apt-get -qq install -y curl unzip wget unrar > /dev/null

download_with_curl () {
  URL="$1"
  OUT="$2"
  echo "Downloading: $OUT"
  rm -f "$OUT.part" "$OUT"
  # -L follow redirects, --fail fail on HTTP errors, retry on transient issues
  curl -L --fail --retry 8 --retry-delay 2 --connect-timeout 20 --max-time 0 \
    -o "$OUT.part" "$URL"
  mv "$OUT.part" "$OUT"
  echo "Downloaded $(ls -lh "$OUT" | awk '{print $5}') -> $OUT"
  return 0
}

is_valid_zip () {
  FILE="$1"
  unzip -tq "$FILE" >/dev/null 2>&1
}

# ---------------------------
# 1) Try Hugging Face mirror
# ---------------------------
HF_VID="https://huggingface.co/datasets/quchenyuan/UCF101-ZIP/resolve/main/UCF-101.zip?download=true"
HF_SPL="https://huggingface.co/datasets/quchenyuan/UCF101-ZIP/resolve/main/UCF101TrainTestSplits-RecognitionTask.zip?download=true"

OK_VID=0
OK_SPL=0

echo "=== Attempt 1: HuggingFace ==="
download_with_curl "$HF_VID" "UCF-101.zip" && is_valid_zip "UCF-101.zip" && OK_VID=1 || OK_VID=0
download_with_curl "$HF_SPL" "splits.zip"  && is_valid_zip "splits.zip"  && OK_SPL=1 || OK_SPL=0

# If HF video zip is invalid, show what it actually is (often HTML)
if [ "$OK_VID" -ne 1 ] && [ -f "UCF-101.zip" ]; then
  echo "HF video file not a valid zip. 'file' says:"
  file "UCF-101.zip" || true
fi
if [ "$OK_SPL" -ne 1 ] && [ -f "splits.zip" ]; then
  echo "HF splits file not a valid zip. 'file' says:"
  file "splits.zip" || true
fi

# ---------------------------
# 2) Fallback to official CRCV
# ---------------------------
CRCV_VID="http://www.crcv.ucf.edu/data/UCF101/UCF101.rar"
CRCV_SPL="http://www.crcv.ucf.edu/data/UCF101/UCF101TrainTestSplits-RecognitionTask.zip"

if [ "$OK_VID" -ne 1 ]; then
  echo "=== Attempt 2: CRCV (official) video rar ==="
  rm -f UCF101.rar
  # wget tends to work on http CRCV
  wget -O UCF101.rar "$CRCV_VID"
  # quick sanity: list archive
  unrar t UCF101.rar >/dev/null 2>&1 && OK_VID=2 || OK_VID=0
fi

if [ "$OK_SPL" -ne 1 ]; then
  echo "=== Attempt 2: CRCV (official) splits zip ==="
  wget -O splits.zip "$CRCV_SPL"
  is_valid_zip "splits.zip" && OK_SPL=2 || OK_SPL=0
fi

# ---------------------------
# 3) Extract
# ---------------------------
mkdir -p videos
mkdir -p splits

if [ "$OK_VID" -eq 1 ]; then
  echo "Extracting HF zip..."
  unzip -q -o "UCF-101.zip" -d "videos"
elif [ "$OK_VID" -eq 2 ]; then
  echo "Extracting CRCV rar..."
  unrar x -idq -o+ "UCF101.rar" "videos/"
else
  echo "ERROR: Could not download a valid UCF101 video archive from HF or CRCV."
  echo "Try: Runtime -> Disconnect and delete runtime, then run again."
  exit 1
fi

if [ "$OK_SPL" -ge 1 ]; then
  echo "Extracting splits..."
  unzip -q -o "splits.zip" -d "splits"
else
  echo "ERROR: Could not download valid splits.zip."
  exit 1
fi

echo ""
echo "=== DONE extracting ==="
echo "Root: $ROOT"
echo "Listing key folders:"
ls -lah "$ROOT" | head -n 30

echo ""
echo "Counting videos (.avi):"
find "$ROOT/videos" -type f -name "*.avi" | wc -l

echo ""
echo "Split files:"
ls -lah "$ROOT/splits/ucfTrainTestlist" | head -n 20

Installing tools...
=== Attempt 1: HuggingFace ===
Downloading: UCF-101.zip
Downloaded 6.5G -> UCF-101.zip
Downloading: splits.zip
Downloaded 112K -> splits.zip
Extracting HF zip...
Extracting splits...

=== DONE extracting ===
Root: /content/ucf101
Listing key folders:
total 6.5G
drwxr-xr-x 4 root root 4.0K Jan 12 20:04 .
drwxr-xr-x 1 root root 4.0K Jan 12 20:01 ..
drwxr-xr-x 3 root root 4.0K Jan 12 20:05 splits
-rw-r--r-- 1 root root 112K Jan 12 20:04 splits.zip
-rw-r--r-- 1 root root 6.5G Jan 12 20:03 UCF-101.zip
drwxr-xr-x 3 root root 4.0K Jan 12 20:04 videos

Counting videos (.avi):
13320

Split files:
total 1.6M
drwxr-xr-x 2 root root 4.0K Jul 21  2013 .
drwxr-xr-x 3 root root 4.0K Jan 12 20:05 ..
-rw-r--r-- 1 root root 1.6K Jul  4  2013 classInd.txt
-rw-r--r-- 1 root root 143K Jul 21  2013 testlist01.txt
-rw-r--r-- 1 root root 141K Jul 21  2013 testlist02.txt
-rw-r--r-- 1 root root 140K Jul 21  2013 testlist03.txt
-rw-r--r-- 1 root root 386K Jul 21  2013 trainlist01.txt
-rw-r--r

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  1082  100  1082    0     0   3013      0 --:--:-- --:--:-- --:--:--  3022
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0 6635M    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0  0 6635M    0 22.4M    0     0  9283k      0  0:12:11  0:00:02  0:12:09 11.7M  1 6635M    1  116M    0     0  33.2M      0  0:03:19  0:00:03  0:03:16 39.6M  2 6635M    2  192M    0     0  42.9M      0  0:02:34  0:00:04  0:02:30 49.1M  4 6635M    4  307M    0     0  56.1M      0  0:01:58  0:00:05  0:01:53 62.5M  6 6635M    6  400

In [3]:
import os, glob

ROOT = "/content/ucf101"

# Splits dir (this should exist if extraction worked)
SPLITS_DIR = os.path.join(ROOT, "splits", "ucfTrainTestlist")
print("SPLITS_DIR exists:", os.path.isdir(SPLITS_DIR), SPLITS_DIR)

# Try common video roots
candidates = [
    os.path.join(ROOT, "videos", "UCF101"),
    os.path.join(ROOT, "videos", "UCF-101"),
    os.path.join(ROOT, "videos", "UCF101", "UCF101"),
    os.path.join(ROOT, "videos", "UCF-101", "UCF-101"),
    os.path.join(ROOT, "videos", "videos", "UCF101"),
    os.path.join(ROOT, "videos", "videos", "UCF-101"),
]

VIDEOS_ROOT = None
for p in candidates:
    if os.path.isdir(p) and len(glob.glob(os.path.join(p, "*", "*.avi"))) > 0:
        VIDEOS_ROOT = p
        break

print("VIDEOS_ROOT:", VIDEOS_ROOT)
print("Example videos:", glob.glob(os.path.join(VIDEOS_ROOT, "*", "*.avi"))[:3] if VIDEOS_ROOT else None)

SPLITS_DIR exists: True /content/ucf101/splits/ucfTrainTestlist
VIDEOS_ROOT: /content/ucf101/videos/UCF-101
Example videos: ['/content/ucf101/videos/UCF-101/JugglingBalls/v_JugglingBalls_g20_c03.avi', '/content/ucf101/videos/UCF-101/JugglingBalls/v_JugglingBalls_g14_c05.avi', '/content/ucf101/videos/UCF-101/JugglingBalls/v_JugglingBalls_g25_c02.avi']


In [4]:
import os, random, math, glob
import numpy as np
import torch

ROOT = "/content/ucf101"
SPLITS_DIR  = os.path.join(ROOT, "splits", "ucfTrainTestlist")

# Auto-detect VIDEOS_ROOT (works for different unzip/extract layouts)
candidates = [
    os.path.join(ROOT, "videos", "UCF101"),
    os.path.join(ROOT, "videos", "UCF-101"),
    os.path.join(ROOT, "videos", "UCF101", "UCF101"),
    os.path.join(ROOT, "videos", "UCF-101", "UCF-101"),
    os.path.join(ROOT, "videos", "videos", "UCF101"),
    os.path.join(ROOT, "videos", "videos", "UCF-101"),
]

VIDEOS_ROOT = None
for p in candidates:
    if os.path.isdir(p) and len(glob.glob(os.path.join(p, "*", "*.avi"))) > 0:
        VIDEOS_ROOT = p
        break

if VIDEOS_ROOT is None:
    raise FileNotFoundError("Could not find UCF101 videos folder. Check /content/ucf101/videos/")

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device, torch.cuda.get_device_name(0) if device == "cuda" else "")
print("VIDEOS_ROOT:", VIDEOS_ROOT)
print("SPLITS_DIR:", SPLITS_DIR)

def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

seed_everything(42)

# --- Choose ONE preset ---
PRESET = "STRONG"   # "FAST" or "STRONG"

if PRESET == "FAST":
    CFG = dict(
        frames=16, stride=2, size=112,
        batch=64, workers=4,
        epochs=60, lr=0.2, warmup_epochs=5,
        weight_decay=1e-4, label_smoothing=0.1,
        grad_accum=1,
        nclips_eval=10
    )
else:  # STRONG
    CFG = dict(
        frames=32, stride=2, size=160,
        batch=24, workers=4,
        epochs=100, lr=0.12, warmup_epochs=5,
        weight_decay=1e-4, label_smoothing=0.1,
        grad_accum=1,   # set to 2 if you hit OOM
        nclips_eval=10
    )

CFG

Device: cuda NVIDIA A100-SXM4-80GB
VIDEOS_ROOT: /content/ucf101/videos/UCF-101
SPLITS_DIR: /content/ucf101/splits/ucfTrainTestlist


{'frames': 32,
 'stride': 2,
 'size': 160,
 'batch': 24,
 'workers': 4,
 'epochs': 100,
 'lr': 0.12,
 'warmup_epochs': 5,
 'weight_decay': 0.0001,
 'label_smoothing': 0.1,
 'grad_accum': 1,
 'nclips_eval': 10}

In [5]:
from typing import Dict, List, Tuple

def read_class_index(path: str) -> Dict[str, int]:
    # classInd.txt: "1 ApplyEyeMakeup"
    mapping = {}
    with open(path, "r") as f:
        for line in f:
            idx, name = line.strip().split()
            mapping[name] = int(idx) - 1
    return mapping

CLASS_TO_IDX = read_class_index(os.path.join(SPLITS_DIR, "classInd.txt"))
IDX_TO_CLASS = {v:k for k,v in CLASS_TO_IDX.items()}

def parse_train_list(path: str) -> List[Tuple[str,int]]:
    # trainlist01.txt: "ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c01.avi 1"
    items = []
    with open(path, "r") as f:
        for line in f:
            rel, cls_idx_1based = line.strip().split()
            cls = int(cls_idx_1based) - 1
            items.append((rel, cls))
    return items

def parse_test_list(path: str) -> List[Tuple[str,int]]:
    # testlist01.txt: "ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c01.avi"
    items = []
    with open(path, "r") as f:
        for line in f:
            rel = line.strip()
            cls_name = rel.split("/")[0]
            cls = CLASS_TO_IDX[cls_name]
            items.append((rel, cls))
    return items

train_items = parse_train_list(os.path.join(SPLITS_DIR, "trainlist01.txt"))
test_items  = parse_test_list(os.path.join(SPLITS_DIR, "testlist01.txt"))

print("Train videos:", len(train_items), "Test videos:", len(test_items))
print("Example:", train_items[0])

Train videos: 9537 Test videos: 3783
Example: ('ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c01.avi', 0)


In [6]:
# --- Dataset + Dataloader block (corrected normalization broadcasting) ---

from dataclasses import dataclass
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from decord import VideoReader, cpu

@dataclass
class ClipCfg:
    frames: int
    stride: int
    size: int
    min_scale: float = 0.6
    max_scale: float = 1.0
    hflip_p: float = 0.5

# IMPORTANT: these shapes are for x shaped (T, C, H, W)
MEAN = torch.tensor([0.5, 0.5, 0.5]).view(1, 3, 1, 1)
STD  = torch.tensor([0.5, 0.5, 0.5]).view(1, 3, 1, 1)

def load_clip(video_path: str, frames: int, stride: int, start: int) -> np.ndarray:
    """
    Returns: (T, H, W, 3) uint8
    """
    vr = VideoReader(video_path, ctx=cpu(0))
    n = len(vr)
    span = (frames - 1) * stride + 1

    if n <= 0:
        raise RuntimeError(f"Empty video: {video_path}")

    if n >= span:
        start = min(max(start, 0), n - span)
        idxs = [start + i * stride for i in range(frames)]
    else:
        idxs = list(range(0, n, stride))
        if len(idxs) == 0:
            idxs = [0]
        while len(idxs) < frames:
            idxs.append(idxs[-1])
        idxs = idxs[:frames]

    return vr.get_batch(idxs).asnumpy()

def clip_augment(frames_np: np.ndarray, cfg: ClipCfg, train: bool) -> torch.Tensor:
    """
    frames_np: (T, H, W, 3) uint8
    returns:   (C, T, cfg.size, cfg.size) float32 normalized
    """
    x = torch.from_numpy(frames_np).permute(0, 3, 1, 2).float() / 255.0  # (T,C,H,W)

    if train:
        # Consistent random resized crop across the whole clip
        T, C, H, W = x.shape
        scale = random.uniform(cfg.min_scale, cfg.max_scale)
        new_h = max(int(H * scale), cfg.size)
        new_w = max(int(W * scale), cfg.size)

        top = random.randint(0, max(0, H - new_h))
        left = random.randint(0, max(0, W - new_w))

        x = x[:, :, top:top+new_h, left:left+new_w]
        x = F.interpolate(x, size=(cfg.size, cfg.size), mode="bilinear", align_corners=False)

        if random.random() < cfg.hflip_p:
            x = torch.flip(x, dims=[3])  # flip width
    else:
        x = F.interpolate(x, size=(cfg.size, cfg.size), mode="bilinear", align_corners=False)

    # ✅ Normalize while x is (T,C,H,W) so MEAN/STD broadcast correctly
    x = (x - MEAN) / STD

    # Return (C,T,H,W)
    x = x.permute(1, 0, 2, 3).contiguous()
    return x

class UCF101Clips(Dataset):
    def __init__(self, items, videos_root, clip_cfg: ClipCfg, train: bool):
        self.items = items
        self.videos_root = videos_root
        self.cfg = clip_cfg
        self.train = train

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        rel, label = self.items[idx]
        path = os.path.join(self.videos_root, rel)
        if not os.path.exists(path):
            raise FileNotFoundError(f"Missing video: {path}")

        start = random.randint(0, 10_000_000) if self.train else 0
        frames = load_clip(path, self.cfg.frames, self.cfg.stride, start)
        clip = clip_augment(frames, self.cfg, train=self.train)
        return clip, label

clip_cfg = ClipCfg(frames=CFG["frames"], stride=CFG["stride"], size=CFG["size"])

train_ds = UCF101Clips(train_items, VIDEOS_ROOT, clip_cfg, train=True)
test_ds  = UCF101Clips(test_items,  VIDEOS_ROOT, clip_cfg, train=False)

train_loader = DataLoader(
    train_ds,
    batch_size=CFG["batch"],
    shuffle=True,
    num_workers=CFG["workers"],
    pin_memory=True,
    persistent_workers=(CFG["workers"] > 0),
)
test_loader = DataLoader(
    test_ds,
    batch_size=CFG["batch"],
    shuffle=False,
    num_workers=CFG["workers"],
    pin_memory=True,
    persistent_workers=(CFG["workers"] > 0),
)

# Sanity check
batch = next(iter(train_loader))
print("Clip batch shape:", batch[0].shape)  # (B,C,T,H,W)
print("Label batch shape:", torch.tensor(batch[1]).shape)

Clip batch shape: torch.Size([24, 3, 32, 160, 160])
Label batch shape: torch.Size([24])


  print("Label batch shape:", torch.tensor(batch[1]).shape)


In [7]:
import torch.nn as nn
import torch.nn.functional as F

class R2Plus1DBlock(nn.Module):
    def __init__(self, in_ch, out_ch, stride=1):
        super().__init__()
        self.spatial = nn.Conv3d(in_ch, out_ch, kernel_size=(1,3,3),
                                 stride=(1,stride,stride), padding=(0,1,1), bias=False)
        self.bn1 = nn.BatchNorm3d(out_ch)

        self.temporal = nn.Conv3d(out_ch, out_ch, kernel_size=(3,1,1),
                                  stride=(1,1,1), padding=(1,0,0), bias=False)
        self.bn2 = nn.BatchNorm3d(out_ch)

        self.down = None
        if in_ch != out_ch or stride != 1:
            self.down = nn.Sequential(
                nn.Conv3d(in_ch, out_ch, kernel_size=1, stride=(1,stride,stride), bias=False),
                nn.BatchNorm3d(out_ch)
            )

    def forward(self, x):
        identity = x
        x = F.relu(self.bn1(self.spatial(x)))
        x = self.bn2(self.temporal(x))
        if self.down is not None:
            identity = self.down(identity)
        return F.relu(x + identity)

class R2Plus1DNet(nn.Module):
    def __init__(self, num_classes=101):
        super().__init__()
        self.stem = nn.Sequential(
            nn.Conv3d(3, 64, kernel_size=(3,7,7), stride=(1,2,2), padding=(1,3,3), bias=False),
            nn.BatchNorm3d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)),
        )
        self.layer1 = nn.Sequential(R2Plus1DBlock(64, 64),  R2Plus1DBlock(64, 64))
        self.layer2 = nn.Sequential(R2Plus1DBlock(64, 128, stride=2), R2Plus1DBlock(128, 128))
        self.layer3 = nn.Sequential(R2Plus1DBlock(128, 256, stride=2), R2Plus1DBlock(256, 256))
        self.layer4 = nn.Sequential(R2Plus1DBlock(256, 512, stride=2), R2Plus1DBlock(512, 512))
        self.head = nn.Sequential(
            nn.AdaptiveAvgPool3d((1,1,1)),
            nn.Flatten(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes)
        )
        self._init()

    def _init(self):
        for m in self.modules():
            if isinstance(m, nn.Conv3d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, nn.BatchNorm3d):
                nn.init.ones_(m.weight); nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01); nn.init.zeros_(m.bias)

    def forward(self, x):
        x = self.stem(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        return self.head(x)

model = R2Plus1DNet(num_classes=101).to(device)
print("Params (M):", sum(p.numel() for p in model.parameters())/1e6)

# Optional speed-up on A100 (PyTorch 2.x)
try:
    model = torch.compile(model)
    print("torch.compile enabled")
except Exception as e:
    print("torch.compile not enabled:", e)

Params (M): 7.069221
torch.compile enabled


In [8]:
from time import time
from torch.cuda.amp import autocast, GradScaler

def topk_acc(logits, targets, ks=(1,5)):
    with torch.no_grad():
        maxk = max(ks)
        pred = logits.topk(maxk, dim=1).indices
        correct = pred.eq(targets.view(-1,1))
        out = []
        for k in ks:
            out.append(correct[:, :k].any(dim=1).float().mean().item())
        return out

@torch.no_grad()
def eval_clip(model, loader):
    model.eval()
    s1=s5=n=0
    for x,y in loader:
        x = x.to(device, non_blocking=True)
        y = torch.as_tensor(y, device=device)
        logits = model(x)
        a1,a5 = topk_acc(logits, y, ks=(1,5))
        b = x.size(0)
        s1 += a1*b; s5 += a5*b; n += b
    return s1/n, s5/n

def train_one_epoch(model, loader, opt, scaler, epoch, total_epochs):
    model.train()
    t0=time()
    total_loss=0.0
    step=0

    for x,y in loader:
        x = x.to(device, non_blocking=True)
        y = torch.as_tensor(y, device=device)

        with autocast():
            logits = model(x)
            loss = F.cross_entropy(logits, y, label_smoothing=CFG["label_smoothing"])
            loss = loss / CFG["grad_accum"]

        scaler.scale(loss).backward()

        step += 1
        if step % CFG["grad_accum"] == 0:
            scaler.step(opt)
            scaler.update()
            opt.zero_grad(set_to_none=True)

        total_loss += loss.item() * x.size(0) * CFG["grad_accum"]

    dt=time()-t0
    return total_loss/len(loader.dataset), dt

# Optimizer
opt = torch.optim.SGD(model.parameters(), lr=CFG["lr"], momentum=0.9, weight_decay=CFG["weight_decay"])
scaler = GradScaler()

# LR schedule: warmup then cosine
def lr_at_epoch(ep):
    if ep < CFG["warmup_epochs"]:
        return (ep + 1) / CFG["warmup_epochs"]
    progress = (ep - CFG["warmup_epochs"]) / max(1, (CFG["epochs"] - CFG["warmup_epochs"]))
    return 0.5 * (1 + math.cos(math.pi * progress))

best_vtop1 = 0.0
ckpt_path = "/content/r2plus1d_ucf101_best.pt"

for ep in range(CFG["epochs"]):
    # set LR
    lr_scale = lr_at_epoch(ep)
    for pg in opt.param_groups:
        pg["lr"] = CFG["lr"] * lr_scale

    loss, dt = train_one_epoch(model, train_loader, opt, scaler, ep, CFG["epochs"])
    top1, top5 = eval_clip(model, test_loader)

    if top1 > best_vtop1:
        best_vtop1 = top1
        torch.save({"model": model.state_dict(), "epoch": ep+1, "clip_top1": top1}, ckpt_path)

    print(f"Ep {ep+1:03d}/{CFG['epochs']} | lr {opt.param_groups[0]['lr']:.5f} | loss {loss:.4f} | "
          f"TEST clip Top1 {top1*100:.2f}% Top5 {top5*100:.2f}% | {dt:.1f}s | best {best_vtop1*100:.2f}%")

  scaler = GradScaler()
  with autocast():
  return torch._C._get_cublas_allow_tf32()
  with autocast():


Ep 001/100 | lr 0.02400 | loss 4.4383 | TEST clip Top1 7.67% Top5 23.26% | 279.2s | best 7.67%


  with autocast():


Ep 002/100 | lr 0.04800 | loss 3.9967 | TEST clip Top1 12.40% Top5 34.02% | 219.0s | best 12.40%
Ep 003/100 | lr 0.07200 | loss 3.7082 | TEST clip Top1 17.05% Top5 42.51% | 219.1s | best 17.05%
Ep 004/100 | lr 0.09600 | loss 3.4826 | TEST clip Top1 17.84% Top5 44.83% | 219.6s | best 17.84%
Ep 005/100 | lr 0.12000 | loss 3.2671 | TEST clip Top1 22.31% Top5 52.05% | 218.6s | best 22.31%
Ep 006/100 | lr 0.12000 | loss 3.0062 | TEST clip Top1 26.67% Top5 56.60% | 218.9s | best 26.67%
Ep 007/100 | lr 0.11997 | loss 2.7622 | TEST clip Top1 29.39% Top5 60.40% | 218.6s | best 29.39%
Ep 008/100 | lr 0.11987 | loss 2.5578 | TEST clip Top1 30.27% Top5 59.93% | 219.2s | best 30.27%
Ep 009/100 | lr 0.11970 | loss 2.3826 | TEST clip Top1 31.64% Top5 63.28% | 218.5s | best 31.64%
Ep 010/100 | lr 0.11948 | loss 2.2304 | TEST clip Top1 29.76% Top5 60.61% | 218.7s | best 31.64%
Ep 011/100 | lr 0.11918 | loss 2.1095 | TEST clip Top1 34.13% Top5 62.65% | 219.8s | best 34.13%
Ep 012/100 | lr 0.11882 | loss

In [9]:
@torch.no_grad()
def predict_video_logits(model, video_path, cfg: ClipCfg, n_clips=10):
    model.eval()
    vr = VideoReader(video_path, ctx=cpu(0))
    n = len(vr)
    span = (cfg.frames - 1) * cfg.stride + 1

    if n <= span:
        starts = [0]*n_clips
    else:
        max_start = n - span
        starts = np.linspace(0, max_start, num=n_clips).astype(int).tolist()

    logits_all = []
    for s in starts:
        frames = load_clip(video_path, cfg.frames, cfg.stride, s)
        clip = clip_augment(frames, cfg, train=False).unsqueeze(0).to(device)
        logits_all.append(model(clip))
    return torch.mean(torch.cat(logits_all, dim=0), dim=0, keepdim=True)

@torch.no_grad()
def eval_video_level(model, items, videos_root, cfg: ClipCfg, n_clips=10):
    model.eval()
    c1=c5=0
    for rel, y in items:
        path = os.path.join(videos_root, rel)
        logits = predict_video_logits(model, path, cfg, n_clips=n_clips)
        top5 = logits.topk(5, dim=1).indices.squeeze(0).tolist()
        c1 += int(top5[0] == y)
        c5 += int(y in top5)
    return c1/len(items), c5/len(items)

# Load best and run video-level eval
ckpt = torch.load(ckpt_path, map_location=device)
model.load_state_dict(ckpt["model"])

v1, v5 = eval_video_level(model, test_items, VIDEOS_ROOT, clip_cfg, n_clips=CFG["nclips_eval"])
print(f"Video-level ({CFG['nclips_eval']} clips/video): Top1 {v1*100:.2f}% | Top5 {v5*100:.2f}%")

Video-level (10 clips/video): Top1 59.13% | Top5 82.24%


In [10]:
@torch.no_grad()
def demo_one():
    rel, y = random.choice(test_items)
    path = os.path.join(VIDEOS_ROOT, rel)
    logits = predict_video_logits(model, path, clip_cfg, n_clips=CFG["nclips_eval"])
    probs = torch.softmax(logits, dim=1).squeeze(0)
    vals, idxs = torch.topk(probs, k=5)

    print("Video:", rel)
    print("GT:", IDX_TO_CLASS[y])
    print("Top-5:")
    for p,i in zip(vals.tolist(), idxs.tolist()):
        print(f"  {IDX_TO_CLASS[i]:25s}  {p:.3f}")

demo_one()

Video: PullUps/v_PullUps_g01_c01.avi
GT: PullUps
Top-5:
  Mixing                     0.095
  PlayingTabla               0.059
  CleanAndJerk               0.045
  PlayingViolin              0.042
  BlowDryHair                0.039
