Defines the VideoClipDataset and ConvLSTM model, trains it on clip-level data with train/val loaders, saves per-epoch checkpoints, selects convlstm_best.pth, and evaluates test accuracy and speed plus confusion matrix / precision / recall / F1.

In [1]:
import os
import json
from collections import Counter

import torch

PROJECT_ROOT = "/home/olzhas/programming/traffic-accident-edge"
DATA_ROOT = os.path.join(PROJECT_ROOT, "TAD-benchmark")
CLIPS_INDEX_PATH = os.path.join(DATA_ROOT, "clips_index.json")

print("DATA_ROOT exists:", os.path.exists(DATA_ROOT))
print("CLIPS_INDEX_PATH exists:", os.path.exists(CLIPS_INDEX_PATH))

with open(CLIPS_INDEX_PATH, "r") as f:
    clips = json.load(f)

print("Total clips:", len(clips))

split_counts = Counter(c["split"] for c in clips)
label_counts = Counter(c["label"] for c in clips)

print("By split:", split_counts)
print("By label:", label_counts)
print("Sample clip:", clips[0])

train_clips = [c for c in clips if c["split"] == "train"]
val_clips = [c for c in clips if c["split"] == "val"]
test_clips = [c for c in clips if c["split"] == "test"]

print(f"train_clips: {len(train_clips)}")
print(f"val_clips:   {len(val_clips)}")
print(f"test_clips:  {len(test_clips)}")

print("Train label counts:", Counter(c["label"] for c in train_clips))
print("Val   label counts:", Counter(c["label"] for c in val_clips))
print("Test  label counts:", Counter(c["label"] for c in test_clips))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
if torch.cuda.is_available():
    print("GPU name:", torch.cuda.get_device_name(0))

DATA_ROOT exists: True
CLIPS_INDEX_PATH exists: True
Total clips: 22037
By split: Counter({'train': 16020, 'val': 3903, 'test': 2114})
By label: Counter({1: 16075, 0: 5962})
Sample clip: {'video_path': '/home/olzhas/programming/traffic-accident-edge/TAD-benchmark/train/accident_1/videox3_10.mp4', 'frame_indices': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 'label': 1, 'split': 'train'}
train_clips: 16020
val_clips:   3903
test_clips:  2114
Train label counts: Counter({1: 12071, 0: 3949})
Val   label counts: Counter({1: 2959, 0: 944})
Test  label counts: Counter({0: 1069, 1: 1045})
Using device: cuda
GPU name: NVIDIA GeForce RTX 2080 Ti


In [2]:
import cv2
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T


CLIP_LEN = 16
IMG_SIZE = 112

frame_transform = T.Compose([
    T.ToPILImage(),
    T.Resize((IMG_SIZE, IMG_SIZE)),
    T.ToTensor(),
    T.Normalize(mean=[0.5, 0.5, 0.5],
                std=[0.5, 0.5, 0.5])
])


class VideoClipDataset(Dataset):
    """
    Dataset that returns (clip_tensor, label) for each entry in clips list.
    clip_tensor: shape [T, C, H, W] where T = number of frames (e.g. 16)
    label: 0 (normal) or 1 (accident)
    """

    def __init__(self, clips_list, transform=None):
        self.clips = clips_list
        self.transform = transform

    def __len__(self):
        return len(self.clips)

    def __getitem__(self, idx):
        clip_info = self.clips[idx]
        video_path = clip_info["video_path"]
        frame_indices = clip_info["frame_indices"]
        label = clip_info["label"]

        cap = cv2.VideoCapture(video_path)
        frames = []

        for fi in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, fi)
            ret, frame = cap.read()
            if not ret or frame is None:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            if self.transform is not None:
                frame = self.transform(frame)
            else:
                frame = torch.from_numpy(frame).permute(
                    2, 0, 1).float() / 255.0
            frames.append(frame)

        cap.release()

        if len(frames) == 0:
            clip_tensor = torch.zeros(CLIP_LEN, 3, IMG_SIZE, IMG_SIZE)
        else:
            while len(frames) < CLIP_LEN:
                frames.append(frames[-1].clone())
            clip_tensor = torch.stack(frames[:CLIP_LEN], dim=0)

        label_tensor = torch.tensor(label, dtype=torch.long)
        return clip_tensor, label_tensor


train_dataset = VideoClipDataset(train_clips, transform=frame_transform)
val_dataset = VideoClipDataset(val_clips,   transform=frame_transform)
test_dataset = VideoClipDataset(test_clips,  transform=frame_transform)


sample_clip, sample_label = train_dataset[0]

print("Sample clip shape:", sample_clip.shape)
print("Sample label:", sample_label)

Sample clip shape: torch.Size([16, 3, 112, 112])
Sample label: tensor(1)


In [3]:
from torch.utils.data import DataLoader

BATCH_SIZE = 8

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    pin_memory=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
    pin_memory=True
)

test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
    pin_memory=True
)


batch_clips, batch_labels = next(iter(train_loader))
print("Batch clips shape:", batch_clips.shape)   # [B, 16, 3, 112, 112]
print("Batch labels shape:", batch_labels.shape)
print("Batch labels sample:", batch_labels[:8])

Batch clips shape: torch.Size([8, 16, 3, 112, 112])
Batch labels shape: torch.Size([8])
Batch labels sample: tensor([1, 0, 1, 1, 1, 0, 0, 1])


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class ConvLSTMCell(nn.Module):
    """
    Basic ConvLSTM cell.
    Inputs:
      x_t:  [B, C_in, H, W]
      h_t:  [B, C_hidden, H, W]
      c_t:  [B, C_hidden, H, W]
    """

    def __init__(self, input_dim, hidden_dim, kernel_size=3):
        super().__init__()
        padding = kernel_size // 2

        self.conv = nn.Conv2d(
            in_channels=input_dim + hidden_dim,
            out_channels=4 * hidden_dim,
            kernel_size=kernel_size,
            padding=padding
        )
        self.hidden_dim = hidden_dim

    def forward(self, x, h_prev, c_prev):
        combined = torch.cat([x, h_prev], dim=1)
        conv_out = self.conv(combined)

        cc_i, cc_f, cc_o, cc_g = torch.split(conv_out, self.hidden_dim, dim=1)

        i = torch.sigmoid(cc_i)
        f = torch.sigmoid(cc_f)
        o = torch.sigmoid(cc_o)
        g = torch.tanh(cc_g)

        c = f * c_prev + i * g
        h = o * torch.tanh(c)
        return h, c


class SimpleCNNEncoder(nn.Module):
    """
    Small CNN to get spatial features from each frame.
    Input:  [B, 3, H, W]
    Output: [B, C_feat, H_feat, W_feat]
    """

    def __init__(self, in_channels=3, base_channels=32):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, base_channels,
                               kernel_size=3, stride=2, padding=1)
        self.bn1 = nn.BatchNorm2d(base_channels)
        self.conv2 = nn.Conv2d(
            base_channels, base_channels * 2, kernel_size=3, stride=2, padding=1)
        self.bn2 = nn.BatchNorm2d(base_channels * 2)
        self.conv3 = nn.Conv2d(
            base_channels * 2, base_channels * 4, kernel_size=3, stride=2, padding=1)
        self.bn3 = nn.BatchNorm2d(base_channels * 4)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.relu(self.bn2(self.conv2(x)))
        x = self.relu(self.bn3(self.conv3(x)))
        return x


class ConvLSTMClassifier(nn.Module):
    """
    Full model: frame-wise CNN encoder + ConvLSTM over time + classifier.
    Input:  [B, T, C, H, W]
    Output: logits [B, 2] (accident vs normal)
    """

    def __init__(self, img_channels=3, base_channels=32, hidden_dim=128, num_classes=2):
        super().__init__()
        self.encoder = SimpleCNNEncoder(
            in_channels=img_channels, base_channels=base_channels)

        feat_channels = base_channels * 4
        self.convlstm_cell = ConvLSTMCell(
            input_dim=feat_channels, hidden_dim=hidden_dim, kernel_size=3)

        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        B, T, C, H, W = x.shape

        x = x.view(B * T, C, H, W)
        feats = self.encoder(x)

        _, C_feat, H_feat, W_feat = feats.shape
        feats = feats.view(B, T, C_feat, H_feat, W_feat)

        h = torch.zeros(B, self.convlstm_cell.hidden_dim,
                        H_feat, W_feat, device=feats.device)
        c = torch.zeros_like(h)

        for t in range(T):
            x_t = feats[:, t]
            h, c = self.convlstm_cell(x_t, h, c)

        h_pooled = F.adaptive_avg_pool2d(
            h, (1, 1)).view(B, -1)

        logits = self.fc(h_pooled)
        return logits


model = ConvLSTMClassifier(
    img_channels=3, base_channels=32, hidden_dim=128, num_classes=2)
model = model.to(device)

print("Model params:", sum(p.numel() for p in model.parameters()) / 1e6, "M")


batch_clips, batch_labels = next(iter(train_loader))
batch_clips = batch_clips.to(device)

with torch.no_grad():
    logits = model(batch_clips)

print("Input batch shape:", batch_clips.shape)
print("Logits shape:", logits.shape)  # [B, 2]

Model params: 1.274114 M
Input batch shape: torch.Size([8, 16, 3, 112, 112])
Logits shape: torch.Size([8, 2])


In [5]:
import os
import torch
import torch.nn as nn
from collections import Counter

train_counts = Counter(c["label"] for c in train_clips)
print("Train label counts:", train_counts)

total = train_counts[0] + train_counts[1]
w0 = total / (2 * train_counts[0])
w1 = total / (2 * train_counts[1])

class_weights = torch.tensor([w0, w1], device=device, dtype=torch.float32)
print("Class weights (0=normal, 1=accident):", class_weights)

criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
NUM_EPOCHS = 5

ckpt_dir = os.path.join(PROJECT_ROOT, "checkpoints_convlstm")
os.makedirs(ckpt_dir, exist_ok=True)
print("Checkpoints will be saved to:", ckpt_dir)


def train_one_epoch(model, loader, optimizer, device):
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0

    for batch_idx, (clips, labels) in enumerate(loader):
        clips = clips.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model(clips)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * clips.size(0)
        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += clips.size(0)

        if batch_idx % 100 == 0:
            print(f"[train] batch {batch_idx}/{len(loader)} "
                  f"loss={loss.item():.4f}")

    avg_loss = total_loss / total
    acc = correct / total
    return avg_loss, acc


def evaluate(model, loader, device):
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for clips, labels in loader:
            clips = clips.to(device)
            labels = labels.to(device)

            logits = model(clips)
            loss = criterion(logits, labels)

            total_loss += loss.item() * clips.size(0)
            preds = logits.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += clips.size(0)

    avg_loss = total_loss / total
    acc = correct / total
    return avg_loss, acc


best_val_acc = 0.0
best_state = None

for epoch in range(1, NUM_EPOCHS + 1):
    print(f"\n=== Epoch {epoch}/{NUM_EPOCHS} ===")
    train_loss, train_acc = train_one_epoch(
        model, train_loader, optimizer, device)
    val_loss, val_acc = evaluate(model, val_loader, device)

    print(f"Epoch {epoch:02d}: "
          f"train_loss={train_loss:.4f}, train_acc={train_acc:.4f}, "
          f"val_loss={val_loss:.4f}, val_acc={val_acc:.4f}")

    epoch_state = {
        "model": model.state_dict(),
        "epoch": epoch,
        "val_acc": val_acc,
    }
    epoch_ckpt_path = os.path.join(
        ckpt_dir,
        f"epoch_{epoch:02d}_val_{val_acc:.3f}.pth"
    )
    torch.save(epoch_state, epoch_ckpt_path)
    print("  -> Saved epoch checkpoint to:", epoch_ckpt_path)

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_state = epoch_state
        print("  -> New best model (val_acc improved)")

print(f"\nBest val_acc: {best_val_acc:.4f} at epoch {best_state['epoch']}")

Train label counts: Counter({1: 12071, 0: 3949})
Class weights (0=normal, 1=accident): tensor([2.0284, 0.6636], device='cuda:0')
Checkpoints will be saved to: /home/olzhas/programming/traffic-accident-edge/checkpoints_convlstm

=== Epoch 1/5 ===
[train] batch 0/2003 loss=0.6726
[train] batch 100/2003 loss=0.5319
[train] batch 200/2003 loss=0.4213
[train] batch 300/2003 loss=0.4229
[train] batch 400/2003 loss=0.4668
[train] batch 500/2003 loss=0.2866
[train] batch 600/2003 loss=0.5107
[train] batch 700/2003 loss=0.0703
[train] batch 800/2003 loss=0.2772
[train] batch 900/2003 loss=0.4377
[train] batch 1000/2003 loss=0.0613
[train] batch 1100/2003 loss=0.5804
[train] batch 1200/2003 loss=0.1105
[train] batch 1300/2003 loss=0.4247
[train] batch 1400/2003 loss=0.3598
[train] batch 1500/2003 loss=0.3019
[train] batch 1600/2003 loss=0.1175
[train] batch 1700/2003 loss=0.1818
[train] batch 1800/2003 loss=0.0177
[train] batch 1900/2003 loss=0.1025
[train] batch 2000/2003 loss=0.2492
Epoch 01: 

In [6]:
import os
import torch

best_path = os.path.join(PROJECT_ROOT, "convlstm_best.pth")
torch.save(best_state, best_path)
print("Saved best ConvLSTM model to:", best_path)
print("Best epoch:", best_state["epoch"],
      "with val_acc:", best_state["val_acc"])

Saved best ConvLSTM model to: /home/olzhas/programming/traffic-accident-edge/convlstm_best.pth
Best epoch: 2 with val_acc: 0.8252626184985908


In [7]:
import torch

model_test = ConvLSTMClassifier(
    img_channels=3,
    base_channels=32,
    hidden_dim=128,
    num_classes=2
).to(device)

best_path = os.path.join(PROJECT_ROOT, "convlstm_best.pth")
ckpt = torch.load(best_path, map_location=device)
model_test.load_state_dict(ckpt["model"])

print("Loaded ConvLSTM model from:", best_path)
print("Best epoch:", ckpt["epoch"], "val_acc:", ckpt["val_acc"])

test_loss, test_acc = evaluate(model_test, test_loader, device)
print(f"TEST: loss={test_loss:.4f}, acc={test_acc:.4f}")

Loaded ConvLSTM model from: /home/olzhas/programming/traffic-accident-edge/convlstm_best.pth
Best epoch: 2 val_acc: 0.8252626184985908
TEST: loss=1.3024, acc=0.7649


In [9]:
import os
import torch

best_path = os.path.join(PROJECT_ROOT, "convlstm_best.pth")

print("ConvLSTM checkpoint exists:", os.path.exists(best_path))
if os.path.exists(best_path):
    size_mb = os.path.getsize(best_path) / (1024 * 1024)
    print(f"ConvLSTM checkpoint size: {size_mb:.2f} MB")

ConvLSTM checkpoint exists: True
ConvLSTM checkpoint size: 4.87 MB


In [10]:
import time
import torch


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

model_test = ConvLSTMClassifier(
    img_channels=3,
    base_channels=32,
    hidden_dim=128,
    num_classes=2
).to(device)

ckpt = torch.load(best_path, map_location=device)
model_test.load_state_dict(ckpt["model"])
model_test.eval()

print("Loaded ConvLSTM best epoch:",
      ckpt["epoch"], "val_acc:", ckpt["val_acc"])

total = 0
correct = 0

if device.type == "cuda":
    torch.cuda.synchronize()
start_time = time.time()

with torch.no_grad():
    for clips, labels in test_loader:
        clips = clips.to(device)
        labels = labels.to(device)

        logits = model_test(clips)
        preds = logits.argmax(dim=1)

        correct += (preds == labels).sum().item()
        total += labels.size(0)

if device.type == "cuda":
    torch.cuda.synchronize()
end_time = time.time()

total_time = end_time - start_time
avg_time_per_clip = total_time / max(total, 1)
clips_per_sec = 1.0 / avg_time_per_clip if avg_time_per_clip > 0 else 0.0


frames_per_sec = clips_per_sec * 16

test_acc = correct / total if total > 0 else 0.0

print(f"\nConvLSTM TEST accuracy: {test_acc:.4f}")
print(f"Total clips: {total}")
print(f"Total time: {total_time:.2f} s")
print(f"Avg time per clip: {avg_time_per_clip*1000:.2f} ms")
print(f"Clips per second: {clips_per_sec:.2f}")
print(f"Approx frames per second (16 frames/clip): {frames_per_sec:.2f}")

Using device: cuda
Loaded ConvLSTM best epoch: 2 val_acc: 0.8252626184985908

ConvLSTM TEST accuracy: 0.7649
Total clips: 2114
Total time: 1199.34 s
Avg time per clip: 567.33 ms
Clips per second: 1.76
Approx frames per second (16 frames/clip): 28.20


In [12]:
import numpy as np
from collections import Counter
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

model_test.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for clips, labels in test_loader:
        clips = clips.to(device)
        labels = labels.to(device)

        logits = model_test(clips)
        preds = logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy().tolist())
        all_labels.extend(labels.cpu().numpy().tolist())

y_true = np.array(all_labels)
y_pred = np.array(all_preds)

cm = confusion_matrix(y_true, y_pred, labels=[1, 0])
print("Confusion matrix (rows=true, cols=pred) [accident, normal]:\n", cm)

prec, rec, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, labels=[1, 0], zero_division=0
)

print(
    f"Accident class: precision={prec[0]:.3f}, recall={rec[0]:.3f}, f1={f1[0]:.3f}")
print(
    f"Normal   class: precision={prec[1]:.3f}, recall={rec[1]:.3f}, f1={f1[1]:.3f}")

test_counts = Counter(c["label"] for c in test_clips)
majority_label = max(test_counts, key=test_counts.get)
baseline_acc = test_counts[majority_label] / (test_counts[0] + test_counts[1])

print("\nTest label counts:", test_counts)
print("Majority baseline accuracy:", baseline_acc)

Confusion matrix (rows=true, cols=pred) [accident, normal]:
 [[973  72]
 [425 644]]
Accident class: precision=0.696, recall=0.931, f1=0.797
Normal   class: precision=0.899, recall=0.602, f1=0.722

Test label counts: Counter({0: 1069, 1: 1045})
Majority baseline accuracy: 0.5056764427625354
