In [1]:
import os
import math
import glob
import numpy as np
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models

In [2]:
def load_flow_stack(u_dir, v_dir, idx, stack_len=5, name_pattern="frame_{:05d}.jpg"):
    """
    Return numpy array of shape (10, H, W) -> 5 u followed by 5 v (grayscale)
    idx is the *frame index* (zero-based) corresponding to the last frame in the stack.
    """
    u_imgs = []
    v_imgs = []
    for i in range(idx - stack_len + 1, idx + 1):
        fname = name_pattern.format(i)
        u_path = os.path.join(u_dir, fname)
        v_path = os.path.join(v_dir, fname)
        if not os.path.exists(u_path) or not os.path.exists(v_path):
            return None
        u_img = cv2.imread(u_path, cv2.IMREAD_GRAYSCALE)
        v_img = cv2.imread(v_path, cv2.IMREAD_GRAYSCALE)
        if u_img is None or v_img is None:
            return None
        # ensure (H,W)
        u_imgs.append(u_img[np.newaxis, ...])  # (1,H,W)
        v_imgs.append(v_img[np.newaxis, ...])
    stack = np.concatenate(u_imgs + v_imgs, axis=0).astype(np.float32)  # (10,H,W)
    return stack


In [3]:
class FlowFeatureExtractor(nn.Module):
    def __init__(self, in_channels=10, out_dim=1024, backbone="resnet50"):
        super().__init__()
        self.in_channels = in_channels
        self.out_dim = out_dim
        self.backbone_name = backbone

        if backbone == "resnet50":
            # robust fallback: ResNet50 + project
            net = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
            # adapt first conv to in_channels
            old_conv = net.conv1
            new_conv = nn.Conv2d(in_channels, old_conv.out_channels,
                                 kernel_size=old_conv.kernel_size,
                                 stride=old_conv.stride,
                                 padding=old_conv.padding,
                                 bias=(old_conv.bias is not None))
            # init new_conv weights by copying RGB weights and repeating/averaging
            with torch.no_grad():
                if in_channels >= 3:
                    new_conv.weight[:, :3, :, :] = old_conv.weight
                    # extra channels: copy average of RGB
                    if in_channels > 3:
                        mean_rgb = old_conv.weight.mean(dim=1, keepdim=True)  # (out,1,k,k)
                        for c in range(3, in_channels):
                            new_conv.weight[:, c:c+1, :, :] = mean_rgb
                else:
                    # unlikely: in_channels < 3
                    new_conv.weight[:, :in_channels, :, :] = old_conv.weight[:, :in_channels, :, :]
            net.conv1 = new_conv

            # remove classification head: use global avgpool output (2048)
            net.fc = nn.Identity()  # so forward returns (B,2048) after avgpool
            feat_dim = 2048
            self.backbone = net
            self.proj = nn.Linear(feat_dim, out_dim)
        else:
            raise ValueError("Backbone '%s' not implemented in this script" % backbone)

        # freeze backbone (paper: features fixed)
        for p in self.backbone.parameters():
            p.requires_grad = False
        # allow proj to be trainable? Paper keeps extractor fixed; projection can be part of feature extraction
        for p in self.proj.parameters():
            p.requires_grad = False  # keep projection fixed too for strict adherence
            # if you want to fine-tune proj set True

    def forward(self, x):
        # x: (B, in_channels, H, W) float, normalized already
        b = x.size(0)
        feat = self.backbone(x)          # (B, feat_dim)
        out = self.proj(feat)            # (B, out_dim)
        return out

# -----------------------------------
# Extract flow features for one video (save .npy)
# -----------------------------------
def extract_flow_features_for_video(u_dir, v_dir, out_path,
                                    extractor: FlowFeatureExtractor,
                                    stack_len=5, stride=5,
                                    resize=(256,456), device="cpu",
                                    name_pattern="frame_{:05d}.jpg",
                                    verbose=True):
    """
    Walk frames in u_dir/v_dir and extract features. Save to out_path (.npy)
    - resize is (H, W) = (256,456) as per paper (H=256, W=456).
    - stride is sampling of last stack frame index (if sample_rate=5 the npy index ~ frame//5).
    """
    files = sorted(os.listdir(u_dir))
    n_frames = len(files)
    feats = []
    extractor = extractor.to(device).eval()

    # precompute mean/std as tensors
    mean = torch.tensor([0.485] * extractor.in_channels, device=device).view(1, extractor.in_channels, 1, 1)
    std  = torch.tensor([0.229] * extractor.in_channels, device=device).view(1, extractor.in_channels, 1, 1)

    for idx in range(stack_len - 1, n_frames, stride):
        stack = load_flow_stack(u_dir, v_dir, idx, stack_len=stack_len, name_pattern=name_pattern)
        if stack is None:
            if verbose:
                # missing frames at edges
                pass
            continue
        # stack: (10, H_orig, W_orig)
        t = torch.from_numpy(stack).unsqueeze(0).to(device)  # (1,10,H,W)
        t = t.float() / 255.0
        # resize to desired (H,W)
        t = F.interpolate(t, size=resize, mode="bilinear", align_corners=False)
        # normalize
        t = (t - mean) / std

        with torch.no_grad():
            f = extractor(t)  # (1, out_dim)
        feats.append(f.cpu().numpy().reshape(-1))

    feats = np.stack(feats, axis=0) if len(feats) > 0 else np.zeros((0, extractor.out_dim), dtype=np.float32)
    np.save(out_path, feats)
    if verbose:
        print(f"Saved features {feats.shape} -> {out_path}")
    return feats


In [4]:
class EpicFlowFeatureDataset(Dataset):
    def __init__(self, annots, feature_dir, context=21, stride_on_features=1, label_key="action_class"):
        """
        annots: list of dicts with keys 'video_id', 'start_frame', ... (start_frame in raw video frame units)
        feature_dir: folder containing per-video files like P01_01_flow.npy
        context: number of snippets (features) to use (e.g. 21)
        stride_on_features: if features were sampled at sample_rate s, then feature_index = frame // s.
                            If your .npy already indexes sampled frames, set stride_on_features=1.
        label_key: which annotation field to use ("action_class"/"verb_class"/"noun_class")
        """
        self.samples = []
        self.feature_dir = feature_dir
        self.context = context
        self.stride_on_features = stride_on_features
        self.label_key = label_key

        for a in annots:
            vid = a["video_id"]
            start = int(a["start_frame"])
            label = int(a[self.label_key])
            feat_path = os.path.join(feature_dir, f"{vid}_flow.npy")
            if not os.path.exists(feat_path):
                continue
            self.samples.append({
                "vid": vid, "start_frame": start, "feat_path": feat_path, "label": label
            })

        # cache loaded feature arrays
        self._cache = {}

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        s = self.samples[idx]
        vid = s["vid"]
        if vid not in self._cache:
            self._cache[vid] = np.load(s["feat_path"])   # (T_feats, D)
        feats = self._cache[vid]
        feat_index = s["start_frame"] // self.stride_on_features
        feat_index = min(feat_index, len(feats))  # clamp
        start_idx = max(0, feat_index - self.context)
        seq = feats[start_idx:feat_index]   # (k, D)
        if seq.shape[0] < self.context:
            pad = np.zeros((self.context - seq.shape[0], feats.shape[1]), dtype=np.float32)
            seq = np.vstack([pad, seq])
        return torch.from_numpy(seq).float(), torch.tensor(s["label"]).long()


In [5]:
class ResidualTemporalBlock(nn.Module):
    def __init__(self, C, kernel=3, dilation=1, p_drop=0.3):
        super().__init__()
        pad = (kernel - 1) * dilation
        self.conv = nn.Conv1d(C, C, kernel, padding=pad, dilation=dilation)
        self.bn = nn.BatchNorm1d(C)
        self.drop = nn.Dropout(p_drop)
        self.relu = nn.ReLU(inplace=True)
    def forward(self, x):
        # x: (B, C, T)
        out = self.conv(x)
        out = self.bn(out)
        out = self.drop(out)
        # trim right padding to keep length: conv with padding=(k-1)*d returns len + pad, so slice last T
        out = out[..., :x.size(2)]
        return self.relu(out + x)

class UniFlowTCN(nn.Module):
    def __init__(self, in_dim=1024, hidden=1024, n_layers=4, kernel=3,
                 n_action=200, n_verb=None, n_noun=None, p_drop=0.3):
        super().__init__()
        self.input_proj = nn.Conv1d(in_dim, hidden, kernel_size=1)
        self.blocks = nn.ModuleList([
            ResidualTemporalBlock(hidden, kernel=kernel, dilation=(i+1), p_drop=p_drop)
            for i in range(n_layers)
        ])
        # heads (global pooling over time)
        self.head_action = nn.Sequential(nn.AdaptiveAvgPool1d(1), nn.Flatten(), nn.Dropout(0.7), nn.Linear(hidden, n_action))
        # optional verb/noun
        self.head_verb = None
        self.head_noun = None
        if n_verb:
            self.head_verb = nn.Sequential(nn.AdaptiveAvgPool1d(1), nn.Flatten(), nn.Dropout(0.7), nn.Linear(hidden, n_verb))
        if n_noun:
            self.head_noun = nn.Sequential(nn.AdaptiveAvgPool1d(1), nn.Flatten(), nn.Dropout(0.7), nn.Linear(hidden, n_noun))

    def forward(self, x):
        # x: (B, T, D)
        x = x.permute(0, 2, 1)  # (B, D, T)
        x = self.input_proj(x)  # (B, hidden, T)
        for b in self.blocks:
            x = b(x)
        outA = self.head_action(x)
        outV, outN = None, None
        if self.head_verb:
            outV = self.head_verb(x)
        if self.head_noun:
            outN = self.head_noun(x)
        return outA, outV, outN


In [36]:
def train_unimodal_flow(annots, feature_root, num_actions,
                        context=21, batch_size=16, epochs=30,
                        lr=5e-4, weight_decay=5e-4, stride_on_features=1,
                        device=None):
    device = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu"))
    ds = EpicFlowFeatureDataset(annots, feature_root, context=context, stride_on_features=stride_on_features, label_key="action_class")
    loader = DataLoader(ds, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)
    model = UniFlowTCN(in_dim=1024, hidden=1024, n_layers=4, n_action=num_actions).to(device)
    opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=weight_decay)
    ce = nn.CrossEntropyLoss()

    for ep in range(1, epochs+1):
        model.train()
        total, correct, loss_sum = 0.0, 0, 0.0
        for X, y in loader:
            X = X.to(device)   # (B,T,1024)
            y = y.to(device)
            outA,_,_ = model(X)
            loss = ce(outA, y)
            opt.zero_grad()
            loss.backward()
            opt.step()

            loss_sum += loss.item()
            preds = outA.argmax(dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)
        print(f"Epoch {ep}/{epochs} loss={loss_sum/len(loader):.4f} acc={correct/total:.4f}")
    return model

In [9]:
u_dir=r"C:\Users\Alis\Desktop\Minor Project\OpticalFlow\P01_01\u"
v_dir=r"C:\Users\Alis\Desktop\Minor Project\OpticalFlow\P01_01\v"
extractor = FlowFeatureExtractor(in_channels=10, out_dim=1024, backbone="resnet50")
extract_flow_features_for_video(u_dir, v_dir, out_path="P01_01_flow.npy", extractor=extractor,
                                stack_len=5, stride=5, resize=(256,456), device="cuda")

Saved features (9913, 1024) -> P01_01_flow.npy


array([[ 0.36699796, -0.09396664,  0.04978663, ..., -0.01747663,
         0.22542918,  0.10597148],
       [ 0.39778498, -0.13228619,  0.09689356, ..., -0.04566773,
         0.23897809,  0.05014143],
       [ 0.39031318, -0.14413048,  0.01582648, ..., -0.01113382,
         0.29390094,  0.02070261],
       ...,
       [ 0.39688268,  0.046829  ,  0.06360796, ..., -0.4679838 ,
         0.02355651, -0.36303392],
       [ 0.5092193 , -0.05146424,  0.13493755, ..., -0.3841746 ,
         0.09903391, -0.4390804 ],
       [ 0.1496426 ,  0.09781548,  0.19651009, ..., -0.14331797,
         0.16697614, -0.20242082]], dtype=float32)

In [17]:
u_dir=r"C:\Users\Alis\Desktop\Minor Project\OpticalFlow\P01_02\u"
v_dir=r"C:\Users\Alis\Desktop\Minor Project\OpticalFlow\P01_02\v"
extractor = FlowFeatureExtractor(in_channels=10, out_dim=1024, backbone="resnet50")
extract_flow_features_for_video(u_dir, v_dir, out_path="P01_02_flow.npy", extractor=extractor,
                                stack_len=5, stride=5, resize=(256,456), device="cuda")

Saved features (3013, 1024) -> P01_02_flow.npy


array([[ 0.15611236, -0.10960457, -0.0683035 , ...,  0.22482777,
        -0.3398625 ,  0.14191732],
       [ 0.2840967 , -0.05636885, -0.19736274, ...,  0.26885965,
        -0.37536326,  0.30609125],
       [ 0.05536053, -0.05508311, -0.07395775, ...,  0.24293973,
        -0.25257096,  0.12840156],
       ...,
       [ 0.08066492, -0.20348191, -0.16416739, ...,  0.29985347,
        -0.19633481,  0.04107007],
       [ 0.03719174, -0.10905169,  0.00930319, ...,  0.12232547,
        -0.01758666,  0.05853077],
       [ 0.12534976, -0.18267441, -0.14506426, ...,  0.07100389,
        -0.20469187,  0.18305536]], dtype=float32)

In [18]:
u_dir=r"C:\Users\Alis\Desktop\Minor Project\OpticalFlow\P01_03\u"
v_dir=r"C:\Users\Alis\Desktop\Minor Project\OpticalFlow\P01_03\v"
extractor = FlowFeatureExtractor(in_channels=10, out_dim=1024, backbone="resnet50")
extract_flow_features_for_video(u_dir, v_dir, out_path="P01_03_flow.npy", extractor=extractor,
                                stack_len=5, stride=5, resize=(256,456), device="cuda")

Saved features (713, 1024) -> P01_03_flow.npy


array([[ 0.05461931,  0.14494097, -0.3226608 , ...,  0.09361984,
        -0.10034194,  0.19786872],
       [ 0.15957429,  0.21602915, -0.34196556, ...,  0.12058246,
        -0.2081184 ,  0.14731336],
       [ 0.12522033,  0.15050147, -0.27202994, ...,  0.16045268,
        -0.16295923,  0.32625225],
       ...,
       [ 0.127799  , -0.13002308, -0.16407773, ...,  0.0600653 ,
        -0.1542764 ,  0.2600136 ],
       [ 0.14310129,  0.06494825,  0.0255594 , ..., -0.08432195,
        -0.13418433,  0.16121985],
       [ 0.07974368,  0.0647414 , -0.16003482, ..., -0.07082683,
        -0.08923995,  0.32279852]], dtype=float32)

In [19]:
u_dir=r"C:\Users\Alis\Desktop\Minor Project\OpticalFlow\P01_04\u"
v_dir=r"C:\Users\Alis\Desktop\Minor Project\OpticalFlow\P01_04\v"
extractor = FlowFeatureExtractor(in_channels=10, out_dim=1024, backbone="resnet50")
extract_flow_features_for_video(u_dir, v_dir, out_path="P01_04_flow.npy", extractor=extractor,
                                stack_len=5, stride=5, resize=(256,456), device="cuda")

Saved features (631, 1024) -> P01_04_flow.npy


array([[ 0.19723895, -0.24409574,  0.01781794, ...,  0.21769232,
         0.01159016,  0.20624839],
       [ 0.15788816, -0.09464243,  0.04870874, ...,  0.26431003,
        -0.07470681,  0.20641541],
       [ 0.21049494, -0.14488025,  0.2837513 , ...,  0.23482522,
        -0.16149665,  0.3392455 ],
       ...,
       [ 0.13354185, -0.23806041, -0.09902406, ...,  0.09224624,
         0.0520287 ,  0.32973006],
       [ 0.16996531, -0.31401446, -0.02806909, ...,  0.07524844,
         0.08955118,  0.36377543],
       [ 0.17643633, -0.23866682, -0.00228741, ...,  0.17315897,
        -0.00053515,  0.15044698]], dtype=float32)

In [20]:
u_dir=r"C:\Users\Alis\Desktop\Minor Project\OpticalFlow\P01_05\u"
v_dir=r"C:\Users\Alis\Desktop\Minor Project\OpticalFlow\P01_05\v"
extractor = FlowFeatureExtractor(in_channels=10, out_dim=1024, backbone="resnet50")
extract_flow_features_for_video(u_dir, v_dir, out_path="P01_05_flow.npy", extractor=extractor,
                                stack_len=5, stride=5, resize=(256,456), device="cuda")

Saved features (7632, 1024) -> P01_05_flow.npy


array([[ 0.2694452 ,  0.3494546 , -0.00386948, ..., -0.1387239 ,
        -0.39857176, -0.24953668],
       [ 0.28278184,  0.09898116,  0.17627755, ..., -0.2428058 ,
        -0.41330972, -0.1611986 ],
       [ 0.2520699 ,  0.11901359,  0.06085365, ..., -0.21500331,
        -0.45165324, -0.32585177],
       ...,
       [ 0.23219942,  0.12484025,  0.00754249, ..., -0.25584665,
        -0.3017678 , -0.41445315],
       [ 0.02810715,  0.03071283, -0.09522419, ..., -0.25188228,
        -0.32147995, -0.2561596 ],
       [ 0.01017908,  0.0276692 , -0.04355564, ..., -0.24570741,
        -0.24765125, -0.2583301 ]], dtype=float32)

In [8]:
import os
import pandas as pd


In [27]:
label_dir = r"C:\Users\Alis\Desktop\Minor Project\Label"  # adjust path if needed

all_annots = []

for fname in os.listdir(label_dir):
    if fname.endswith(".csv"):
        video_id = fname.replace(".csv", "")  # e.g., P01_01
        df = pd.read_csv(os.path.join(label_dir, fname))

        # add video_id column
        df["video_id"] = video_id

        all_annots.append(df)

# merge into one dataframe
df_all = pd.concat(all_annots, ignore_index=True)
print(df_all.head())


   StartFrame  EndFrame     Verb  Verb_class    Noun  Noun_class  \
0           8       202     open           2    door           8   
1         262       370  turn-on          12   light         113   
2         418       569    close           3    door           8   
3         766       839     open           2  fridge          10   
4         915       983     take           0  celery         185   

   Action_class     ActionName video_id  
0             0      open door   P01_01  
1             1  turn-on light   P01_01  
2             2     close door   P01_01  
3             3    open fridge   P01_01  
4             4    take celery   P01_01  


In [37]:
annots = []

for _, row in df_all.iterrows():
    annots.append({
        "video_id": row["video_id"],
        "start_frame": int(row["StartFrame"]),
        "action_class": int(row["Action_class"]),
        "verb_class": int(row["Verb_class"]),
        "noun_class": int(row["Noun_class"]),
    })

num_actions = df_all["Action_class"].nunique()

print("Total annotations:", len(annots))
print("Number of unique actions:", num_actions)
print("Example annotation:", annots[0])


Total annotations: 545
Number of unique actions: 114
Example annotation: {'video_id': 'P01_01', 'start_frame': 8, 'action_class': 0, 'verb_class': 2, 'noun_class': 8}


In [38]:
model = train_unimodal_flow(
    annots,
    feature_root="path_to_flow_npys",  # update with actual path to your .npy flow features
    num_actions=num_actions,
    context=21,
    batch_size=8,
    epochs=50,
    stride_on_features=5
)


Epoch 1/50 loss=5.3778 acc=0.0257
Epoch 2/50 loss=5.0508 acc=0.0349
Epoch 3/50 loss=4.8978 acc=0.0587
Epoch 4/50 loss=4.6828 acc=0.0495
Epoch 5/50 loss=4.6240 acc=0.0697
Epoch 6/50 loss=4.4994 acc=0.0642
Epoch 7/50 loss=4.4028 acc=0.0514
Epoch 8/50 loss=4.4060 acc=0.0422
Epoch 9/50 loss=4.3206 acc=0.0642
Epoch 10/50 loss=4.2169 acc=0.0697
Epoch 11/50 loss=4.1517 acc=0.0697
Epoch 12/50 loss=4.1153 acc=0.0917
Epoch 13/50 loss=3.9846 acc=0.0936
Epoch 14/50 loss=4.1005 acc=0.0697
Epoch 15/50 loss=4.0295 acc=0.0807
Epoch 16/50 loss=3.9688 acc=0.0844
Epoch 17/50 loss=3.9384 acc=0.0972
Epoch 18/50 loss=3.8627 acc=0.1083
Epoch 19/50 loss=3.8250 acc=0.0936
Epoch 20/50 loss=3.8001 acc=0.0954
Epoch 21/50 loss=3.8128 acc=0.0936
Epoch 22/50 loss=3.7038 acc=0.1083
Epoch 23/50 loss=3.7826 acc=0.1138
Epoch 24/50 loss=3.6790 acc=0.1229
Epoch 25/50 loss=3.5986 acc=0.1266
Epoch 26/50 loss=3.5616 acc=0.1266
Epoch 27/50 loss=3.4346 acc=0.1505
Epoch 28/50 loss=3.5958 acc=0.1266
Epoch 29/50 loss=3.5895 acc=0

In [14]:
dataset = EpicFlowFeatureDataset(
    annots,
    feature_dir="path_to_flow_npys",  # update
    context=21,
    stride_on_features=5,
    label_key="action_class"
)

print("Number of samples:", len(dataset))
if len(dataset) > 0:
    X, y = dataset[0]
    print("One sample shape:", X.shape, "Label:", y)


Number of samples: 545
One sample shape: torch.Size([21, 1024]) Label: tensor(0)


In [42]:
# path setup
test_csv = r"C:\Users\Alis\Desktop\Minor Project\Label\P01_05.csv"
test_feat = r"C:\Users\Alis\Desktop\Minor Project\path_to_flow_npys"

# load test annotations
import pandas as pd

test_annots = []
video_id = "P01_05"
df = pd.read_csv(test_csv)

for _, row in df.iterrows():
    test_annots.append({
        "video_id": video_id,
        "start_frame": int(row["StartFrame"]),
        "end_frame": int(row["EndFrame"]),
        "verb": row["Verb"],
        "verb_class": int(row["Verb_class"]),
        "noun": row["Noun"],
        "noun_class": int(row["Noun_class"]),
        "action_name": row["ActionName"],
        "action_class": int(row["Action_class"]),
    })

print(f"Loaded {len(test_annots)} test annotations")
print("Example:", test_annots[0])


Loaded 259 test annotations
Example: {'video_id': 'P01_05', 'start_frame': 248, 'end_frame': 355, 'verb': 'open', 'verb_class': 2, 'noun': 'fridge', 'noun_class': 10, 'action_name': 'open fridge', 'action_class': 0}


In [43]:
# Build dataset
test_dataset = EpicFlowFeatureDataset(
    test_annots,
    feature_dir=test_feat,
    context=21,
    stride_on_features=5,
    label_key="action_class"
)

test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=0)


In [44]:
def evaluate_model(model, loader, device="cuda"):
    model.eval()
    total, correct = 0, 0
    preds_all, labels_all = [], []

    with torch.no_grad():
        for X, y in loader:
            X, y = X.to(device), y.to(device)
            out = model(X)

            # handle models that return tuple (logits, extra)
            if isinstance(out, tuple):
                out = out[0]

            _, preds = torch.max(out, 1)

            total += y.size(0)
            correct += (preds == y).sum().item()

            preds_all.extend(preds.cpu().numpy())
            labels_all.extend(y.cpu().numpy())

    acc = correct / total if total > 0 else 0
    print(f"Test Accuracy: {acc:.4f}")
    return preds_all, labels_all


In [45]:
preds, labels = evaluate_model(model, test_loader, device="cuda")


Test Accuracy: 0.0039


In [46]:
# save only model weights (recommended)
torch.save(model.state_dict(), "tcn_flow_model.pth")

# OR save full model (not recommended if you plan to change class definition later)
torch.save(model, "tcn_flow_model_full.pth")

print("✅ Model saved successfully!")


✅ Model saved successfully!
