In [6]:
import pandas as pd
import os

def load_annotations(label_dir, selected_videos=None):
    """
    Reads all CSV annotation files and returns a list of dicts.
    """
    annots = []
    for file in os.listdir(label_dir):
        if not file.endswith(".csv"):
            continue
        video_id = os.path.splitext(file)[0]  # e.g. "P01_01"
        if selected_videos and video_id not in selected_videos:
            continue
        
        df = pd.read_csv(os.path.join(label_dir, file))
        
        for _, row in df.iterrows():
            annots.append({
                "video_id": video_id,
                "start_frame": int(row["StartFrame"]),
                "stop_frame": int(row["EndFrame"]),
                "verb": row["Verb"],
                "verb_class": int(row["Verb_class"]),
                "noun": row["Noun"],
                "noun_class": int(row["Noun_class"]),
                "action_class": int(row["Action_class"]),
                "action_name": row["ActionName"]
            })
    return annots

# Load annotations for 4 videos only
annots = load_annotations("Label", selected_videos=["P01_01", "P01_02", "P01_03", "P01_04"])


In [7]:
len(annots)

545

In [22]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

class EpicFeatureDataset(Dataset):
    def __init__(self, annots, frame_root, context=10):
        """
        annots: list of annotation dicts
        frame_root: path to npy files
        context: number of feature frames to use
        """
        self.annots = annots
        self.frame_root = frame_root
        self.context = context
        self.cache = {}   # cache video feats
        self.strides = {} # cache detected stride per video

    def detect_stride(self, video_id, video_feats):
        """Estimate stride by comparing annotation max_frame to feats length"""
        if video_id in self.strides:
            return self.strides[video_id]

        # max stop_frame in annotations for this video
        max_frame = max(a["stop_frame"] for a in self.annots if a["video_id"] == video_id)
        T = len(video_feats)

        # avoid divide by zero
        stride = max(1, max_frame // T)
        self.strides[video_id] = stride
        return stride

    def __len__(self):
        return len(self.annots)

    def __getitem__(self, idx):
        ann = self.annots[idx]
        video_id = ann["video_id"]

        # load features if not cached
        if video_id not in self.cache:
            npy_path = os.path.join(self.frame_root, f"{video_id}_rgb.npy")
            self.cache[video_id] = np.load(npy_path)  # shape (T, 2048)

        video_feats = self.cache[video_id]

        # detect stride for this video
        stride = self.detect_stride(video_id, video_feats)

        # map raw frame index to feature index
        start_idx = ann["start_frame"] // stride
        start_idx = min(start_idx, len(video_feats))  

        # take context features
        feats = video_feats[max(0, start_idx - self.context): start_idx]

        # pad if fewer than context frames
        if feats.shape[0] < self.context:
            pad = np.zeros((self.context - feats.shape[0], video_feats.shape[1]))
            feats = np.vstack([pad, feats])

        feats = torch.tensor(feats).float()  # (context, 2048)
        label = torch.tensor(ann["action_class"]).long()
        return feats, label


In [17]:
from torch.utils.data import DataLoader

dataset = EpicRGBDataset(annots, frame_root="RGB_frames", context=20)
loader = DataLoader(dataset, batch_size=2, shuffle=True)


In [25]:
# Suppose annots already loaded with load_annotations(...)
dataset = EpicFeatureDataset(annots, frame_root="RGB_frames", context=10)
loader = DataLoader(dataset, batch_size=2, shuffle=True)

# Debug a batch
for feats, labels in loader:
    print(feats,labels)


tensor([[[0.3070, 0.3366, 0.9850,  ..., 0.2543, 0.6544, 0.5742],
         [0.3807, 0.4378, 1.1936,  ..., 0.2716, 0.5284, 0.4920],
         [0.7172, 0.4662, 0.9748,  ..., 0.3635, 0.7305, 0.8680],
         ...,
         [0.2555, 0.6061, 0.8533,  ..., 0.1067, 0.6137, 0.4617],
         [0.2177, 0.9076, 0.5549,  ..., 0.1583, 0.6288, 0.0674],
         [0.1681, 0.7172, 0.6609,  ..., 0.1636, 0.8605, 0.1679]],

        [[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]]]) tensor([28, 12])
tensor([[[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0

In [32]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
class SimpleTCN(nn.Module):
    def __init__(self, feat_dim=2048, num_classes=100, hidden=512):
        super().__init__()
        self.conv1 = nn.Conv1d(feat_dim, hidden, 3, padding=1)
        self.conv2 = nn.Conv1d(hidden, hidden // 2, 3, padding=1)
        self.fc = nn.Linear(hidden // 2, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):  # x: (B, T, D)
        x = x.transpose(1, 2)     # (B, D, T)
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = x.mean(-1)            # temporal average pooling
        return self.fc(x)


In [33]:
def get_num_classes(annots):
    classes = [a["action_class"] for a in annots]
    num_classes = max(classes) + 1   # +1 because classes are usually 0-indexed
    print(f"Detected {num_classes} unique classes")
    return num_classes


In [34]:
num_classes = get_num_classes(annots)

Detected 114 unique classes


In [48]:
def train_model(annots, frame_root, num_classes=114, context=10, batch_size=8, epochs=5, lr=1e-3):

    # Dataset + Loader
    # dataset = EpicFeatureDataset(annots, frame_root, context=context)
    # loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Model, Loss, Optimizer
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SimpleTCN(feat_dim=2048, num_classes=num_classes).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Training loop
    for epoch in range(epochs):
        total_loss = 0
        correct = 0
        total = 0

        for feats, labels in loader:
            feats, labels = feats.to(device), labels.to(device)

            preds = model(feats)
            loss = criterion(preds, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            correct += (preds.argmax(1) == labels).sum().item()
            total += labels.size(0)

        acc = correct / total
        print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/len(loader):.4f} - Acc: {acc:.4f}")

    return model

In [49]:
# annots = load_annotations("path/to/annots.json")   # however you're loading them
num_classes = 114   # adjust to your dataset
frame_root = "RGB_frames"

model = train_model(
    annots,
    frame_root=frame_root,
    num_classes=num_classes,
    context=10,
    batch_size=8,
    epochs=20,
    lr=1e-3
)


Epoch 1/20 - Loss: 4.6614 - Acc: 0.0514
Epoch 2/20 - Loss: 4.3092 - Acc: 0.0826
Epoch 3/20 - Loss: 4.1575 - Acc: 0.0862
Epoch 4/20 - Loss: 4.0373 - Acc: 0.0899
Epoch 5/20 - Loss: 3.9527 - Acc: 0.0881
Epoch 6/20 - Loss: 3.9118 - Acc: 0.0936
Epoch 7/20 - Loss: 3.8543 - Acc: 0.0991
Epoch 8/20 - Loss: 3.8029 - Acc: 0.1046
Epoch 9/20 - Loss: 3.7748 - Acc: 0.0972
Epoch 10/20 - Loss: 3.7327 - Acc: 0.1083
Epoch 11/20 - Loss: 3.7120 - Acc: 0.1138
Epoch 12/20 - Loss: 3.6870 - Acc: 0.1009
Epoch 13/20 - Loss: 3.6641 - Acc: 0.1046
Epoch 14/20 - Loss: 3.6447 - Acc: 0.1028
Epoch 15/20 - Loss: 3.6079 - Acc: 0.1119
Epoch 16/20 - Loss: 3.5916 - Acc: 0.1046
Epoch 17/20 - Loss: 3.5508 - Acc: 0.1138
Epoch 18/20 - Loss: 3.5306 - Acc: 0.1156
Epoch 19/20 - Loss: 3.5038 - Acc: 0.1211
Epoch 20/20 - Loss: 3.4774 - Acc: 0.0991


In [50]:
import pickle

# Save full model object (weights + architecture)
with open("tcn_model.pkl", "wb") as f:
    pickle.dump(model, f)

model.eval()


SimpleTCN(
  (conv1): Conv1d(2048, 512, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(512, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (fc): Linear(in_features=256, out_features=114, bias=True)
  (relu): ReLU()
)

In [51]:
class ResearchTCN(nn.Module):
    def __init__(self, feat_dim=2048, num_classes=114, hidden=512, num_layers=4, kernel_size=3, dropout=0.5):
        super().__init__()
        layers = []
        in_channels = feat_dim
        for i in range(num_layers):
            layers.append(nn.Conv1d(in_channels, hidden, kernel_size, padding=kernel_size//2))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            in_channels = hidden
        self.tcn = nn.Sequential(*layers)
        self.fc = nn.Linear(hidden, num_classes)

    def forward(self, x):  # x: (B, T, D)
        x = x.transpose(1, 2)   # (B, D, T)
        x = self.tcn(x)         # (B, hidden, T)
        x = x.mean(-1)          # global average pool over time
        return self.fc(x)


In [52]:
def Train_model(annots, frame_root, num_classes=114, context=10, batch_size=32, epochs=20, lr=1e-4, weight_decay=1e-5):
    # dataset = EpicFeatureDataset(annots, frame_root, context=context)
    # loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = ResearchTCN(feat_dim=2048, num_classes=num_classes).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    for epoch in range(epochs):
        total_loss, correct, total = 0, 0, 0
        for feats, labels in loader:
            feats, labels = feats.to(device), labels.to(device)

            preds = model(feats)
            loss = criterion(preds, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            correct += (preds.argmax(1) == labels).sum().item()
            total += labels.size(0)

        acc = correct / total
        print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/len(loader):.4f} - Acc: {acc:.4f}")

    return model


In [53]:
# Detect number of classes from annotations
num_classes = get_num_classes(annots)  # should be 114

# Train the model
model = Train_model(
    annots,
    frame_root="RGB_frames",
    num_classes=num_classes,
    context=10,        # or adjust to your desired temporal context
    batch_size=32,
    epochs=30,         # you can try 20, 50, or more
    lr=1e-4,
    weight_decay=1e-5
)


Detected 114 unique classes
Epoch 1/30 - Loss: 4.6587 - Acc: 0.0367
Epoch 2/30 - Loss: 4.4754 - Acc: 0.0569
Epoch 3/30 - Loss: 4.3413 - Acc: 0.0679
Epoch 4/30 - Loss: 4.2467 - Acc: 0.0624
Epoch 5/30 - Loss: 4.1773 - Acc: 0.0587
Epoch 6/30 - Loss: 4.1592 - Acc: 0.0844
Epoch 7/30 - Loss: 4.0494 - Acc: 0.0936
Epoch 8/30 - Loss: 3.9921 - Acc: 0.0936
Epoch 9/30 - Loss: 3.9611 - Acc: 0.0954
Epoch 10/30 - Loss: 3.8725 - Acc: 0.0954
Epoch 11/30 - Loss: 3.8216 - Acc: 0.1028
Epoch 12/30 - Loss: 3.7667 - Acc: 0.0936
Epoch 13/30 - Loss: 3.6778 - Acc: 0.1119
Epoch 14/30 - Loss: 3.6398 - Acc: 0.1028
Epoch 15/30 - Loss: 3.6113 - Acc: 0.1009
Epoch 16/30 - Loss: 3.5836 - Acc: 0.1138
Epoch 17/30 - Loss: 3.5324 - Acc: 0.1101
Epoch 18/30 - Loss: 3.5294 - Acc: 0.1083
Epoch 19/30 - Loss: 3.4610 - Acc: 0.1083
Epoch 20/30 - Loss: 3.4224 - Acc: 0.1119
Epoch 21/30 - Loss: 3.4043 - Acc: 0.1266
Epoch 22/30 - Loss: 3.3780 - Acc: 0.1193
Epoch 23/30 - Loss: 3.3707 - Acc: 0.1229
Epoch 24/30 - Loss: 3.3128 - Acc: 0.11

In [54]:
import pickle

# Save full model object (weights + architecture)
with open("Researchtcn_model.pkl", "wb") as f:
    pickle.dump(model, f)

model.eval()


ResearchTCN(
  (tcn): Sequential(
    (0): Conv1d(2048, 512, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,))
    (4): ReLU()
    (5): Dropout(p=0.5, inplace=False)
    (6): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,))
    (7): ReLU()
    (8): Dropout(p=0.5, inplace=False)
    (9): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,))
    (10): ReLU()
    (11): Dropout(p=0.5, inplace=False)
  )
  (fc): Linear(in_features=512, out_features=114, bias=True)
)