In [None]:
!pip install torch_geometric
!pip install multilingual-clip torch

In [None]:
import torch
import torch.nn as nn
from torch_geometric.data import Data
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from transformers import BertTokenizer, BertModel
import pandas as pd
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
from nltk.translate.bleu_score import sentence_bleu

In [None]:

import pickle

# Locations of the pre-extracted feature pickles, one per split.
val = "/kaggle/input/msdv-feat-16/val_16_feat.pkl"
train  = "/kaggle/input/msdv-feat-16/train_16_feat.pkl"
test = "/kaggle/input/msdv-feat-16/test_16_feat.pkl"


def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Deserialize the splits in the same order as before: val, train, test.
val_graph_features = _read_pickle(val)
train_graph_features = _read_pickle(train)
test_graph_features = _read_pickle(test)

In [None]:
# Reload `val_graph_features` from a pickle in the Kaggle working directory.
# NOTE(review): this rebinds the same name the previous cell filled from
# val_16_feat.pkl, so whichever of the two cells ran last decides what later
# cells see -- confirm which file is intended, or load into a distinct name.
# NOTE(review): a later cell in this notebook writes to this same path, so the
# file may not exist yet on a fresh kernel.
val = "/kaggle/working/val_am_graph_embeddings.pkl"
with open(val, "rb") as f:
    val_graph_features = pickle.load(f)

In [None]:
val_graph_features['bQJQGoJF7_k_162_169']

---
---

## BERT embedding

---
---

# **Video-Question representation** 
### Video features conditioned on the question

In [None]:
import pickle
import torch
import torch.nn.functional as F
import pandas as pd
from transformers import BertTokenizer, BertModel
from torch_geometric.data import Data
from torch_geometric.nn import MessagePassing
from tqdm import tqdm
import os

# Load question-answer CSV
qa_df = pd.read_csv('/kaggle/input/final-am-qa/val_am_updated_file.csv')

# Load feature file
with open('/kaggle/input/msdv-feat-16/val_16_feat.pkl', 'rb') as f:
    video_data = pickle.load(f)

# Tokenizer and language model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Define STGNN layer with attention mechanism
class AttentionSTGNNLayer(MessagePassing):
    """Message-passing layer whose per-edge messages are gated by a sigmoid score.

    For every edge the score is the dot product of ``query(x_i)`` and
    ``key(x_j)``; the message is ``value(x_j)`` scaled by the sigmoid of that
    score. Messages are summed per node (``aggr='add'``) and passed through a
    linear layer followed by ReLU.

    Args:
        in_channels: width of the incoming node features.
        hidden_channels: width of the key/query/value projections.
        out_channels: width of the node features returned by the layer.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__(aggr='add')
        # Sub-modules are created in the order key, query, value, out.
        self.key = torch.nn.Linear(in_channels, hidden_channels)
        self.query = torch.nn.Linear(in_channels, hidden_channels)
        self.value = torch.nn.Linear(in_channels, hidden_channels)
        self.out = torch.nn.Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Run one round of message passing over ``edge_index``."""
        propagated = self.propagate(edge_index, x=x)
        return propagated

    def message(self, x_i, x_j):
        """Return the gated value vector for each edge."""
        # Parameter names x_i / x_j are kept: propagate() fills them by name.
        queries = self.query(x_i)
        keys = self.key(x_j)
        gate = torch.sigmoid(torch.sum(queries * keys, dim=-1, keepdim=True))
        return gate * self.value(x_j)

    def update(self, aggr_out):
        """Project the aggregated messages and apply ReLU."""
        projected = self.out(aggr_out)
        return F.relu(projected)

# Initialize layers
# These layers are instantiated and used directly in the loop below, with no
# training step in this cell, so their random initial weights define the
# embedding. Seed the RNG so that every run -- including a run that resumes
# from the cache file -- uses the same weights.
# NOTE: a cache written before this seed was introduced should be deleted,
# otherwise old and new embeddings come from different random projections.
torch.manual_seed(0)
stgnn1 = AttentionSTGNNLayer(768, 512, 512)
stgnn2 = AttentionSTGNNLayer(512, 256, 256)

# Prepare projection layers after inferring input dimensions
sample_video = next(iter(video_data.values()))
# as_tensor accepts both tensors and array-likes, replacing the manual isinstance switch.
sample_tensor = torch.as_tensor(sample_video['object_features'][0])
feat_dim = sample_tensor.size(1)
proj_obj = torch.nn.Linear(feat_dim, 768)
proj_q = torch.nn.Linear(768, 768)

# Load previously saved features if available
output_path = '/kaggle/working/val_am_graph_embeddings.pkl'
if os.path.exists(output_path):
    with open(output_path, 'rb') as f:
        features_dict = pickle.load(f)
else:
    features_dict = {}

# Build one question-conditioned graph embedding per video and cache it.
# NOTE(review): the cache is keyed by `video_id` alone, so for a video with
# several questions only the first usable question shapes the stored
# embedding -- confirm this is intended.
for idx, row in tqdm(qa_df.iterrows(), total=len(qa_df), desc="Processing videos"):
    video_id = row['video_id']
    question = row['question']

    # Skip rows already cached, rows without a usable question, and videos
    # that have no extracted features.
    if video_id in features_dict:
        continue
    if not isinstance(question, str) or not question.strip():
        continue
    if video_id not in video_data:
        continue

    object_features = video_data[video_id]['object_features']

    # Everything below is inference only. Running it under no_grad keeps the
    # cached embedding from holding on to an autograd graph (the projection
    # and STGNN parameters require grad), which previously accumulated in
    # `features_dict` for every processed video.
    with torch.no_grad():
        inputs = tokenizer(question, return_tensors='pt')
        question_feat = bert_model(**inputs).last_hidden_state[:, 0, :]  # first-token vector

        if question_feat.size(1) == 0:
            continue

        q_feat_proj = proj_q(question_feat)

        nodes = []
        edge_index = []
        node_offset = 0          # global index of the first node of the current frame
        frame_node_counts = []   # number of nodes contributed by each frame so far

        for t, frame_feats in enumerate(object_features):
            frame_feats = torch.tensor(frame_feats) if not isinstance(frame_feats, torch.Tensor) else frame_feats
            if frame_feats.size(0) == 0:
                frame_node_counts.append(0)
                continue

            num_objs = frame_feats.size(0)
            # Node feature = projected object feature + projected question feature.
            obj_proj = proj_obj(frame_feats)
            q_proj = q_feat_proj.repeat(num_objs, 1)
            nodes.append(obj_proj + q_proj)

            # Spatial edges: every pair of objects in this frame, both directions.
            indices = torch.arange(num_objs) + node_offset
            spatial = torch.combinations(indices, r=2).T
            edge_index.append(torch.cat([spatial, spatial[[1, 0]]], dim=1))

            # Temporal edges: every object of the directly preceding frame
            # (only when that frame had objects) -> every object of this frame.
            if len(frame_node_counts) > 0 and frame_node_counts[-1] > 0:
                prev_num = frame_node_counts[-1]
                curr_idx = torch.arange(num_objs) + node_offset
                prev_idx = torch.arange(prev_num) + node_offset - prev_num
                temporal = torch.cartesian_prod(prev_idx, curr_idx).T
                edge_index.append(temporal)

            frame_node_counts.append(num_objs)
            node_offset += num_objs

        if not nodes:
            continue

        x = torch.cat(nodes, dim=0)
        edge_index = torch.cat(edge_index, dim=1)
        data = Data(x=x, edge_index=edge_index)

        x = stgnn1(data.x, data.edge_index)
        x = stgnn2(x, data.edge_index)
        graph_embedding = x.mean(dim=0)

    features_dict[video_id] = graph_embedding.cpu()

    # Save after each video so an interrupted session can resume. Write to a
    # temporary file first and swap it in, so an interruption during the
    # write cannot leave a truncated cache behind.
    tmp_path = output_path + '.tmp'
    with open(tmp_path, 'wb') as f:
        pickle.dump(features_dict, f)
    os.replace(tmp_path, output_path)

print("Saved graph-level features for all videos.")

# ***Training***

In [None]:
import os
import pickle
import torch
import torch.nn.functional as F
import pandas as pd
from multilingual_clip import pt_multilingual_clip
import transformers
from torch_geometric.data import Data
from torch_geometric.nn import MessagePassing
from tqdm import tqdm

# Load question-answer CSV
# The previous line passed the pickled embeddings file
# ('/kaggle/input/embedding/test_graph_embeddings.pkl') to pd.read_csv; a
# pickle is not a CSV and cannot provide the 'video_id' / 'question' columns
# that the loop below reads.
# TODO(review): the path below is inferred by analogy with the validation file
# read earlier in this notebook ('val_am_updated_file.csv') -- confirm the
# actual name of the test-split question file.
qa_df = pd.read_csv('/kaggle/input/final-am-qa/test_am_updated_file.csv')

# Load feature file
with open('/kaggle/input/msdv-feat-16/test_16_feat.pkl', 'rb') as f:
    video_data = pickle.load(f)

# Load M-CLIP text encoder
model_name = 'M-CLIP/XLM-Roberta-Large-Vit-B-32'
clip_text_model = pt_multilingual_clip.MultilingualCLIP.from_pretrained(model_name)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

# Define STGNN layer with attention mechanism
class AttentionSTGNNLayer(MessagePassing):
    """Message-passing layer whose per-edge messages are gated by a sigmoid score.

    NOTE(review): a class with the same name and body is defined in an earlier
    cell of this notebook; running this cell rebinds the name. Consider keeping
    a single definition.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        # Messages arriving at a node are summed.
        super().__init__(aggr='add')
        self.key = torch.nn.Linear(in_channels, hidden_channels)
        self.query = torch.nn.Linear(in_channels, hidden_channels)
        self.value = torch.nn.Linear(in_channels, hidden_channels)
        self.out = torch.nn.Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        # Delegates to MessagePassing.propagate, which calls message() and update().
        return self.propagate(edge_index, x=x)

    def message(self, x_i, x_j):
        # Per-edge score: dot product of query(x_i) and key(x_j), kept as a column.
        attn_score = (self.query(x_i) * self.key(x_j)).sum(dim=-1, keepdim=True)
        # Sigmoid gate: each edge is scored independently (no softmax over neighbours).
        attn_weight = torch.sigmoid(attn_score)
        return attn_weight * self.value(x_j)

    def update(self, aggr_out):
        # Linear projection of the summed messages followed by ReLU.
        return F.relu(self.out(aggr_out))

# Initialize layers
# These layers are instantiated and used directly in the loop below, with no
# training step in this cell, so their random initial weights define the
# embedding. Seed the RNG so that every run -- including a run that resumes
# from the cache file -- uses the same weights.
# NOTE: a cache written before this seed was introduced should be deleted,
# otherwise old and new embeddings come from different random projections.
torch.manual_seed(0)
stgnn1 = AttentionSTGNNLayer(512, 512, 512)
stgnn2 = AttentionSTGNNLayer(512, 256, 256)

# Prepare projection layer based on object feature dimension
sample_video = next(iter(video_data.values()))
# as_tensor accepts both tensors and array-likes, replacing the manual isinstance switch.
sample_tensor = torch.as_tensor(sample_video['object_features'][0])
feat_dim = sample_tensor.size(1)
proj_obj = torch.nn.Linear(feat_dim, 512)

# Load previously saved features if available
output_path = '/kaggle/working/test_clip_graph_embeddings_am.pkl'
if os.path.exists(output_path):
    with open(output_path, 'rb') as f:
        features_dict = pickle.load(f)
else:
    features_dict = {}

# Build one question-conditioned graph embedding per video and cache it.
# NOTE(review): the cache is keyed by `video_id` alone, so for a video with
# several questions only the first usable question shapes the stored
# embedding -- confirm this is intended.
for idx, row in tqdm(qa_df.iterrows(), total=len(qa_df), desc="Processing videos"):
    video_id = row['video_id']
    question = row['question']

    # Skip rows already cached, rows without a usable question, and videos
    # that have no extracted features.
    if video_id in features_dict:
        continue
    if not isinstance(question, str) or not question.strip():
        continue
    if video_id not in video_data:
        continue

    object_features = video_data[video_id]['object_features']

    # Everything below is inference only. Running it under no_grad keeps the
    # cached embedding from holding on to an autograd graph (the projection
    # and STGNN parameters require grad), which previously accumulated in
    # `features_dict` for every processed video.
    with torch.no_grad():
        question_feat = clip_text_model.forward([question], tokenizer)  # shape: [1, 512]

        if question_feat.size(1) == 0:
            continue

        q_feat_proj = question_feat  # already 512-dim

        nodes = []
        edge_index = []
        node_offset = 0          # global index of the first node of the current frame
        frame_node_counts = []   # number of nodes contributed by each frame so far

        for t, frame_feats in enumerate(object_features):
            frame_feats = torch.tensor(frame_feats) if not isinstance(frame_feats, torch.Tensor) else frame_feats
            if frame_feats.size(0) == 0:
                frame_node_counts.append(0)
                continue

            num_objs = frame_feats.size(0)
            # Node feature = projected object feature + question feature.
            obj_proj = proj_obj(frame_feats)
            q_proj = q_feat_proj.repeat(num_objs, 1)
            nodes.append(obj_proj + q_proj)

            # Spatial edges: every pair of objects in this frame, both directions.
            indices = torch.arange(num_objs) + node_offset
            spatial = torch.combinations(indices, r=2).T
            edge_index.append(torch.cat([spatial, spatial[[1, 0]]], dim=1))

            # Temporal edges: every object of the directly preceding frame
            # (only when that frame had objects) -> every object of this frame.
            if len(frame_node_counts) > 0 and frame_node_counts[-1] > 0:
                prev_num = frame_node_counts[-1]
                curr_idx = torch.arange(num_objs) + node_offset
                prev_idx = torch.arange(prev_num) + node_offset - prev_num
                temporal = torch.cartesian_prod(prev_idx, curr_idx).T
                edge_index.append(temporal)

            frame_node_counts.append(num_objs)
            node_offset += num_objs

        if not nodes:
            continue

        x = torch.cat(nodes, dim=0)
        edge_index = torch.cat(edge_index, dim=1)
        data = Data(x=x, edge_index=edge_index)

        x = stgnn1(data.x, data.edge_index)
        x = stgnn2(x, data.edge_index)
        graph_embedding = x.mean(dim=0)

    features_dict[video_id] = graph_embedding.cpu()

    # Save after each video so an interrupted session can resume. Write to a
    # temporary file first and swap it in, so an interruption during the
    # write cannot leave a truncated cache behind.
    tmp_path = output_path + '.tmp'
    with open(tmp_path, 'wb') as f:
        pickle.dump(features_dict, f)
    os.replace(tmp_path, output_path)

print("Saved graph-level features for all videos.")

In [None]:
import pickle
import torch
import torch.nn.functional as F
import pandas as pd
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --------------------
# Load Data
# --------------------
# Questions for all three splits are required: the frames built further down
# select val/test rows from `qa_df` by video id. Loading only the train CSV
# leaves those frames empty unless it also lists val/test videos, in which
# case evaluate() reports 0.00% for every epoch.
qa_df = pd.concat(
    [
        pd.read_csv(f'/kaggle/input/msdv-qa-csv/{split}_q_a_f.csv').rename(columns=str.strip)
        for split in ('train', 'val', 'test')
    ],
    ignore_index=True,
)

with open('/kaggle/input/msdv-feat-16/train_16_feat.pkl', 'rb') as f:
    train_feature_data = pickle.load(f)
with open('/kaggle/input/msdv-feat-16/val_16_feat.pkl', 'rb') as f:
    val_feature_data = pickle.load(f)
with open('/kaggle/input/msdv-feat-16/test_16_feat.pkl', 'rb') as f:
    test_feature_data = pickle.load(f)

# NOTE(review): these names are also used elsewhere in the notebook for the
# raw per-video feature dicts; after this cell they hold graph embeddings.
with open('/kaggle/input/embedding/train_graph_embeddings.pkl', 'rb') as f:
    train_graph_features = pickle.load(f)
with open('/kaggle/input/embedding/val_graph_embeddings.pkl', 'rb') as f:
    val_graph_features = pickle.load(f)
with open('/kaggle/input/embedding/test_graph_embeddings.pkl', 'rb') as f:
    test_graph_features = pickle.load(f)

# --------------------
# Video Splits
# --------------------
# One id set per split, taken from the keys of the per-split feature dicts.
train_ids, val_ids, test_ids = (
    set(split_feats.keys())
    for split_feats in (train_feature_data, val_feature_data, test_feature_data)
)

# --------------------
# Text Encoder (BERT)
# --------------------
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# .to() and .eval() both return the module, so the calls can be chained.
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device).eval()

# --------------------
# Label Encode Answers
# --------------------
# Keep rows whose video has features in some split and whose question and
# answer are present.
keep_rows = (
    qa_df['video_id'].isin(train_ids | val_ids | test_ids)
    & qa_df['question'].notnull()
    & qa_df['answer'].notnull()
)
# .copy() gives an independent frame, so adding the 'label' column below does
# not assign into a slice of the original (SettingWithCopyWarning).
qa_df = qa_df.loc[keep_rows].copy()
label_encoder = LabelEncoder()
qa_df['label'] = label_encoder.fit_transform(qa_df['answer'])

# ----------------------------
# Positional Encoding Function
# ----------------------------
def get_positional_encoding(length, dim):
    """Return a sinusoidal positional-encoding table on the module-level ``device``.

    Even columns hold ``sin(position * div_term)`` and odd columns hold the
    matching ``cos`` values.

    Args:
        length: number of positions (rows).
        dim: encoding width (columns). Odd widths are supported: the last sine
            column then has no cosine partner.

    Returns:
        Float tensor of shape ``(length, dim)``.
    """
    position = torch.arange(0, length, dtype=torch.float, device=device).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, dim, 2, device=device).float() * (-torch.log(torch.tensor(10000.0, device=device)) / dim))
    pe = torch.zeros(length, dim, device=device)
    angles = position * div_term  # computed once, shared by sin and cos
    pe[:, 0::2] = torch.sin(angles)
    # For odd `dim` there is one fewer odd column than even columns; the slice
    # is a no-op when `dim` is even.
    pe[:, 1::2] = torch.cos(angles[:, : dim // 2])
    return pe

# ----------------------------
# Dataset Definition
# ----------------------------
class QAGraphDataset(Dataset):
    """Dataset yielding one fused (graph, frame, question) sequence per QA row.

    Args:
        df: frame with 'video_id', 'question' and 'label' columns.
        graph_feats: mapping video_id -> graph embedding tensor.
        full_feats: mapping video_id -> dict that contains 'frame_features'.

    Each item is ``(fusion, label)``. ``fusion`` has one row per question token
    and is the concatenation of the graph embedding, the mean frame feature
    and the BERT token feature (with a sinusoidal positional encoding added).
    Rows whose video is missing from either mapping yield a zero placeholder
    with label -1, which ``collate_fn`` is expected to drop.

    NOTE: relies on the module-level ``device``, ``tokenizer``, ``bert_model``
    and ``get_positional_encoding``.
    """

    def __init__(self, df, graph_feats, full_feats):
        self.df = df.reset_index(drop=True)
        self.graph_feats = graph_feats
        self.full_feats = full_feats

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        vid = row['video_id']
        question = row['question']
        label = row['label']

        # Placeholder for videos without features; 1536 = 256 + 512 + 768,
        # the fused width the classifier is built with.
        if vid not in self.graph_feats or vid not in self.full_feats:
            return torch.zeros(1, 1536, device=device), torch.tensor(-1, device=device)

        with torch.no_grad():
            inputs = tokenizer(question, return_tensors='pt', padding=True, truncation=True)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            q_feat = bert_model(**inputs).last_hidden_state.squeeze(0)  # [L, 768]

        seq_len = q_feat.size(0)
        q_feat = q_feat + get_positional_encoding(seq_len, q_feat.size(1))

        # detach(): a cached embedding may still be flagged as requiring grad.
        g_feat = self.graph_feats[vid].detach().to(device).unsqueeze(0).repeat(seq_len, 1)  # [L, 256]

        frame_feats = self.full_feats[vid]['frame_features']
        if isinstance(frame_feats, list):
            frame_feats = torch.cat([torch.as_tensor(f, device=device) for f in frame_feats], dim=0)
        # Always place the features on `device`. Previously the move happened
        # only in the `else` branch of the 3-D check, so a 3-D tensor input
        # stayed on its original device; non-tensor array input was unsupported.
        frame_feats = torch.as_tensor(frame_feats, device=device)
        if frame_feats.dim() == 3:
            frame_feats = frame_feats.squeeze(1)
        f_feat = frame_feats.mean(dim=0).unsqueeze(0).repeat(seq_len, 1)

        fusion = torch.cat([g_feat, f_feat, q_feat], dim=-1)  # [L, 1536]
        return fusion, torch.tensor(label, device=device)

# ----------------------------
# Collate Function
# ----------------------------
def collate_fn(batch):
    """Drop placeholder samples (label < 0) and zero-pad the rest to equal length.

    Returns ``(padded, labels)`` with ``padded`` batch-first. If no sample
    survives, returns a zero tensor of shape (1, 1, 1536) with label ``[-1]``
    on the module-level ``device``.
    """
    kept = [sample for sample in batch if sample[1].item() >= 0]
    if len(kept) == 0:
        return torch.zeros(1, 1, 1536, device=device), torch.tensor([-1], device=device)
    sequences = [features for features, _ in kept]
    targets = [target for _, target in kept]
    return pad_sequence(sequences, batch_first=True), torch.stack(targets)

# ----------------------------
# Dataset Split
# ----------------------------
def _qa_loader(split_df, graph_feats, full_feats, shuffle):
    """Wrap one split in a QAGraphDataset and a batch-size-32 DataLoader."""
    dataset = QAGraphDataset(split_df, graph_feats, full_feats)
    return DataLoader(dataset, batch_size=32, shuffle=shuffle, collate_fn=collate_fn)


# Rows are assigned to a split by the split their video's features belong to.
train_df = qa_df.loc[qa_df['video_id'].isin(train_ids)]
val_df = qa_df.loc[qa_df['video_id'].isin(val_ids)]
test_df = qa_df.loc[qa_df['video_id'].isin(test_ids)]

# Only the training loader shuffles.
train_loader = _qa_loader(train_df, train_graph_features, train_feature_data, shuffle=True)
val_loader = _qa_loader(val_df, val_graph_features, val_feature_data, shuffle=False)
test_loader = _qa_loader(test_df, test_graph_features, test_feature_data, shuffle=False)

# ----------------------------
# Model: Attention Classifier
# ----------------------------
class AttentionClassifier(torch.nn.Module):
    """Self-attention over a fused sequence, mean pooling, then an MLP classifier.

    Args:
        input_dim: feature width of every sequence position; also the
            attention ``embed_dim`` (must be divisible by the 4 heads).
        hidden_dim: width of the hidden MLP layer.
        num_classes: number of output logits.
    """

    def __init__(self, input_dim, hidden_dim, num_classes):
        super().__init__()
        self.attn = torch.nn.MultiheadAttention(embed_dim=input_dim, num_heads=4, batch_first=True)
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(input_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, x, key_padding_mask=None):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            x: batch-first input of shape ``(batch, seq, input_dim)``.
            key_padding_mask: optional bool tensor ``(batch, seq)``, True at
                padded positions. When given, padded positions are ignored as
                attention keys and excluded from the mean pool. When omitted
                the behaviour is unchanged: every position, including zero
                padding added by batching, takes part in both.
        """
        attn_output, _ = self.attn(x, x, x, key_padding_mask=key_padding_mask)
        if key_padding_mask is None:
            pooled = attn_output.mean(dim=1)
        else:
            # Zero the padded rows, then average over the real positions only.
            # clamp() avoids dividing by zero for a fully padded sequence.
            valid_counts = (~key_padding_mask).sum(dim=1, keepdim=True).clamp(min=1)
            masked = attn_output.masked_fill(key_padding_mask.unsqueeze(-1), 0.0)
            pooled = masked.sum(dim=1) / valid_counts
        return self.mlp(pooled)

# Fused input width = graph embedding (256) + frame feature (512) + BERT token feature (768).
model = AttentionClassifier(256 + 512 + 768, 512, len(label_encoder.classes_))
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = torch.nn.CrossEntropyLoss()

start_epoch = 1
checkpoint_path = "/kaggle/working/clip_checkpoint.pth"

# Resume from the last saved epoch when a checkpoint exists.
if os.path.exists(checkpoint_path):
    # map_location lets a checkpoint saved on GPU load in a CPU-only session
    # (and vice versa) instead of failing while deserializing.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch'] + 1
    print(f"Resuming training from epoch {start_epoch} with saved loss {checkpoint['loss']:.4f}")

def evaluate(model, loader):
    """Return the top-1 accuracy (in percent) of ``model`` over ``loader``.

    Batches whose first label is negative are skipped. Returns 0.0 when no
    sample was evaluated. The model is switched to eval mode and left there.
    """
    model.eval()
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for features, targets in loader:
            if targets[0] >= 0:
                predictions = model(features).argmax(dim=1)
                n_correct += (predictions == targets).sum().item()
                n_seen += features.size(0)
    if n_seen == 0:
        return 0.0
    return 100. * n_correct / n_seen

# Train up to and including epoch 30, checkpointing after every epoch.
for epoch in range(start_epoch, 31):
    model.train()  # evaluate() leaves the model in eval mode
    total_loss = 0
    correct = 0
    total = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch}")
    for x, y in loop:
        # collate_fn marks a batch with no usable sample by a negative first label.
        if y[0] < 0:
            continue
        optimizer.zero_grad()
        out = model(x)
        loss = criterion(out, y)
        loss.backward()
        optimizer.step()
        # Weight the batch-mean loss by batch size so the epoch figure is a per-sample mean.
        total_loss += loss.item() * x.size(0)
        correct += (out.argmax(dim=1) == y).sum().item()
        total += x.size(0)
        loop.set_postfix(loss=total_loss / total, acc=100. * correct / total)

    val_acc = evaluate(model, val_loader)

    # If every batch was skipped `total` stays 0; report NaN / 0 instead of
    # raising ZeroDivisionError before the checkpoint is written.
    avg_loss = total_loss / total if total > 0 else float('nan')
    train_acc = 100. * correct / total if total > 0 else 0.0
    print(f"Epoch {epoch} complete | Train Loss: {avg_loss:.4f} | Train Acc: {train_acc:.2f}% | Val Acc: {val_acc:.2f}%")

    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': avg_loss,
    }
    torch.save(checkpoint, checkpoint_path)


Epoch 1: 100%|██████████| 967/967 [05:18<00:00,  3.04it/s, acc=23.4, loss=4.58]


Epoch 1 complete | Train Loss: 4.5801 | Train Acc: 23.40% | Val Acc: 0.00%


Epoch 2: 100%|██████████| 967/967 [05:16<00:00,  3.06it/s, acc=33, loss=3.48]  


Epoch 2 complete | Train Loss: 3.4794 | Train Acc: 33.05% | Val Acc: 0.00%


Epoch 3: 100%|██████████| 967/967 [05:16<00:00,  3.06it/s, acc=39.5, loss=2.95]


Epoch 3 complete | Train Loss: 2.9499 | Train Acc: 39.48% | Val Acc: 0.00%


Epoch 4: 100%|██████████| 967/967 [05:15<00:00,  3.06it/s, acc=45, loss=2.54]  


Epoch 4 complete | Train Loss: 2.5427 | Train Acc: 44.97% | Val Acc: 0.00%


Epoch 5: 100%|██████████| 967/967 [05:14<00:00,  3.08it/s, acc=49.2, loss=2.22]


Epoch 5 complete | Train Loss: 2.2207 | Train Acc: 49.24% | Val Acc: 0.00%


Epoch 6: 100%|██████████| 967/967 [05:12<00:00,  3.09it/s, acc=53.5, loss=1.95]


Epoch 6 complete | Train Loss: 1.9505 | Train Acc: 53.54% | Val Acc: 0.00%


Epoch 7: 100%|██████████| 967/967 [05:14<00:00,  3.07it/s, acc=56.6, loss=1.75]


Epoch 7 complete | Train Loss: 1.7501 | Train Acc: 56.64% | Val Acc: 0.00%


Epoch 8: 100%|██████████| 967/967 [05:15<00:00,  3.07it/s, acc=59.4, loss=1.58]


Epoch 8 complete | Train Loss: 1.5820 | Train Acc: 59.36% | Val Acc: 0.00%


Epoch 9: 100%|██████████| 967/967 [05:13<00:00,  3.08it/s, acc=61.5, loss=1.44]


Epoch 9 complete | Train Loss: 1.4364 | Train Acc: 61.54% | Val Acc: 0.00%


Epoch 10: 100%|██████████| 967/967 [05:14<00:00,  3.07it/s, acc=63.4, loss=1.32]


Epoch 10 complete | Train Loss: 1.3197 | Train Acc: 63.38% | Val Acc: 0.00%


Epoch 11: 100%|██████████| 967/967 [05:13<00:00,  3.09it/s, acc=65.6, loss=1.21]


Epoch 11 complete | Train Loss: 1.2134 | Train Acc: 65.61% | Val Acc: 0.00%


Epoch 12: 100%|██████████| 967/967 [05:15<00:00,  3.07it/s, acc=67, loss=1.14]  


Epoch 12 complete | Train Loss: 1.1387 | Train Acc: 66.96% | Val Acc: 0.00%


Epoch 13: 100%|██████████| 967/967 [05:14<00:00,  3.07it/s, acc=68.6, loss=1.06] 


Epoch 13 complete | Train Loss: 1.0589 | Train Acc: 68.58% | Val Acc: 0.00%


Epoch 14: 100%|██████████| 967/967 [05:14<00:00,  3.08it/s, acc=69.7, loss=0.995]


Epoch 14 complete | Train Loss: 0.9948 | Train Acc: 69.67% | Val Acc: 0.00%


Epoch 15: 100%|██████████| 967/967 [05:13<00:00,  3.08it/s, acc=71.3, loss=0.939]


Epoch 15 complete | Train Loss: 0.9394 | Train Acc: 71.33% | Val Acc: 0.00%


Epoch 16: 100%|██████████| 967/967 [05:13<00:00,  3.08it/s, acc=72, loss=0.903]  


Epoch 16 complete | Train Loss: 0.9027 | Train Acc: 71.99% | Val Acc: 0.00%


Epoch 17: 100%|██████████| 967/967 [05:14<00:00,  3.08it/s, acc=73.1, loss=0.857]


Epoch 17 complete | Train Loss: 0.8573 | Train Acc: 73.07% | Val Acc: 0.00%


Epoch 18: 100%|██████████| 967/967 [05:12<00:00,  3.10it/s, acc=74.3, loss=0.816]


Epoch 18 complete | Train Loss: 0.8162 | Train Acc: 74.25% | Val Acc: 0.00%


Epoch 19:  31%|███       | 298/967 [01:37<03:35,  3.11it/s, acc=76.4, loss=0.733]

In [None]:
!pip install magic-wormhole

In [None]:
!wormhole send /kaggle/working/clip_checkpoint.pth

# Object Tracking

In [None]:
import numpy as np

def compute_iou(box1, box2):
    """Calculate Intersection over Union (IoU) between two bounding boxes.

    Args:
        box1: box given as ``(x1, y1, x2, y2)``.
        box2: box given as ``(x1, y1, x2, y2)``.

    Returns:
        Intersection area divided by union area, or ``0.0`` when the union
        area is not positive (e.g. both boxes are degenerate), instead of
        dividing by zero.
    """
    x1, y1, x2, y2 = box1
    x1_p, y1_p, x2_p, y2_p = box2

    # Width/height of the overlap rectangle, clamped at 0 for disjoint boxes.
    inter_w = max(0, min(x2, x2_p) - max(x1, x1_p))
    inter_h = max(0, min(y2, y2_p) - max(y1, y1_p))
    inter_area = inter_w * inter_h

    box1_area = (x2 - x1) * (y2 - y1)
    box2_area = (x2_p - x1_p) * (y2_p - y1_p)

    union_area = float(box1_area + box2_area - inter_area)
    if union_area <= 0:
        return 0.0
    return inter_area / union_area

def track_objects_across_frames(video_features, iou_threshold=0.5):
    """Assign tracking IDs to objects appearing across frames.

    Each object is compared with the boxes of the directly preceding frame.
    Pairs whose IoU is strictly above ``iou_threshold`` are candidates;
    candidates are accepted in order of decreasing IoU, and each previous
    object can be claimed by at most one current object. (Previously several
    objects of one frame could inherit the same track id from a single
    previous object.) Unmatched objects receive a fresh id.

    Args:
        video_features: mapping with "object_features" and "bounding_boxes",
            both indexed by frame; boxes are ``(x1, y1, x2, y2)``.
        iou_threshold: minimum IoU (exclusive) for continuing a track.

    Returns:
        Dict mapping ``(frame_idx, obj_idx)`` to an integer track id (from 1).
    """
    tracking_info = {}
    next_object_id = 1
    prev_boxes = []  # boxes of the previous frame, indexed by object position

    frames = zip(video_features["object_features"], video_features["bounding_boxes"])
    for frame_idx, (_objects, boxes) in enumerate(frames):
        boxes = list(boxes)

        # Collect every (current, previous) pair that overlaps enough.
        candidates = []
        for obj_idx, box in enumerate(boxes):
            for prev_idx, prev_box in enumerate(prev_boxes):
                iou = float(compute_iou(box, prev_box))
                if iou > iou_threshold:
                    candidates.append((iou, obj_idx, prev_idx))

        # Best overlaps first; indices break ties deterministically.
        candidates.sort(key=lambda c: (-c[0], c[1], c[2]))

        matched_prev = {}      # current obj_idx -> previous obj_idx
        claimed_prev = set()   # previous objects already continued
        for _iou, obj_idx, prev_idx in candidates:
            if obj_idx in matched_prev or prev_idx in claimed_prev:
                continue
            matched_prev[obj_idx] = prev_idx
            claimed_prev.add(prev_idx)

        for obj_idx in range(len(boxes)):
            if obj_idx in matched_prev:
                track_id = tracking_info[(frame_idx - 1, matched_prev[obj_idx])]
            else:
                track_id = next_object_id
                next_object_id += 1
            tracking_info[(frame_idx, obj_idx)] = track_id

        prev_boxes = boxes

    return tracking_info
    
def process_all_videos(all_video_features):
    """Run object tracking on every video and collect the results.

    Args:
        all_video_features: mapping video_id -> per-video feature dict.

    Returns:
        Dict mapping video_id -> result of ``track_objects_across_frames``.
    """
    tracking_by_video = {}
    for video_id in all_video_features:
        print(f"Processing video: {video_id}")
        # Tracking is computed independently for each video.
        tracking_by_video[video_id] = track_objects_across_frames(all_video_features[video_id])
        print("added")
    return tracking_by_video
# Track objects for every split.
# NOTE(review): the tracker reads "object_features" and "bounding_boxes" from
# each value of these dicts. Other cells in this notebook rebind
# val/train/test_graph_features to pickled graph embeddings; if one of those
# cells ran last, these calls do not receive the raw feature dicts they need.
# Confirm the execution order or use distinct variable names.
val_video_tracking_info = process_all_videos(val_graph_features)
train_video_tracking_info = process_all_videos(train_graph_features)
test_video_tracking_info = process_all_videos(test_graph_features)

In [None]:
val_video_tracking_info['bQJQGoJF7_k_162_169']

# Graph Building

In [None]:
import numpy as np
from scipy.spatial.distance import cdist

def build_graph_features(video_features, tracking_info, spatial_threshold=100):
    """Build the object graph (adjacency matrix + node features) of one video.

    Nodes are all objects of all frames, numbered frame by frame. Two kinds of
    undirected edges are added:

    * spatial: objects of the same frame whose box centres are closer than
      ``spatial_threshold``;
    * temporal: objects in consecutive frames that share a track id.

    Args:
        video_features: mapping with "frame_features", "object_features" and
            "bounding_boxes", all indexed by frame; boxes are ``(x1, y1, x2, y2)``.
        tracking_info: mapping ``(frame_idx, obj_idx) -> track_id``.
        spatial_threshold: maximum centre distance (exclusive) for a spatial
            edge; defaults to the previously hard-coded value 100.

    Returns:
        ``(adjacency_matrix, feature_tensor)``: a ``(N, N)`` float array of
        0/1 entries and the array of the N stacked object features.
    """
    frame_features_list = video_features["frame_features"]
    object_features_list = video_features["object_features"]
    bounding_boxes = video_features["bounding_boxes"]

    num_frames = len(frame_features_list)
    num_objects = sum(len(objects) for objects in object_features_list)

    adjacency_matrix = np.zeros((num_objects, num_objects))

    # Flatten the objects and remember (frame_idx, obj_idx) -> global node index.
    object_index_map = {}
    feature_rows = []
    for frame_idx, objects in enumerate(object_features_list):
        for obj_idx, obj_feature in enumerate(objects):
            object_index_map[(frame_idx, obj_idx)] = len(feature_rows)
            feature_rows.append(obj_feature)
    feature_tensor = np.array(feature_rows)

    # ---- SPATIAL EDGES (within the same frame) ----
    for frame_idx in range(num_frames):
        boxes = bounding_boxes[frame_idx]
        # len() works for lists and arrays alike; `not boxes` is ambiguous
        # (raises) for a NumPy array holding more than one element.
        if boxes is None or len(boxes) == 0:
            continue  # Skip empty frames

        bbox_centers = np.array(
            [((box[0] + box[2]) / 2, (box[1] + box[3]) / 2) for box in boxes]
        )
        dist_matrix = cdist(bbox_centers, bbox_centers)

        for i in range(len(bbox_centers)):
            for j in range(i + 1, len(bbox_centers)):
                if dist_matrix[i, j] < spatial_threshold:
                    obj1 = object_index_map[(frame_idx, i)]
                    obj2 = object_index_map[(frame_idx, j)]
                    adjacency_matrix[obj1, obj2] = 1
                    adjacency_matrix[obj2, obj1] = 1

    # ---- TEMPORAL EDGES (linking objects across frames) ----
    # Index objects by (frame, track id) so each object finds its successors
    # directly, instead of comparing every pair of tracked objects.
    objects_by_frame_track = {}
    for (frame_idx, obj_idx), track_id in tracking_info.items():
        objects_by_frame_track.setdefault((frame_idx, track_id), []).append(obj_idx)

    for (prev_frame, prev_obj), track_id in tracking_info.items():
        for curr_obj in objects_by_frame_track.get((prev_frame + 1, track_id), ()):
            obj1 = object_index_map[(prev_frame, prev_obj)]
            obj2 = object_index_map[(prev_frame + 1, curr_obj)]
            adjacency_matrix[obj1, obj2] = 1
            adjacency_matrix[obj2, obj1] = 1

    return adjacency_matrix, feature_tensor


In [None]:
def process_all_videos(video_object_tracking, video_object_feature):
    """Build the object graph of every tracked video.

    NOTE: this rebinds the name of the one-argument ``process_all_videos``
    defined in the tracking cell of this notebook.

    Args:
        video_object_tracking: mapping video_id -> tracking info.
        video_object_feature: mapping video_id -> per-video feature dict.

    Returns:
        Dict mapping video_id -> {"adjacency_matrix": ..., "feature_matrix": ...}.
    """
    graphs_by_video = {}
    for video_id, tracking_info in video_object_tracking.items():
        adjacency, features = build_graph_features(video_object_feature[video_id], tracking_info)
        graphs_by_video[video_id] = dict(adjacency_matrix=adjacency, feature_matrix=features)
    return graphs_by_video
# Build the graphs of every split from its tracking info and raw features.
# NOTE(review): this needs the two-argument process_all_videos defined just
# above; the tracking cell defines a one-argument function of the same name,
# so re-running that cell afterwards makes these calls fail.
val_graph = process_all_videos(val_video_tracking_info, val_graph_features)
train_graph = process_all_videos(train_video_tracking_info, train_graph_features)
test_graph = process_all_videos(test_video_tracking_info, test_graph_features)

In [None]:
# all_video_graphs = process_all_videos(all_video_tracking_info, video_graph_features)

In [None]:
val_graph['bQJQGoJF7_k_162_169']['feature_matrix'].shape

In [None]:
val_graph['bQJQGoJF7_k_162_169']['adjacency_matrix'].shape

In [None]:
# `all_video_graphs` is never assigned (the cell that would create it is
# commented out); the graphs of this validation video live in `val_graph`.
val_graph['bQJQGoJF7_k_162_169']

# Graph feature extraction using an STGNN

In [None]:
# Load adjacency and feature matrices
video_graph = val_graph['bQJQGoJF7_k_162_169']
adj_matrix = video_graph['adjacency_matrix']
feat_matrix = video_graph['feature_matrix']

In [None]:
class VideoSTGNN(nn.Module):
    """Two stacked GCNConv layers whose node outputs are averaged into one vector.

    Args:
        input_dim: width of the node features.
        hidden_dim: output width of the first convolution.
        output_dim: output width of the second convolution and of the result.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Return the mean over nodes of the second convolution's output."""
        hidden = F.relu(self.conv1(data.x, data.edge_index))
        node_embeddings = self.conv2(hidden, data.edge_index)
        # Average over the node axis to get a single graph-level vector.
        return node_embeddings.mean(dim=0)


In [None]:
# Convert adjacency matrix to a (2, num_edges) edge list.
# torch's nonzero() returns the indices as one (num_edges, 2) long tensor in
# row-major order, so a transpose gives the same result as stacking the tuple
# returned by ndarray.nonzero() -- without building a tensor from a sequence
# of separate NumPy arrays.
edge_index = torch.as_tensor(adj_matrix).nonzero().t().contiguous()

# Convert feature matrix to PyTorch tensor
x = torch.tensor(feat_matrix, dtype=torch.float)

# Create PyG graph object
graph_data = Data(x=x, edge_index=edge_index)

In [None]:
input_dim = x.shape[1]  # Feature dimension from the graph
stgnn = VideoSTGNN(input_dim=input_dim, hidden_dim=128, output_dim=256)

# For all videos
### Now, let’s scale the pipeline to handle 100 videos, extracting spatio-temporal graph features (STGNN) for each video and processing corresponding questions.



In [None]:
import torch
from torch_geometric.data import Data
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class VideoSTGNN(nn.Module):
    """GCN encoder: conv -> ReLU -> conv, then a mean over all nodes.

    NOTE: a class with the same name is also defined in an earlier cell of
    this notebook; this definition rebinds it.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Encode ``data`` (needs ``.x`` and ``.edge_index``) into one vector."""
        edges = data.edge_index
        activated = F.relu(self.conv1(data.x, edges))
        per_node = self.conv2(activated, edges)
        return per_node.mean(dim=0)  # one representation for the whole video

# Model Definition
# NOTE(review): 1000 is hard-coded, while an earlier cell derives the width
# from the data (x.shape[1]); confirm every feature matrix is 1000 wide.
# NOTE(review): `stgnn` is used with its random initial weights -- no training
# step appears in this cell.
input_dim = 1000
stgnn = VideoSTGNN(input_dim=input_dim, hidden_dim=128, output_dim=256)


def _embed_graphs(graphs, model):
    """Run ``model`` on every usable graph of one split.

    Args:
        graphs: mapping video_id -> {"adjacency_matrix", "feature_matrix"}.
        model: module mapping a PyG ``Data`` object to a 1-D embedding.

    Returns:
        List of ``(video_id, feature)`` with ``feature`` of shape (1, out_dim).
        Videos without edges or without features are skipped.
    """
    embedded = []
    for video_id, video_data in graphs.items():
        adj_matrix = video_data['adjacency_matrix']
        feat_matrix = video_data['feature_matrix']

        # Skip videos with no edges or missing features
        if adj_matrix.sum() == 0 or feat_matrix.shape[0] == 0:
            print(f"Skipping video {video_id} due to missing edges or features.")
            continue

        # (2, num_edges) edge list; same ordering as ndarray.nonzero().
        edge_index = torch.as_tensor(adj_matrix).nonzero().t().contiguous()
        x = torch.tensor(feat_matrix, dtype=torch.float)

        # Ensure edge index doesn't exceed available nodes
        if edge_index.max() >= x.shape[0]:
            print(f"Skipping video {video_id} due to invalid edge index.")
            continue

        print(f"Processing Video: {video_id}")
        print(f"Feature matrix shape: {x.shape}")
        print(f"Edge index shape: {edge_index.shape}")

        graph_data = Data(x=x, edge_index=edge_index)

        # Inference only: no gradients are needed for feature extraction.
        with torch.no_grad():
            video_feature = model(graph_data).unsqueeze(0)  # Shape: (1, 256)

        embedded.append((video_id, video_feature))
    return embedded


# One list of (video_id, feature) per split, processed in the order val, train, test.
val_video_features = _embed_graphs(val_graph, stgnn)
train_video_features = _embed_graphs(train_graph, stgnn)
test_video_features = _embed_graphs(test_graph, stgnn)

# Print first 5 video features
for video_id, feature in test_video_features[:5]:
    print(f"Video ID: {video_id}, Feature Shape: {feature.shape}")

In [None]:
val_video_features[97][1]

## Now that I have graph-based video embeddings and a CSV file with video IDs, questions, and answers, the next step is to train a Video Q&A model.



In [None]:
# Load the CSV file containing questions, answers, and video IDs
test_qa = pd.read_csv('/kaggle/input/msdv-qa-csv/test_q_a_f.csv')
train_qa = pd.read_csv('/kaggle/input/msdv-qa-csv/train_q_a_f.csv')
val_qa = pd.read_csv('/kaggle/input/msdv-qa-csv/val_q_a_f.csv')
# Check the first few rows of the CSV
print(train_qa.head())

In [None]:
# Ids of every validation video that produced a graph embedding.
# (Previously a hard-coded range(97) was used, which drops any entries past
# the 97th and raises IndexError when fewer than 97 exist.)
video_ids = [video_id for video_id, _ in val_video_features]

In [None]:
len(video_ids)

In [None]:
import pandas as pd

# --- Step 1: Read the mapping file ---
mapping_file = "/kaggle/input/msvd-q-and-a/youtube_mapping.txt"  # Adjust path if needed

# Build a dictionary mapping the numeric part of the short id to the exact id.
# Each usable line holds at least two whitespace-separated fields: "<exact_id> <short_id>".
mapping_dict = {}
with open(mapping_file, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split()
        if len(parts) >= 2:
            exact_id, short_id = parts[0], parts[1]
            # Remove only a leading "vid" prefix, e.g. "vid1" becomes "1"
            # (str.replace would also delete any later occurrence of "vid").
            num_part = short_id[3:] if short_id.startswith("vid") else short_id
            mapping_dict[num_part] = exact_id

# Show the size and a small sample instead of dumping the whole mapping into the output.
print("Mapping dictionary:", dict(list(mapping_dict.items())[:5]), f"... ({len(mapping_dict)} entries)")
# Expected output, for example: {"1": "-4wsuPCjDBc_5_15", "2": "-7KMZQEsJW4_205_208", ...}

# --- Step 2: Read the CSV file ---
csv_file = "/kaggle/input/val-am-new/am_val_q_a (1).csv"  # Adjust path if necessary
df = pd.read_csv(csv_file)
print("Original CSV video_ids:")
print(df["video_id"].unique())

# --- Step 3: Replace the video_id in CSV using the mapping ---
# Assume that the CSV's video_id column contains values like "1", "2", etc.
raw_video_ids = df["video_id"].astype(str)
df["video_id"] = raw_video_ids.map(mapping_dict)

# Series.map yields NaN for keys that are absent from the mapping. Report them here
# instead of letting them surface later as failed lookups by video id.
unmapped_ids = raw_video_ids[df["video_id"].isna()].unique()
if len(unmapped_ids) > 0:
    print(
        f"Warning: {len(unmapped_ids)} distinct video_id value(s) have no entry in the "
        f"mapping file and are now NaN, e.g. {list(unmapped_ids[:5])}"
    )

# --- Step 4: Save the updated CSV ---
output_csv_file = "qa_data_updated.csv"  # New file name for the updated CSV
df.to_csv(output_csv_file, index=False)
print(f"Updated CSV saved to {output_csv_file}")


In [None]:
# Normalise the headers of each split in place: stray leading/trailing whitespace in a
# column name would break the row["video_id"]-style lookups used further down.
for qa_frame in (train_qa, test_qa, val_qa):
    qa_frame.columns = qa_frame.columns.str.strip()

# Stack the three splits (train, test, val - in that order) under a fresh 0..n-1 index
merged_qa = pd.concat([train_qa, test_qa, val_qa], ignore_index=True)

# Total number of QA rows after merging
print(len(merged_qa))

In [None]:
# Keep the rows of df whose video_id is one of the collected validation video_ids ...
matching_video_ids = df.loc[df['video_id'].isin(video_ids)]
# ... and report how many distinct videos that covers.
print(matching_video_ids['video_id'].nunique(dropna=False))

The model is trained with the **answer labels** as the target variable. The answers are encoded into numerical labels using the **LabelEncoder**. Here's a summary of how the training process works:

1. **Question-Video Feature Concatenation**:  
   - Each **video embedding** and **question embedding** (from BERT) are concatenated into a single feature vector. The feature size is the combination of the 256-dimensional video feature and the 768-dimensional question embedding, resulting in a 1024-dimensional feature vector.

2. **Training Target**:  
   - The **answer** for each video-question pair is encoded into a numerical label using `LabelEncoder`. This label is the target variable `y` that the model will predict.

3. **Model Prediction**:  
   - The model (a simple **MLP** in this case) is trained to predict the **answer label** based on the concatenated video-question feature.

4. **Cross-Entropy Loss**:  
   - The **CrossEntropyLoss** is used, which is suitable for multi-class classification. The model is learning to output a probability distribution over all possible answers, and the goal is to minimize the loss between the predicted and actual answer labels.

In [None]:
# Index each split's (video_id, feature) pairs by video ID for direct lookup.
# If an ID occurs more than once within a split, the last pair wins.
val_video_embeddings_dict = dict(val_video_features)
train_video_embeddings_dict = dict(train_video_features)
test_video_embeddings_dict = dict(test_video_features)

In [None]:
# Initialize BERT tokenizer and model
# The same "bert-base-uncased" checkpoint is also loaded near the top of the notebook;
# this cell rebinds `tokenizer` and `bert_model`, which encode_question reads as globals.
# NOTE(review): the sample question used in the prediction cell is written in Amharic
# script, while this checkpoint is presumably English-only - confirm the tokenizer does
# not collapse such questions to unknown tokens.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

In [None]:
# Question encoder built on the notebook-level BERT tokenizer and model
def encode_question(question):
    """Embed a question with BERT and return the vector at the first token position.

    The text is tokenized (truncated to at most 50 tokens, returned as PyTorch
    tensors) and passed through the global ``bert_model`` with gradient
    tracking disabled.

    Args:
        question: Question text handed to the global ``tokenizer``.

    Returns:
        The ``[:, 0, :]`` slice of the model's ``last_hidden_state`` - the CLS
        token embedding (768-D per the original author's note).
    """
    encoded = tokenizer(
        question,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=50,
    )
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
    cls_embedding = hidden_states[:, 0, :]
    return cls_embedding

In [None]:
from tqdm import tqdm  # Import tqdm for progress tracking

# Merge the dictionaries
# On a duplicate video ID the later dictionary wins (train < val < test).
video_embeddings_dict = {
    **train_video_embeddings_dict,
    **val_video_embeddings_dict,
    **test_video_embeddings_dict
}

# Question text -> BERT embedding. The same question string can occur on many rows;
# encoding each distinct question once avoids repeated BERT forward passes.
# NOTE(review): this assumes encode_question is deterministic (bert_model in eval mode).
question_embedding_cache = {}

# Prepare training data
# X collects one combined (video + question) feature per usable row, y the matching answer.
X, y = [], []
for index, row in tqdm(merged_qa.iterrows(), total=len(merged_qa), desc="Processing QA Pairs"):
    video_id, question, answer = row["video_id"], row["question"], row["answer"]

    # Ensure video embedding exists
    if video_id not in video_embeddings_dict:
        continue

    # Get video embedding (256D)
    video_embedding = video_embeddings_dict[video_id]

    # Ensure video embedding is a tensor
    if isinstance(video_embedding, list):
        video_embedding = torch.tensor(video_embedding, dtype=torch.float32)

    # Encode question (768D), reusing the cached embedding when this text was seen before.
    # encode_question always returns a tensor, so no list-to-tensor conversion is needed.
    if question not in question_embedding_cache:
        question_embedding_cache[question] = encode_question(question)
    question_embedding = question_embedding_cache[question]

    # Ensure both tensors have the same dimension format
    if video_embedding.dim() == 1:
        video_embedding = video_embedding.unsqueeze(0)  # Convert to (1, 256)
    if question_embedding.dim() == 1:
        question_embedding = question_embedding.unsqueeze(0)  # Convert to (1, 768)

    # Concatenate video and question embeddings → (1, 1024)
    # torch.cat allocates a new tensor, so cached embeddings are never shared between rows of X.
    combined_feature = torch.cat((video_embedding, question_embedding), dim=1)

    X.append(combined_feature)
    y.append(answer)  # Store corresponding answer label

# Make the silently skipped rows (no embedding for their video_id) visible.
print(f"Prepared {len(X)} of {len(merged_qa)} QA pairs; "
      f"{len(merged_qa) - len(X)} skipped because no video embedding was found.")

In [None]:
# Sanity check: first answer, first combined feature tensor, and that tensor's shape
y[0],X[0],X[0].shape

In [None]:
# Convert to tensor
# Stack the per-sample rows into a single 2-D tensor (noted as (num_samples, 1024) by the
# author). The name X is rebound from a list to a tensor here, so the conversion is guarded
# on the type: re-running this cell would otherwise pass a tensor, not a sequence of
# tensors, to torch.vstack and fail.
if isinstance(X, (list, tuple)):
    X = torch.vstack(X)
X[0].shape

In [None]:
# Encode answer labels (this turns answers into numerical values)
# fit_transform learns the distinct answers in y and returns one integer id per sample.
# label_encoder itself is kept: later cells read classes_ and call inverse_transform.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [None]:
# Convert answers to PyTorch tensor
# torch.long because these are class indices used as targets by the loss in the training loop
y_tensor = torch.tensor(y_encoded, dtype=torch.long)

In [None]:
# Number of encoded labels (one per collected QA sample)
y_tensor.shape

In [None]:
# Create a DataLoader for batching
# Each dataset item pairs one row of X with its label; batches of 32 are drawn in shuffled order.
# NOTE(review): no random seed is set in this cell, so batch order presumably differs between runs.
dataset = TensorDataset(X, y_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
# Define the Video Q&A model (MLP)
class VideoQAModel(nn.Module):
    """Two-layer MLP that scores answer classes from a combined video-question feature.

    Args:
        input_dim: Size of the input feature vector (default 1024).
        hidden_dim: Width of the hidden layer (default 512).
        num_classes: Number of answer classes to score. When ``None`` (the
            default) it is read from the notebook-level ``label_encoder`` at
            construction time, so the model always matches the encoder as it
            is when the model is built - not as it was when this class was
            defined.
    """

    def __init__(self, input_dim=1024, hidden_dim=512, num_classes=None):
        super().__init__()
        if num_classes is None:
            # Resolved lazily: a default written in the signature would be evaluated
            # once, at class-definition time, and would require label_encoder to exist then.
            num_classes = len(label_encoder.classes_)
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return unnormalised class scores (logits) for input ``x``."""
        x = F.relu(self.fc1(x))
        return self.fc2(x)

In [None]:
# Initialize the model
# (constructor defaults are used: 1024-d input, 512 hidden units, one output per answer class)
vqa_model = VideoQAModel()

# Loss and optimizer
# Cross-entropy over the answer classes; Adam updates all model parameters with lr=0.001.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(vqa_model.parameters(), lr=0.001)

In [None]:
# Training loop
def _run_epoch(model, batches, loss_fn, opt):
    """Run one optimisation pass over ``batches`` and return the summed batch losses."""
    running_loss = 0
    for batch_inputs, batch_labels in batches:
        opt.zero_grad()
        batch_loss = loss_fn(model(batch_inputs), batch_labels)
        batch_loss.backward()
        opt.step()
        running_loss += batch_loss.item()
    return running_loss


num_epochs = 100
for epoch in range(num_epochs):
    # total_loss is the sum (not the mean) of the per-batch losses of this epoch
    total_loss = _run_epoch(vqa_model, dataloader, criterion, optimizer)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}")

In [None]:
# Example: Predict answer for a new video-question pair
test_video_id = "bQJQGoJF7_k_162_169"  # Replace with actual video ID
test_question = "ስጋን የሚጨምር ማን ነው?"

# Look up the stored embedding of the video and embed the question with BERT
test_video_embedding = video_embeddings_dict[test_video_id]
test_question_embedding = encode_question(test_question)

# Model input: video embedding followed by question embedding along the feature axis
# (256D + 768D = 1024D)
test_input = torch.cat([test_video_embedding, test_question_embedding], dim=1)

# Forward pass without gradient tracking; the index of the highest score is the prediction
with torch.no_grad():
    output = vqa_model(test_input)
predicted_label = output.argmax().item()

# Map the class index back to its answer text
(predicted_answer,) = label_encoder.inverse_transform([predicted_label])
print(f"Predicted Answer: {predicted_answer}")

Next, evaluate the VQA system's performance on these videos using the available ground-truth question-answer pairs. The evaluation is set up as follows:

In [None]:
# Function to get the predicted answer for a video
def predict_answer(video_features, question=None):
    """Predict the answer text for one video-question input.

    The previous placeholder returned the raw model output, although the
    evaluation code compares the result to the ground-truth answer as text.

    Args:
        video_features: Tensor passed to ``vqa_model``. When ``question`` is
            ``None`` this must already be the complete model input (video
            embedding concatenated with the question embedding); otherwise it
            is the video embedding alone.
        question: Optional question text. When given, it is encoded with
            ``encode_question`` and concatenated to ``video_features`` along
            dim 1 before the forward pass.

    Returns:
        str: The answer decoded by ``label_encoder`` for the highest-scoring
        class. If the input holds several rows, only the first is decoded.
    """
    model_input = video_features
    if question is not None:
        if model_input.dim() == 1:
            # Add the batch axis so both parts can be joined along dim 1.
            model_input = model_input.unsqueeze(0)
        model_input = torch.cat((model_input, encode_question(question)), dim=1)

    with torch.no_grad():
        logits = vqa_model(model_input)

    predicted_label = torch.argmax(logits, dim=-1).reshape(-1)[0].item()
    return str(label_encoder.inverse_transform([predicted_label])[0])

In [None]:
# Initialize a dictionary to store the aggregated features for each video
video_features_new = {}

# Loop through each video in the all_video_graphs dictionary
# NOTE(review): all_video_graphs is not defined in the visible part of the notebook -
# confirm it is created by an earlier cell.
for video_id, video_data in all_video_graphs.items():
    # Extract the adjacency and feature matrices for the current video
    adj_matrix = video_data['adjacency_matrix']
    feat_matrix = video_data['feature_matrix']

    # Skip videos with no edges or missing features
    # (this check was previously over-indented, which made the whole cell a syntax error)
    if adj_matrix.sum() == 0 or feat_matrix.shape[0] == 0:
        print(f"Skipping video {video_id} due to missing edges or features.")
        continue

    # Convert adjacency matrix to edge list
    edge_index = torch.tensor(adj_matrix.nonzero(), dtype=torch.long)

    # Convert feature matrix to PyTorch tensor
    x = torch.tensor(feat_matrix, dtype=torch.float)

    # Ensure edge index doesn't exceed available nodes
    # (same guard as the per-split embedding loops earlier in the notebook)
    if edge_index.max() >= x.shape[0]:
        print(f"Skipping video {video_id} due to invalid edge index.")
        continue

    # Create PyG graph object
    graph_data = Data(x=x, edge_index=edge_index)

    # Process graph through STGNN without tracking gradients
    with torch.no_grad():
        video_feature = stgnn(graph_data).unsqueeze(0)  # Shape: (1, 256)

    # Store the resulting aggregated features for each video in the dictionary
    video_features_new[video_id] = video_feature


In [None]:
# Look up the embedding stored in video_features_new for one specific video ID
video_features_new['bQJQGoJF7_k_162_169']

In [None]:
# Function to calculate Exact Match score
def exact_match(predicted_answer, ground_truth_answer):
    """Return True when both answers are equal after trimming and lower-casing.

    Args:
        predicted_answer: Answer text produced by the model.
        ground_truth_answer: Reference answer text.

    Returns:
        bool: Whether the two normalised strings are identical.
    """
    normalized_prediction = predicted_answer.strip().lower()
    normalized_truth = ground_truth_answer.strip().lower()
    return normalized_prediction == normalized_truth

# Calculate Exact Match for all examples
def _answer_text(prediction):
    """Return answer text for a prediction that is either text already or raw class scores."""
    if isinstance(prediction, str):
        return prediction
    # Raw logits: take the highest-scoring class and decode it with the label encoder.
    predicted_label = int(torch.argmax(prediction).item())
    return str(label_encoder.inverse_transform([predicted_label])[0])


exact_match_count = 0
skipped_count = 0
total_count = len(df)

for idx, row in df.iterrows():
    video_id = row['video_id']
    question = row['question']
    ground_truth_answer = row['answer']

    # Rows whose video has no stored features (skipped graphs, unmapped/NaN ids)
    # cannot be answered; they stay in total_count and therefore count as misses.
    if video_id not in video_features_new:
        skipped_count += 1
        continue

    # The model is trained on the video embedding concatenated with the question
    # embedding, so the same combined input is built here.
    model_input = torch.cat((video_features_new[video_id], encode_question(question)), dim=1)

    # Get the predicted answer (using the model)
    with torch.no_grad():
        predicted_answer = _answer_text(predict_answer(model_input))

    # Check if the prediction is an exact match (str() guards against non-text answers)
    if exact_match(predicted_answer, str(ground_truth_answer)):
        exact_match_count += 1

if skipped_count:
    print(f"{skipped_count} of {total_count} rows had no video features and were counted as misses.")

# Calculate Exact Match (EM) score; an empty frame gives 0 instead of a division error
exact_match_score = exact_match_count / total_count if total_count else 0.0
print(f"Exact Match Score: {exact_match_score * 100:.2f}%")