Setup + Mount Drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data paths used by later cells live under this mount point.
# NOTE(review): force_remount=True presumably re-mounts even if Drive is already attached — confirm against Colab docs.
drive.mount('/content/drive', force_remount=True)

import os
import json
import time
import random
from pathlib import Path
import numpy as np
import pandas as pd
from collections import defaultdict

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

Mounted at /content/drive


In [None]:
# Use the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

Device: cuda


In [None]:
# All inputs and outputs of this notebook live under a single Drive folder.
BASE = Path("/content/drive/MyDrive/Data")
FEATURES_DIR = BASE.joinpath("features")
TEXT_DIR = BASE.joinpath("text_features_bert")
MODEL_DIR = BASE.joinpath("models")

# Create the checkpoint folder if it is missing (its parent must already exist).
MODEL_DIR.mkdir(exist_ok=True)

# Quick visual check that the input folders are reachable.
print("FEATURES_DIR exists:", FEATURES_DIR.exists())
print("TEXT_DIR exists:", TEXT_DIR.exists())

FEATURES_DIR exists: True
TEXT_DIR exists: True


Load Spatial Features (ResNet-50 7×7)

In [None]:
spatial_path = FEATURES_DIR.joinpath("features_spatial.pt")

# Stop right away with a readable message when the feature file is absent.
if not spatial_path.exists():
    raise FileNotFoundError("features_spatial.pt not found in features folder")

# NOTE(review): torch.load unpickles the file, so only load feature files you trust.
# map_location="cpu" keeps the loaded tensors on the CPU.
features_spatial = torch.load(str(spatial_path), map_location="cpu")

print("Loaded spatial features:", len(features_spatial))

# Peek at one entry to confirm the per-image tensor shape.
sample_key = list(features_spatial)[0]
print("Example tensor shape:", features_spatial[sample_key].shape)  # (2048,7,7)

Loaded spatial features: 1449
Example tensor shape: torch.Size([2048, 7, 7])


Load Text Features (BERT pooled)

In [None]:
# Question embeddings and answer labels; later cells pair row i of q_all with ans_all[i].
q_all = np.load(str(TEXT_DIR / "q_bert_pooled.npy"))
ans_all = np.load(str(TEXT_DIR / "answer_idx.npy"))

print("q_all shape:", q_all.shape)
print("ans_all shape:", ans_all.shape)

# The two arrays are indexed with the same row position downstream, so a
# length mismatch would silently pair questions with the wrong answers.
assert len(q_all) == len(ans_all), (
    "q_bert_pooled.npy and answer_idx.npy have different lengths"
)

# Optional answer vocabulary; stays None when the JSON file is not present.
answer2idx_path = TEXT_DIR / "answer2idx.json"
if answer2idx_path.exists():
    # Use a context manager so the file handle is closed deterministically
    # (the previous json.load(open(...)) left it to the garbage collector).
    with open(answer2idx_path, encoding="utf-8") as fh:
        answer2idx = json.load(fh)
else:
    answer2idx = None


q_all shape: (12468, 768)
ans_all shape: (12468,)


Align With Train/Eval CSV

In [None]:
# Read the train split, the eval split and the combined table, in that order.
train_df, eval_df, combined_df = (
    pd.read_csv(BASE / csv_name)
    for csv_name in ("data_train.csv", "data_eval.csv", "data.csv")
)

In [None]:
def norm(x):
    """Return the string form of ``x`` with leading/trailing whitespace removed."""
    text = str(x)
    return text.strip()

In [None]:
# One normalised (question, answer, image_id) key per row of data.csv, in row order.
keys = [
    (norm(row['question']), norm(row['answer']), norm(row['image_id']))
    for _, row in combined_df.iterrows()
]

# Row positions collected below are later used to index q_all / ans_all, so
# data.csv must have exactly one row per stored text-feature row. Fail loudly
# here rather than pairing questions with the wrong embeddings further down.
if len(keys) != len(q_all):
    raise ValueError(
        f"data.csv has {len(keys)} rows but q_all has {len(q_all)} rows; "
        "text features are not aligned with the combined table"
    )

# key -> list of row positions carrying that key (duplicates keep every position).
mapping = defaultdict(list)
for row_pos, key in enumerate(keys):
    mapping[key].append(row_pos)

In [None]:
def map_df(df):
    """Map each row of ``df`` to a row position recorded in the global ``mapping``.

    For every row a ``(question, answer, image_id)`` key is built with ``norm``
    and looked up in ``mapping``. The first position still stored under that key
    is removed from ``mapping`` and returned, so duplicate rows receive distinct
    positions and no position is handed out twice.

    Because positions are consumed, ``mapping`` is mutated: calling this again
    for the same rows without rebuilding ``mapping`` yields ``None`` for them.

    Args:
        df: DataFrame with ``question``, ``answer`` and ``image_id`` columns.

    Returns:
        A list with one entry per row of ``df``: an int row position, or ``None``
        when no unused position is available for the row's key.
    """
    idxs = []
    for _, row in df.iterrows():
        key = (norm(row['question']), norm(row['answer']), norm(row['image_id']))
        # Use .get() rather than mapping[key]: indexing a defaultdict would
        # insert a new empty list for every key that is not present.
        candidates = mapping.get(key)
        idxs.append(candidates.pop(0) if candidates else None)
    return idxs

In [None]:
# map_df() removes positions from `mapping` as it hands them out, so rebuild the
# pool from `keys` first; otherwise re-running this cell would return None for
# every row because the pool was already emptied by the previous run.
mapping = defaultdict(list)
for row_pos, key in enumerate(keys):
    mapping[key].append(row_pos)

# Order matters: train rows claim positions first, eval rows get what remains.
train_idx = map_df(train_df)
eval_idx  = map_df(eval_df)

# Unmatched rows are turned into all-zero questions with label -1 by
# build_subset(); surface how many there are instead of hiding them.
print("Unmatched rows - train:", train_idx.count(None), "eval:", eval_idx.count(None))

In [None]:
def build_subset(idxs):
    """Gather question features and labels for a list of row positions.

    Reads from the global ``q_all`` (2-D feature array) and ``ans_all`` (label
    array), both indexed by row position.

    Args:
        idxs: iterable of int row positions; ``None`` marks a row that could not
            be matched.

    Returns:
        ``(features, labels)`` where ``features`` has one row per entry of
        ``idxs`` (an all-zero float32 row for ``None``) and ``labels`` is an
        int64 array holding the label for each entry (``-1`` for ``None``).
        An empty ``idxs`` yields arrays of shape ``(0, q_all.shape[1])`` and
        ``(0,)`` instead of raising inside ``np.stack``.
    """
    idxs = list(idxs)
    dim = q_all.shape[1]

    # np.stack cannot handle an empty list, so return correctly shaped empties.
    if not idxs:
        return np.zeros((0, dim), dtype=np.float32), np.zeros(0, dtype=np.int64)

    # A single shared zero row is enough: np.stack copies it into the result.
    blank = np.zeros(dim, dtype=np.float32)
    qs = [blank if i is None else q_all[i] for i in idxs]
    labs = [-1 if i is None else int(ans_all[i]) for i in idxs]
    return np.stack(qs), np.array(labs, dtype=np.int64)

In [None]:
# Collect features and labels for each split (train first, then eval).
(q_train, ans_train), (q_eval, ans_eval) = (
    build_subset(train_idx),
    build_subset(eval_idx),
)

# Shapes should show one feature row and one label per CSV row.
print("Train:", q_train.shape, ans_train.shape)
print("Eval :", q_eval.shape, ans_eval.shape)

Train: (6795, 768) (6795,)
Eval : (5673, 768) (5673,)


Dataset (Spatial + Question)

In [None]:
class VQAAttentionDataset(Dataset):
    """Dataset yielding ``(image features, question features, label)`` triples.

    Args:
        df: DataFrame with an ``image_id`` column, one row per sample.
        q_feats: per-sample question features, indexable by row position.
        labels: per-sample integer labels, indexable by row position.
        img_feats: optional mapping from image id to feature tensor. When left
            as ``None`` the module-level ``features_spatial`` is looked up at
            access time, as before.
        img_shape: shape of the all-zero tensor returned for an image id that
            has no stored features.

    Raises:
        ValueError: if ``df``, ``q_feats`` and ``labels`` differ in length.
    """

    def __init__(self, df, q_feats, labels, img_feats=None, img_shape=(2048, 7, 7)):
        # All three inputs are indexed with the same position, so a length
        # mismatch would pair samples incorrectly or fail mid-epoch.
        if not (len(df) == len(q_feats) == len(labels)):
            raise ValueError(
                f"Length mismatch: df={len(df)}, q_feats={len(q_feats)}, labels={len(labels)}"
            )
        self.df = df.reset_index(drop=True)
        self.q = q_feats
        self.labels = labels
        self.img_feats = img_feats
        self.img_shape = tuple(img_shape)
        # Normalise the image ids once here instead of doing a DataFrame.loc
        # lookup for every item fetched during training.
        self.image_ids = [str(v).strip() for v in self.df["image_id"]]

    def __len__(self):
        return len(self.q)

    def __getitem__(self, idx):
        img_id = self.image_ids[idx]

        # Resolve the feature store lazily so the global default keeps its
        # original late-binding behaviour.
        feats = features_spatial if self.img_feats is None else self.img_feats

        if img_id in feats:
            img_feat = feats[img_id]
        else:
            # No stored features for this image: fall back to an all-zero tensor.
            img_feat = torch.zeros(self.img_shape)

        return (
            img_feat.float(),
            torch.tensor(self.q[idx], dtype=torch.float32),
            torch.tensor(self.labels[idx], dtype=torch.long)
        )

# One dataset per split, each pairing its CSV rows with the matching features and labels.
train_dataset = VQAAttentionDataset(df=train_df, q_feats=q_train, labels=ans_train)
eval_dataset = VQAAttentionDataset(df=eval_df, q_feats=q_eval, labels=ans_eval)


Attention VQA Model

In [None]:
# Size the classifier output as (largest label seen in either split) + 1.
max_label = max(split_labels.max() for split_labels in (ans_train, ans_eval))
num_classes = 1 + int(max_label)
print("Number of classes:", num_classes)

class AttentionVQA(nn.Module):
    """Question-guided attention over spatial image features, followed by a classifier.

    Args:
        img_dim: channel size of the image feature map.
        txt_dim: size of the question feature vector.
        hidden: size of the shared projection space.
        num_classes: number of output classes.
    """

    def __init__(self, img_dim=2048, txt_dim=768, hidden=512, num_classes=583):
        super().__init__()

        # Project both modalities into the same hidden space.
        self.img_proj = nn.Linear(img_dim, hidden)
        self.txt_proj = nn.Linear(txt_dim, hidden)

        # One scalar attention score per image region.
        self.attention = nn.Linear(hidden, 1)

        self.classifier = nn.Sequential(
            nn.Linear(hidden*2, hidden),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden, num_classes)
        )

    def forward(self, img, txt):
        """Compute class logits.

        Args:
            img: tensor of shape (B, img_dim, H, W).
            txt: tensor of shape (B, txt_dim).

        Returns:
            Tensor of shape (B, num_classes) with unnormalised class scores.
        """
        # Collapse the spatial dims and move channels last: (B, H*W, img_dim).
        # flatten/transpose take the channel size from the input itself (the
        # old view(B, 2048, -1) ignored img_dim) and also accept
        # non-contiguous tensors.
        regions = img.flatten(2).transpose(1, 2)
        img_feat = self.img_proj(regions)           # (B, H*W, hidden)

        txt_feat = self.txt_proj(txt).unsqueeze(1)  # (B, 1, hidden)

        # Broadcasting adds the question vector to every region.
        joint = torch.tanh(img_feat + txt_feat)

        attn_scores = self.attention(joint).squeeze(-1)   # (B, H*W)
        attn_weights = torch.softmax(attn_scores, dim=1)  # weights sum to 1 over regions

        # Attention-weighted sum of the projected region features: (B, hidden).
        attended = torch.sum(img_feat * attn_weights.unsqueeze(-1), dim=1)

        fused = torch.cat([attended, txt_feat.squeeze(1)], dim=1)  # (B, 2*hidden)

        return self.classifier(fused)

# Seed the RNGs before any weights are created so that parameter
# initialisation (and anything else drawing from these generators afterwards)
# is repeatable across Restart & Run All.
# NOTE(review): GPU kernels may still introduce small run-to-run differences.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

model = AttentionVQA(
    img_dim=2048,
    txt_dim=768,
    hidden=512,
    num_classes=num_classes
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# Samples labelled -1 (rows without a matched answer) do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=-1)

print("Model ready.")


Number of classes: 583
Model ready.


Training Loop

In [None]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
eval_loader  = DataLoader(eval_dataset, batch_size=32)

epochs = 12
best_val = 0

# Label value the loss ignores; read it from the criterion so the training
# and evaluation masks cannot drift apart from the loss configuration.
ignore_label = criterion.ignore_index

for ep in range(epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0

    for img, q, labels in train_loader:
        # With mean reduction the loss is averaged over non-ignored targets,
        # so a batch containing only ignored labels produces NaN and would
        # poison the weights on backward(). Skip such batches.
        if not (labels != ignore_label).any():
            continue

        img, q, labels = img.to(device), q.to(device), labels.to(device)

        optimizer.zero_grad()
        out = model(img, q)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()

        # Sum of per-batch mean losses over the epoch.
        total_loss += loss.item()

    # ---- evaluation pass ----
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for img, q, labels in eval_loader:
            img, q, labels = img.to(device), q.to(device), labels.to(device)
            out = model(img, q)
            preds = out.argmax(1)

            # Score only the samples that carry a real label.
            mask = labels != ignore_label
            correct += (preds[mask] == labels[mask]).sum().item()
            total += mask.sum().item()

    # Guard against an eval split with no valid labels.
    acc = correct / total if total else 0.0
    print(f"Epoch {ep+1} Loss {total_loss:.2f} Val Acc {acc:.4f}")

    # Keep only the checkpoint with the best validation accuracy so far.
    if acc > best_val:
        best_val = acc
        torch.save(model.state_dict(), MODEL_DIR/"attention_best.pth")

print("Training complete.")


Epoch 1 Loss 1081.63 Val Acc 0.1075
Epoch 2 Loss 988.94 Val Acc 0.1206
Epoch 3 Loss 935.38 Val Acc 0.1248
Epoch 4 Loss 879.83 Val Acc 0.1585
Epoch 5 Loss 835.65 Val Acc 0.1585
Epoch 6 Loss 798.08 Val Acc 0.1666
Epoch 7 Loss 761.31 Val Acc 0.1641
Epoch 8 Loss 728.27 Val Acc 0.1756
Epoch 9 Loss 694.29 Val Acc 0.1868
Epoch 10 Loss 663.15 Val Acc 0.1890
Epoch 11 Loss 632.96 Val Acc 0.1831
Epoch 12 Loss 606.27 Val Acc 0.1868
Training complete.
