## Preprocessing of Question dataset

In [2]:
'''PREPROCESSING TRAINING DATASET OF QUESTIONS'''

import nltk
import json
import pickle
from collections import Counter

# Download tokenizer model once
nltk.download('punkt')

# --- Load training questions ---
with open(r'D:\Projects\asfdgfhjghk\dataset\v2_Questions_Train_mscoco\v2_OpenEnded_mscoco_train2014_questions.json', 'r') as f:
    questions = json.load(f)['questions']

# --- Tokenize training questions ---
tokenized_questions = []
for q in questions:
    tokens = [word.lower() for word in nltk.word_tokenize(q['question'])]
    tokenized_questions.append(tokens)

# --- Build vocabulary (top 10,000 words + special tokens) ---
word_counts = Counter(word for tokens in tokenized_questions for word in tokens)
most_common = word_counts.most_common(10000)

vocab = {word: idx + 1 for idx, (word, _) in enumerate(most_common)}  # Start idx from 1
vocab['<PAD>'] = 0
vocab['<UNK>'] = len(vocab)  # Unknown token at the end

# --- Convert training tokens to sequences and pad ---
max_len = 14
question_sequences = []
for tokens in tokenized_questions:
    seq = [vocab.get(token, vocab['<UNK>']) for token in tokens[:max_len]]
    seq += [vocab['<PAD>']] * (max_len - len(seq))
    question_sequences.append(seq)

# --- Save vocabulary and training sequences ---
with open('vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)

with open('train_question_sequences.pkl', 'wb') as f:
    pickle.dump(question_sequences, f)

print(f"Training data processed: {len(question_sequences)} questions")
print(f"Vocabulary size (including special tokens): {len(vocab)}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anmol\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Training data processed: 443757 questions
Vocabulary size (including special tokens): 10002


In [3]:
'''PREPROCESSING VALIDATION DATASET OF QUESTIONS'''
import nltk
import json
import pickle

nltk.download('punkt')

# --- Load validation questions ---
with open(r'D:\Projects\asfdgfhjghk\dataset\v2_Questions_Val_mscoco\v2_OpenEnded_mscoco_val2014_questions.json', 'r') as f:
    val_qs = json.load(f)['questions']

# --- Tokenize validation questions ---
val_tokens = [[tok.lower() for tok in nltk.word_tokenize(q['question'])] for q in val_qs]

# --- Load vocabulary saved from training ---
with open('vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

# --- Convert validation tokens to sequences and pad ---
max_len = 14
val_sequences = []
for tokens in val_tokens:
    seq = [vocab.get(token, vocab['<UNK>']) for token in tokens[:max_len]]
    seq += [vocab['<PAD>']] * (max_len - len(seq))
    val_sequences.append(seq)

# --- Save validation sequences ---
with open('val_question_sequences.pkl', 'wb') as f:
    pickle.dump(val_sequences, f)

print(f"Validation data processed: {len(val_sequences)} questions")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anmol\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Validation data processed: 214354 questions


In [1]:
# Inspect Tokenized Questions
import pickle

# Load tokenized questions
with open('val_question_sequences.pkl', 'rb') as f:
    tokenized_questions = pickle.load(f)

# Print first 5 questions
print("First 5 tokenized questions:")
for i, tokens in enumerate(tokenized_questions[:5], 1):
    print(f"Question: {tokens}")

First 5 tokenized questions:
Question: [24, 3, 48, 100, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Question: [4, 5, 2, 17, 7, 2, 74, 33, 1, 0, 0, 0, 0, 0]
Question: [4, 3, 48, 8, 96, 9, 1, 0, 0, 0, 0, 0, 0, 0]
Question: [4, 1104, 3297, 2, 18, 1, 0, 0, 0, 0, 0, 0, 0, 0]
Question: [3, 6, 10, 10001, 1196, 1, 0, 0, 0, 0, 0, 0, 0, 0]


## Preprocessing of annotation dataset

In [4]:
'''PREPROCESSING TRAINING DATASET OF ANNOTATIONS'''
import json
import pickle
from collections import Counter

# Load answers
with open(r'D:\Projects\asfdgfhjghk\dataset\v2_Annotations_Train_mscoco\v2_mscoco_train2014_annotations.json',
          'r') as f:
    annotations = json.load(f)['annotations']

# Get most common answer per question
answers = []
for ann in annotations:
    ans_counts = Counter(a['answer'].lower() for a in ann['answers'])
    most_common = ans_counts.most_common(1)[0][0]  # Pick the top answer
    answers.append(most_common)

# Build answer vocabulary (top 3,000)
answer_vocab = {ans: idx for idx, (ans, _) in enumerate(Counter(answers).most_common(3000))}

# Save answer vocabulary
with open('answer_vocab.pkl', 'wb') as f:
    pickle.dump(answer_vocab, f)

print(f"Created answer vocabulary with {len(answer_vocab)} answers")

Created answer vocabulary with 3000 answers


In [5]:
'''PREPROCESSING VALIDATION DATASET OF ANNOTATIONS'''

import json
import pickle
from collections import Counter

# Load validation annotations
with open(r'D:\Projects\asfdgfhjghk\dataset\v2_Annotations_Val_mscoco\v2_mscoco_val2014_annotations.json', 'r') as f:
    val_annotations = json.load(f)['annotations']

# Load answer vocabulary built from training
with open('answer_vocab.pkl', 'rb') as f:
    answer_vocab = pickle.load(f)

val_answers = []
for ann in val_annotations:
    ans_counts = Counter(a['answer'].lower() for a in ann['answers'])
    most_common = ans_counts.most_common(1)[0][0]
    val_answers.append(most_common)

# Convert validation answers to indices (use -1 or some value for unknown answers)
val_answer_indices = [answer_vocab.get(ans, -1) for ans in val_answers]

# Save validation answer indices
with open('val_answer_indices.pkl', 'wb') as f:
    pickle.dump(val_answer_indices, f)

print(f"Processed {len(val_answer_indices)} validation answers")

Processed 214354 validation answers


## Preprocessing of image dataset

In [None]:
'''PREPROCESSING TRAINING DATASET OF IMAGES'''

import os
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import pickle
import json
from tqdm import tqdm
import time

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load question data
with open(r'D:\Projects\asfdgfhjghk\dataset\v2_Questions_Train_mscoco\v2_OpenEnded_mscoco_train2014_questions.json',
          'r') as f:
    questions = json.load(f)['questions']

# Get image IDs
image_ids = {q['image_id']: q['question_id'] for q in questions}

# Define transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load ResNet model
resnet = models.resnet50(pretrained=True).to(device)
resnet.eval()
resnet = torch.nn.Sequential(*list(resnet.children())[:-1])

# Image directory
image_dir = r'D:\Projects\asfdgfhjghk\dataset\train2014\train2014'
image_features = {}
batch_size = 64

# Verify directory
if not os.path.exists(image_dir):
    print(f"Error: Directory not found: {image_dir}")
    exit(1)
else:
    print(f"Directory found: {image_dir}")
    print(f"Number of files: {len(os.listdir(image_dir))}")

# Process images in batches
image_id_list = list(image_ids.keys())
start_time = time.time()
batch_images = []
batch_question_ids = []

for i, image_id in enumerate(tqdm(image_id_list, desc="Processing images")):
    image_filename = f"COCO_train2014_{str(image_id).zfill(12)}.jpg"
    image_path = os.path.join(image_dir, image_filename)

    if not os.path.exists(image_path):
        print(f"Image not found: {image_path}")
        continue

    try:
        image = Image.open(image_path).convert('RGB')
        image = transform(image)
        batch_images.append(image)
        batch_question_ids.append(image_ids[image_id])
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        continue

    if len(batch_images) == batch_size or i == len(image_id_list) - 1:
        if batch_images:
            try:
                images = torch.stack(batch_images).to(device)
                with torch.no_grad():
                    features = resnet(images)
                    features = features.squeeze(-1).squeeze(-1).cpu().numpy()
                for question_id, feature in zip(batch_question_ids, features):
                    image_features[question_id] = feature
                del images, features
                torch.cuda.empty_cache()
            except RuntimeError as e:
                print(f"GPU error: {e}")
        batch_images = []
        batch_question_ids = []

# Save features
with open('image_features.pkl', 'wb') as f:
    pickle.dump(image_features, f)

end_time = time.time()
print(f"Extracted features for {len(image_features)} images")
print(f"Total time: {end_time - start_time:.2f} seconds")

In [6]:
'''PREPROCESSING VALIDATION DATASET OF IMAGES'''

import os
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import pickle
import json
from tqdm import tqdm
import time

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load question data
with open(r'D:\Projects\asfdgfhjghk\dataset\v2_Questions_Val_mscoco\v2_OpenEnded_mscoco_val2014_questions.json',
          'r') as f:
    questions = json.load(f)['questions']

# Get image IDs
image_ids = {q['image_id']: q['question_id'] for q in questions}

# Define transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load ResNet model
resnet = models.resnet50(pretrained=True).to(device)
resnet.eval()
resnet = torch.nn.Sequential(*list(resnet.children())[:-1])

# Image directory
image_dir = r'D:\Projects\asfdgfhjghk\dataset\val2014\val2014'
image_features = {}
batch_size = 64

# Verify directory
if not os.path.exists(image_dir):
    print(f"Error: Directory not found: {image_dir}")
    exit(1)
else:
    print(f"Directory found: {image_dir}")
    print(f"Number of files: {len(os.listdir(image_dir))}")

# Process images in batches
image_id_list = list(image_ids.keys())
start_time = time.time()
batch_images = []
batch_question_ids = []

for i, image_id in enumerate(tqdm(image_id_list, desc="Processing images")):
    image_filename = f"COCO_val2014_{str(image_id).zfill(12)}.jpg"
    image_path = os.path.join(image_dir, image_filename)

    if not os.path.exists(image_path):
        print(f"Image not found: {image_path}")
        continue

    try:
        image = Image.open(image_path).convert('RGB')
        image = transform(image)
        batch_images.append(image)
        batch_question_ids.append(image_ids[image_id])
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        continue

    if len(batch_images) == batch_size or i == len(image_id_list) - 1:
        if batch_images:
            try:
                images = torch.stack(batch_images).to(device)
                with torch.no_grad():
                    features = resnet(images)
                    features = features.squeeze(-1).squeeze(-1).cpu().numpy()
                for question_id, feature in zip(batch_question_ids, features):
                    image_features[question_id] = feature
                del images, features
                torch.cuda.empty_cache()
            except RuntimeError as e:
                print(f"GPU error: {e}")
        batch_images = []
        batch_question_ids = []

# Save features
with open('image_features.pkl', 'wb') as f:
    pickle.dump(image_features, f)

end_time = time.time()
print(f"Extracted features for {len(image_features)} images")
print(f"Total time: {end_time - start_time:.2f} seconds")

Using device: cuda




Directory found: D:\Projects\asfdgfhjghk\dataset\val2014\val2014
Number of files: 40504


Processing images: 100%|██████████| 40504/40504 [12:10<00:00, 55.48it/s]


Extracted features for 40504 images
Total time: 730.49 seconds


In [10]:
import torch
from torch.utils.data import Dataset
import pickle
import json

class VQADataset(Dataset):
    def __init__(self, image_features_file, train_question_sequences, answer_vocab_file, annotations_file):
        # Load image features dict: {question_id: feature}
        with open(image_features_file, 'rb') as f:
            self.image_features = pickle.load(f)

        # Load question sequences list aligned by question order
        with open(train_question_sequences, 'rb') as f:
            self.train_question_sequences = pickle.load(f)

        # Load answer vocab dict: {answer: idx}
        with open(answer_vocab_file, 'rb') as f:
            self.answer_vocab = pickle.load(f)

        # Load annotations list
        with open(annotations_file, 'r') as f:
            self.annotations = json.load(f)['annotations']

        # Build a mapping from question_id to index in question_sequences and annotations
        # because question_sequences is a list aligned with training questions
        self.qid_to_index = {}
        for idx, ann in enumerate(self.annotations):
            qid = ann['question_id']
            if qid in self.image_features:  # Only keep questions with image features
                self.qid_to_index[qid] = idx

        # Now create a list of valid question IDs that have all info
        self.question_ids = list(self.qid_to_index.keys())

    def __len__(self):
        return len(self.question_ids)

    def __getitem__(self, idx):
        qid = self.question_ids[idx]
        ann_idx = self.qid_to_index[qid]

        image_feature = torch.tensor(self.image_features[qid], dtype=torch.float32)

        # question_sequences is indexed by the order of training questions, use ann_idx
        question_seq = torch.tensor(self.question_sequences[ann_idx], dtype=torch.long)

        answer_text = self.annotations[ann_idx]['multiple_choice_answer'].lower()
        answer_id = self.answer_vocab.get(answer_text, -1)  # -1 for unknown answer

        return image_feature, question_seq, answer_id


# Example usage
if __name__ == "__main__":
    image_features_file = 'image_features.pkl'
    train_question_sequences = 'train_question_sequences.pkl'
    answer_vocab_file = 'answer_vocab.pkl'
    annotations_file = r'D:\Projects\asfdgfhjghk\dataset\v2_Annotations_Train_mscoco\v2_mscoco_train2014_annotations.json'

    dataset = VQADataset(image_features_file, train_question_sequences, answer_vocab_file, annotations_file)
    print(f"Dataset size: {len(dataset)}")

    from torch.utils.data import DataLoader
    dataloader = DataLoader(dataset, batch_size=64, shuffle=True, pin_memory=True)

    for img_feat, q_seq, ans_id in dataloader:
        print(f"Image features shape: {img_feat.shape}")
        print(f"Question sequences shape: {q_seq.shape}")
        print(f"Answer IDs shape: {ans_id.shape}")
        break


Dataset size: 0


ValueError: num_samples should be a positive integer value, but got num_samples=0

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class VQAModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=300, hidden_dim=512, num_answers=3000, num_layers=2, dropout=0.5):
        super().__init__()
        # Question embedding + encoding
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.question_encoder = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim // 2,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0.0
        )
        self.q_proj = nn.Linear(hidden_dim, hidden_dim)

        # Image feature projection
        self.img_proj = nn.Sequential(
            nn.Linear(2048, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )

        # Co-attention: question attends to image and image attends to question
        self.co_attn_q2i = nn.MultiheadAttention(hidden_dim, num_heads=8, dropout=dropout)
        self.co_attn_i2q = nn.MultiheadAttention(hidden_dim, num_heads=8, dropout=dropout)

        # Fusion and classification
        self.fusion = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )
        self.classifier = nn.Linear(hidden_dim, num_answers)

        # Initialization
        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, (nn.Linear, nn.Embedding)):
                nn.init.xavier_uniform_(m.weight)
            elif isinstance(m, nn.LayerNorm):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.zeros_(m.bias)

    def forward(self, img_feats, ques_seqs, ques_mask=None):
        # Encode question
        embeds = self.embedding(ques_seqs)  # [B, L, E]
        q_enc, _ = self.question_encoder(embeds)  # [B, L, H]
        q_enc = self.q_proj(q_enc)  # project to hidden_dim

        # Project image
        img_proj = self.img_proj(img_feats)  # [B, H]
        img_proj = img_proj.unsqueeze(1)  # [B, 1, H]

        # Prepare for attention (transpose for MHA: [seq_len, B, H])
        q_enc_t = q_enc.transpose(0, 1)      # [L, B, H]
        img_t    = img_proj.transpose(0, 1)  # [1, B, H]

        # Co-attention
        # question queries image, then image queries question
        img_att, _ = self.co_attn_q2i(query=img_t, key=q_enc_t, value=q_enc_t, key_padding_mask=ques_mask)
        q_att, _   = self.co_attn_i2q(query=q_enc_t, key=img_t, value=img_t)

        # Pool attended features
        img_feat = img_att.squeeze(0)                   # [B, H]
        q_feat   = q_att[-1]  if q_att.size(0)>1 else q_att.squeeze(0)  # [B, H]

        # Fuse and classify
        fused = self.fusion(torch.cat([img_feat, q_feat], dim=1))  # [B, H]
        out   = self.classifier(fused)                            # [B, num_answers]
        return out


In [3]:
from torch.distributed import optim


def train_model(model, dataloader, num_epochs=150, device='cuda', patience=3):
    criterion = nn.CrossEntropyLoss(ignore_index=-1)  # Ignore unknown answers
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    model.to(device)

    best_loss = float('inf')
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0

        for images, questions, answers in dataloader:
            images, questions, answers = images.to(device), questions.to(device), answers.to(device)
            optimizer.zero_grad()
            outputs = model(images, questions)
            loss = criterion(outputs, answers)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == answers).sum().item()
            total += (answers != -1).sum().item()

        avg_loss = total_loss / len(dataloader)
        accuracy = correct / total * 100

        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")

        # Early stopping check
        if avg_loss < best_loss:
            best_loss = avg_loss
            epochs_without_improvement = 0
            # Optionally save the best model
            torch.save(model.state_dict(), 'best_vqa_model.pth')
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print(f"Early stopping triggered after {epoch + 1} epochs.")
                break


In [4]:
# Main script
if __name__ == "__main__":
    # File paths
    image_features_file = 'image_features.pkl'
    question_sequences_file = 'question_sequences.pkl'
    answer_vocab_file = 'answer_vocab.pkl'
    annotations_file = r'D:\Projects\asfdgfhjghk\dataset\v2_Annotations_Train_mscoco\v2_mscoco_train2014_annotations.json'

    # Load vocab to get vocab_size
    with open(question_sequences_file, 'rb') as f:
        question_sequences = pickle.load(f)
    with open(answer_vocab_file, 'rb') as f:
        answer_vocab = pickle.load(f)
    vocab_size = max([max(seq) for seq in question_sequences]) + 1  # Max index + 1

    # Create dataset and dataloader
    dataset = VQADataset(image_features_file, question_sequences_file, answer_vocab_file, annotations_file)
    dataloader = DataLoader(dataset, batch_size=64, shuffle=True, pin_memory=True)

    # Initialize model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = VQAModel(vocab_size, num_answers=len(answer_vocab))

    # Train
    train_model(model, dataloader, num_epochs=150, device=device, patience=3)

    # Save model
    torch.save(model.state_dict(), 'vqa_model.pth')
    print("Model saved to vqa_model.pth")

Epoch 1/150, Loss: 3.5782, Accuracy: 25.16%
Epoch 2/150, Loss: 3.1863, Accuracy: 26.92%
Epoch 3/150, Loss: 3.0041, Accuracy: 27.97%
Epoch 4/150, Loss: 2.8678, Accuracy: 28.43%
Epoch 5/150, Loss: 2.7597, Accuracy: 29.57%
Epoch 6/150, Loss: 2.6645, Accuracy: 30.16%
Epoch 7/150, Loss: 2.5767, Accuracy: 31.61%
Epoch 8/150, Loss: 2.4964, Accuracy: 33.22%
Epoch 9/150, Loss: 2.4365, Accuracy: 34.29%
Epoch 10/150, Loss: 2.3729, Accuracy: 36.00%
Epoch 11/150, Loss: 2.3092, Accuracy: 37.24%
Epoch 12/150, Loss: 2.2546, Accuracy: 38.55%
Epoch 13/150, Loss: 2.1972, Accuracy: 39.70%
Epoch 14/150, Loss: 2.1450, Accuracy: 40.90%
Epoch 15/150, Loss: 2.1015, Accuracy: 41.71%
Epoch 16/150, Loss: 2.0601, Accuracy: 42.74%
Epoch 17/150, Loss: 2.0120, Accuracy: 43.72%
Epoch 18/150, Loss: 1.9781, Accuracy: 44.45%
Epoch 19/150, Loss: 1.9382, Accuracy: 45.44%
Epoch 20/150, Loss: 1.9083, Accuracy: 45.92%
Epoch 21/150, Loss: 1.8776, Accuracy: 46.80%
Epoch 22/150, Loss: 1.8532, Accuracy: 47.15%
Epoch 23/150, Loss:

In [11]:
with open('image_features.pkl', 'rb') as f:
    img_feats = pickle.load(f)

print(f"Loaded {len(img_feats)} image features")
print("Sample keys (question IDs):", list(img_feats.keys())[:5])


Loaded 40504 image features
Sample keys (question IDs): [262148002, 393225003, 393226002, 240301002, 131089004]


In [12]:
with open(annotations_file, 'r') as f:
    anns = json.load(f)['annotations']

qids = [ann['question_id'] for ann in anns]
print(f"Loaded {len(qids)} annotations")
print("Sample question_ids:", qids[:5])


Loaded 443757 annotations
Sample question_ids: [458752000, 458752001, 458752002, 458752003, 262146000]


In [13]:
intersection = set(img_feats.keys()) & set(qids)
print(f"Matching question IDs: {len(intersection)}")

Matching question IDs: 0
