In [15]:
# Phase 1: Load and Tokenize Questions
import nltk
import json
import pickle

# Download NLTK punkt_tab (run once)
nltk.download('punkt_tab')

# Load questions directly
with open(r'D:\Projects\asfdgfhjghk\dataset\v2_Questions_Train_mscoco\v2_OpenEnded_mscoco_train2014_questions.json',
          'r') as f:
    questions = json.load(f)['questions']

# Tokenize with NLTK
tokenized_questions = []
for q in questions:
    tokens = [word.lower() for word in nltk.word_tokenize(q['question'])]
    tokenized_questions.append(tokens)

# Save to file
with open('tokenized_questions.pkl', 'wb') as f:
    pickle.dump(tokenized_questions, f)

print(f"Loaded and tokenized {len(tokenized_questions)} questions")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\anmol\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded and tokenized 443757 questions


In [16]:
# Phase 1 (Continued): Build Vocabulary and Convert to Sequences
import pickle
from collections import Counter

# Load tokenized questions
with open('tokenized_questions.pkl', 'rb') as f:
    tokenized_questions = pickle.load(f)

# Build vocabulary (top 10,000 words)
vocab = Counter()
for tokens in tokenized_questions:
    vocab.update(tokens)

# Keep top 10,000 words
vocab = {word: idx + 1 for idx, (word, _) in enumerate(vocab.most_common(10000))}
vocab['<UNK>'] = len(vocab) + 1  # Unknown token
vocab['<PAD>'] = 0  # Padding token

# Save vocabulary
with open('vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)

# Convert questions to sequences
max_len = 14  # Max question length
question_sequences = []
for tokens in tokenized_questions:
    seq = [vocab.get(token, vocab['<UNK>']) for token in tokens[:max_len]]
    seq = seq + [0] * (max_len - len(seq))  # Pad with zeros
    question_sequences.append(seq)

# Save sequences
with open('question_sequences.pkl', 'wb') as f:
    pickle.dump(question_sequences, f)

print(f"Created vocabulary with {len(vocab)} words")
print(f"Converted {len(question_sequences)} questions to sequences")

Created vocabulary with 10002 words
Converted 443757 questions to sequences


In [30]:
# Inspect Tokenized Questions
import pickle

# Load tokenized questions
with open('tokenized_questions.pkl', 'rb') as f:
    tokenized_questions = pickle.load(f)

# Print first 5 questions
print("First 5 tokenized questions:")
for i, tokens in enumerate(tokenized_questions[:5], 1):
    print(f"Question: {tokens}")

First 5 tokenized questions:
Question: ['what', 'is', 'this', 'photo', 'taken', 'looking', 'through', '?']
Question: ['what', 'position', 'is', 'this', 'man', 'playing', '?']
Question: ['what', 'color', 'is', 'the', 'players', 'shirt', '?']
Question: ['is', 'this', 'man', 'a', 'professional', 'baseball', 'player', '?']
Question: ['what', 'color', 'is', 'the', 'snow', '?']


In [23]:
# Phase 2: Preprocess Answers
import json
import pickle
from collections import Counter

# Load answers
with open(r'D:\Projects\asfdgfhjghk\dataset\v2_Annotations_Train_mscoco\v2_mscoco_train2014_annotations.json',
          'r') as f:
    annotations = json.load(f)['annotations']

# Get most common answer per question
answers = []
for ann in annotations:
    ans_counts = Counter(a['answer'].lower() for a in ann['answers'])
    most_common = ans_counts.most_common(1)[0][0]  # Pick the top answer
    answers.append(most_common)

# Build answer vocabulary (top 3,000)
answer_vocab = {ans: idx for idx, (ans, _) in enumerate(Counter(answers).most_common(3000))}

# Save answer vocabulary
with open('answer_vocab.pkl', 'wb') as f:
    pickle.dump(answer_vocab, f)

print(f"Created answer vocabulary with {len(answer_vocab)} answers")

Created answer vocabulary with 3000 answers


In [24]:
import pickle

answer_vocab = pickle.load(open('answer_vocab.pkl', 'rb'))
print(list(answer_vocab.items())[:5])  # First 5 answers

[('no', 0), ('yes', 1), ('2', 2), ('1', 3), ('white', 4)]


In [1]:
import os
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import pickle
import json
from tqdm import tqdm
import time

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load question data
with open(r'D:\Projects\asfdgfhjghk\dataset\v2_Questions_Train_mscoco\v2_OpenEnded_mscoco_train2014_questions.json',
          'r') as f:
    questions = json.load(f)['questions']

# Get image IDs
image_ids = {q['image_id']: q['question_id'] for q in questions}

# Define transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load ResNet model
resnet = models.resnet50(pretrained=True).to(device)
resnet.eval()
resnet = torch.nn.Sequential(*list(resnet.children())[:-1])

# Image directory
image_dir = r'D:\Projects\asfdgfhjghk\dataset\train2014\train2014'
image_features = {}
batch_size = 64

# Verify directory
if not os.path.exists(image_dir):
    print(f"Error: Directory not found: {image_dir}")
    exit(1)
else:
    print(f"Directory found: {image_dir}")
    print(f"Number of files: {len(os.listdir(image_dir))}")

# Process images in batches
image_id_list = list(image_ids.keys())
start_time = time.time()
batch_images = []
batch_question_ids = []

for i, image_id in enumerate(tqdm(image_id_list, desc="Processing images")):
    image_filename = f"COCO_train2014_{str(image_id).zfill(12)}.jpg"
    image_path = os.path.join(image_dir, image_filename)

    if not os.path.exists(image_path):
        print(f"Image not found: {image_path}")
        continue

    try:
        image = Image.open(image_path).convert('RGB')
        image = transform(image)
        batch_images.append(image)
        batch_question_ids.append(image_ids[image_id])
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        continue

    if len(batch_images) == batch_size or i == len(image_id_list) - 1:
        if batch_images:
            try:
                images = torch.stack(batch_images).to(device)
                with torch.no_grad():
                    features = resnet(images)
                    features = features.squeeze(-1).squeeze(-1).cpu().numpy()
                for question_id, feature in zip(batch_question_ids, features):
                    image_features[question_id] = feature
                del images, features
                torch.cuda.empty_cache()
            except RuntimeError as e:
                print(f"GPU error: {e}")
        batch_images = []
        batch_question_ids = []

# Save features
with open('image_features.pkl', 'wb') as f:
    pickle.dump(image_features, f)

end_time = time.time()
print(f"Extracted features for {len(image_features)} images")
print(f"Total time: {end_time - start_time:.2f} seconds")

Using device: cuda




Directory found: D:\Projects\asfdgfhjghk\dataset\train2014\train2014
Number of files: 82783


Processing images: 100%|██████████| 82783/82783 [25:25<00:00, 54.28it/s] 


Extracted features for 82783 images
Total time: 1526.07 seconds


In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import pickle
import json


class VQADataset(Dataset):
    def __init__(self, image_features_file, question_sequences_file, answer_vocab_file, annotations_file):
        with open(image_features_file, 'rb') as f:
            self.image_features = pickle.load(f)
        with open(question_sequences_file, 'rb') as f:
            self.question_sequences = pickle.load(f)
        with open(answer_vocab_file, 'rb') as f:
            self.answer_vocab = pickle.load(f)
        with open(annotations_file, 'r') as f:
            self.annotations = json.load(f)['annotations']
        # Filter valid question IDs
        self.question_ids = [ann['question_id'] for ann in self.annotations
                             if ann['question_id'] in self.image_features]

    def __len__(self):
        return len(self.question_ids)

    def __getitem__(self, idx):
        question_id = self.question_ids[idx]
        image_feature = torch.tensor(self.image_features[question_id], dtype=torch.float32)
        question_seq = torch.tensor(self.question_sequences[idx], dtype=torch.long)
        answer = self.annotations[idx]['multiple_choice_answer'].lower()
        answer_id = self.answer_vocab.get(answer, -1)  # -1 for unknown answers
        return image_feature, question_seq, answer_id


# Example usage
if __name__ == "__main__":
    # File paths
    image_features_file = 'image_features.pkl'
    question_sequences_file = 'question_sequences.pkl'
    answer_vocab_file = 'answer_vocab.pkl'
    annotations_file = r'D:\Projects\asfdgfhjghk\dataset\v2_Annotations_Train_mscoco\v2_mscoco_train2014_annotations.json'

    # Create dataset
    dataset = VQADataset(image_features_file, question_sequences_file, answer_vocab_file, annotations_file)
    print(f"Dataset size: {len(dataset)}")

    # Create DataLoader
    dataloader = DataLoader(dataset, batch_size=64, shuffle=True, pin_memory=True)

    # Test one batch
    for image_features, question_seqs, answer_ids in dataloader:
        print(f"Image features shape: {image_features.shape}")  # [batch_size, 2048]
        print(f"Question sequences shape: {question_seqs.shape}")  # [batch_size, max_len]
        print(f"Answer IDs shape: {answer_ids.shape}")  # [batch_size]
        break

Dataset size: 82783
Image features shape: torch.Size([64, 2048])
Question sequences shape: torch.Size([64, 14])
Answer IDs shape: torch.Size([64])


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import pickle
from torch.nn import LSTM


# VQA Model
class VQAModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=256, hidden_dim=512, num_answers=3000):
        super().__init__()
        # Question processing
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = LSTM(embed_dim, hidden_dim, batch_first=True, num_layers=2, dropout=0.3)

        # Image processing
        self.image_fc = nn.Linear(2048, hidden_dim)
        self.attention = nn.MultiheadAttention(hidden_dim, num_heads=8)

        # Fusion and output
        self.fusion_fc = nn.Linear(hidden_dim * 2, hidden_dim)
        self.output_fc = nn.Linear(hidden_dim, num_answers)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, images, questions):
        # Image features
        img_features = self.relu(self.image_fc(images))  # [batch_size, hidden_dim]

        # Question features
        embedded = self.embedding(questions)  # [batch_size, seq_len, embed_dim]
        lstm_out, _ = self.lstm(embedded)  # [batch_size, seq_len, hidden_dim]

        # Attention: image features as query, question as key/value
        img_features = img_features.unsqueeze(0)  # [1, batch_size, hidden_dim]
        lstm_out = lstm_out.transpose(0, 1)  # [seq_len, batch_size, hidden_dim]
        attn_out, _ = self.attention(img_features, lstm_out, lstm_out)  # [1, batch_size, hidden_dim]
        attn_out = attn_out.squeeze(0)  # [batch_size, hidden_dim]

        # Fusion
        combined = torch.cat((attn_out, lstm_out[-1]), dim=1)  # [batch_size, hidden_dim*2]
        fused = self.relu(self.fusion_fc(combined))
        fused = self.dropout(fused)
        output = self.output_fc(fused)
        return output

In [4]:
def train_model(model, dataloader, num_epochs=150, device='cuda', patience=3):
    criterion = nn.CrossEntropyLoss(ignore_index=-1)  # Ignore unknown answers
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    model.to(device)

    best_loss = float('inf')
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0

        for images, questions, answers in dataloader:
            images, questions, answers = images.to(device), questions.to(device), answers.to(device)
            optimizer.zero_grad()
            outputs = model(images, questions)
            loss = criterion(outputs, answers)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == answers).sum().item()
            total += (answers != -1).sum().item()

        avg_loss = total_loss / len(dataloader)
        accuracy = correct / total * 100

        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")

        # Early stopping check
        if avg_loss < best_loss:
            best_loss = avg_loss
            epochs_without_improvement = 0
            # Optionally save the best model
            torch.save(model.state_dict(), 'best_vqa_model.pth')
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print(f"Early stopping triggered after {epoch + 1} epochs.")
                break


In [5]:
# Main script
if __name__ == "__main__":
    # File paths
    image_features_file = 'image_features.pkl'
    question_sequences_file = 'question_sequences.pkl'
    answer_vocab_file = 'answer_vocab.pkl'
    annotations_file = r'D:\Projects\asfdgfhjghk\dataset\v2_Annotations_Train_mscoco\v2_mscoco_train2014_annotations.json'

    # Load vocab to get vocab_size
    with open(question_sequences_file, 'rb') as f:
        question_sequences = pickle.load(f)
    with open(answer_vocab_file, 'rb') as f:
        answer_vocab = pickle.load(f)
    vocab_size = max([max(seq) for seq in question_sequences]) + 1  # Max index + 1

    # Create dataset and dataloader
    dataset = VQADataset(image_features_file, question_sequences_file, answer_vocab_file, annotations_file)
    dataloader = DataLoader(dataset, batch_size=64, shuffle=True, pin_memory=True)

    # Initialize model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = VQAModel(vocab_size, num_answers=len(answer_vocab))

    # Train
    train_model(model, dataloader, num_epochs=150, device=device, patience=3)

    # Save model
    torch.save(model.state_dict(), 'vqa_model.pth')
    print("Model saved to vqa_model.pth")

Epoch 1/50, Loss: 3.6116, Accuracy: 24.96%
Epoch 2/50, Loss: 3.1917, Accuracy: 26.84%
Epoch 3/50, Loss: 3.0127, Accuracy: 27.73%
Epoch 4/50, Loss: 2.8791, Accuracy: 28.11%
Epoch 5/50, Loss: 2.7654, Accuracy: 28.76%
Epoch 6/50, Loss: 2.6578, Accuracy: 29.42%
Epoch 7/50, Loss: 2.5656, Accuracy: 30.49%
Epoch 8/50, Loss: 2.4811, Accuracy: 31.26%
Epoch 9/50, Loss: 2.3993, Accuracy: 32.12%
Epoch 10/50, Loss: 2.3298, Accuracy: 33.76%
Epoch 11/50, Loss: 2.2637, Accuracy: 35.10%
Epoch 12/50, Loss: 2.1969, Accuracy: 36.96%
Epoch 13/50, Loss: 2.1368, Accuracy: 38.67%
Epoch 14/50, Loss: 2.0751, Accuracy: 40.34%
Epoch 15/50, Loss: 2.0168, Accuracy: 41.91%
Epoch 16/50, Loss: 1.9631, Accuracy: 43.53%
Epoch 17/50, Loss: 1.9077, Accuracy: 44.85%
Epoch 18/50, Loss: 1.8603, Accuracy: 46.02%
Epoch 19/50, Loss: 1.8175, Accuracy: 47.10%
Epoch 20/50, Loss: 1.7808, Accuracy: 48.35%
Epoch 21/50, Loss: 1.7374, Accuracy: 49.35%
Epoch 22/50, Loss: 1.7017, Accuracy: 50.15%
Epoch 23/50, Loss: 1.6741, Accuracy: 50.8