## Preprocessing of Question dataset

In [None]:
import nltk
import json
import pickle
from collections import Counter

# Download tokenizer model once
nltk.download('punkt')

# --- Load training questions ---
with open(r'D:\Projects\asfdgfhjghk\dataset\v2_Questions_Train_mscoco\v2_OpenEnded_mscoco_train2014_questions.json', 'r') as f:
    questions = json.load(f)['questions']

# --- Tokenize training questions ---
tokenized_questions = []
for q in questions:
    tokens = [word.lower() for word in nltk.word_tokenize(q['question'])]
    tokenized_questions.append(tokens)

# --- Build vocabulary (top 9,999 words + <PAD> and <UNK>) ---
word_counts = Counter(word for tokens in tokenized_questions for word in tokens)
most_common = word_counts.most_common(9999)  # Reserve 1 slot for <UNK>
vocab = {word: idx + 1 for idx, (word, _) in enumerate(most_common)}  # Start idx from 1
vocab['<PAD>'] = 0
vocab['<UNK>'] = 9999  # Use 9999 instead of len(vocab)

# --- Verify vocabulary size ---
print(f"Vocabulary size: {len(vocab)}")  # Should be 10000 (9999 words + <PAD> + <UNK>)

# --- Convert training tokens to sequences and pad ---
max_len = 14
question_sequences = []
for tokens in tokenized_questions:
    seq = [vocab.get(token, vocab['<UNK>']) for token in tokens[:max_len]]
    seq += [vocab['<PAD>']] * (max_len - len(seq))
    question_sequences.append(seq)

# --- Save vocabulary and training sequences ---
with open('PKL/vocabs/vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)

with open('PKL/q files/train_question_sequences.pkl', 'wb') as f:
    pickle.dump(question_sequences, f)

print(f"Training data processed: {len(question_sequences)} questions")
print(f"Vocabulary size (including special tokens): {len(vocab)}")

# --- Validation preprocessing ---
with open(r'D:\Projects\asfdgfhjghk\dataset\v2_Questions_Val_mscoco\v2_OpenEnded_mscoco_val2014_questions.json', 'r') as f:
    val_qs = json.load(f)['questions']

# --- Tokenize validation questions ---
val_tokens = [[tok.lower() for tok in nltk.word_tokenize(q['question'])] for q in val_qs]

# --- Load vocabulary ---
with open('PKL/vocabs/vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

# --- Convert validation tokens to sequences and pad ---
max_len = 14
val_sequences = []
for tokens in val_tokens:
    seq = [vocab.get(token, vocab['<UNK>']) for token in tokens[:max_len]]
    seq += [vocab['<PAD>']] * (max_len - len(seq))
    val_sequences.append(seq)

# --- Save validation sequences ---
with open('PKL/q files/val_question_sequences.pkl', 'wb') as f:
    pickle.dump(val_sequences, f)

print(f"Validation data processed: {len(val_sequences)} questions")

In [2]:
'''PREPROCESSING TRAINING DATASET OF QUESTIONS'''

import nltk
import json
import pickle
from collections import Counter

# Download tokenizer model once
nltk.download('punkt')

# --- Load training questions ---
with open(r'D:\Projects\asfdgfhjghk\dataset\v2_Questions_Train_mscoco\v2_OpenEnded_mscoco_train2014_questions.json',
          'r') as f:
    questions = json.load(f)['questions']

# --- Tokenize training questions ---
tokenized_questions = []
for q in questions:
    tokens = [word.lower() for word in nltk.word_tokenize(q['question'])]
    tokenized_questions.append(tokens)

# --- Build vocabulary (top 10,000 words + special tokens) ---
word_counts = Counter(word for tokens in tokenized_questions for word in tokens)
most_common = word_counts.most_common(10000)

vocab = {word: idx + 1 for idx, (word, _) in enumerate(most_common)}  # Start idx from 1
vocab['<PAD>'] = 0
vocab['<UNK>'] = len(vocab)  # Unknown token at the end

# --- Convert training tokens to sequences and pad ---
max_len = 14
question_sequences = []
for tokens in tokenized_questions:
    seq = [vocab.get(token, vocab['<UNK>']) for token in tokens[:max_len]]
    seq += [vocab['<PAD>']] * (max_len - len(seq))
    question_sequences.append(seq)

# --- Save vocabulary and training sequences ---
with open('PKL/vocabs/vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)

with open('PKL/q files/train_question_sequences.pkl', 'wb') as f:
    pickle.dump(question_sequences, f)

print(f"Training data processed: {len(question_sequences)} questions")
print(f"Vocabulary size (including special tokens): {len(vocab)}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anmol\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Training data processed: 443757 questions
Vocabulary size (including special tokens): 10002


In [3]:
'''PREPROCESSING VALIDATION DATASET OF QUESTIONS'''
import nltk
import json
import pickle

nltk.download('punkt')

# --- Load validation questions ---
with open(r'D:\Projects\asfdgfhjghk\dataset\v2_Questions_Val_mscoco\v2_OpenEnded_mscoco_val2014_questions.json',
          'r') as f:
    val_qs = json.load(f)['questions']

# --- Tokenize validation questions ---
val_tokens = [[tok.lower() for tok in nltk.word_tokenize(q['question'])] for q in val_qs]

# --- Load vocabulary saved from training ---
with open('PKL/vocabs/vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

# --- Convert validation tokens to sequences and pad ---
max_len = 14
val_sequences = []
for tokens in val_tokens:
    seq = [vocab.get(token, vocab['<UNK>']) for token in tokens[:max_len]]
    seq += [vocab['<PAD>']] * (max_len - len(seq))
    val_sequences.append(seq)

# --- Save validation sequences ---
with open('PKL/q files/val_question_sequences.pkl', 'wb') as f:
    pickle.dump(val_sequences, f)

print(f"Validation data processed: {len(val_sequences)} questions")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anmol\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Validation data processed: 214354 questions


In [1]:
# Inspect Tokenized Questions
import pickle

# Load tokenized questions
with open('PKL/q files/val_question_sequences.pkl', 'rb') as f:
    tokenized_questions = pickle.load(f)

# Print first 5 questions
print("First 5 tokenized questions:")
for i, tokens in enumerate(tokenized_questions[:5], 1):
    print(f"Question: {tokens}")

First 5 tokenized questions:
Question: [24, 3, 48, 100, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Question: [4, 5, 2, 17, 7, 2, 74, 33, 1, 0, 0, 0, 0, 0]
Question: [4, 3, 48, 8, 96, 9, 1, 0, 0, 0, 0, 0, 0, 0]
Question: [4, 1104, 3297, 2, 18, 1, 0, 0, 0, 0, 0, 0, 0, 0]
Question: [3, 6, 10, 10001, 1196, 1, 0, 0, 0, 0, 0, 0, 0, 0]


## Preprocessing of annotation dataset

In [12]:
'''PREPROCESSING TRAINING DATASET OF ANNOTATIONS'''
import json
import pickle
from collections import Counter

# Load answers
with open(r'D:\Projects\asfdgfhjghk\dataset\v2_Annotations_Train_mscoco\v2_mscoco_train2014_annotations.json',
          'r') as f:
    annotations = json.load(f)['annotations']

# Get most common answer per question
answers = []
for ann in annotations:
    ans_counts = Counter(a['answer'].lower() for a in ann['answers'])
    most_common = ans_counts.most_common(1)[0][0]  # Pick the top answer
    answers.append(most_common)

# Build answer vocabulary (top 3,000)
answer_vocab = {ans: idx for idx, (ans, _) in enumerate(Counter(answers).most_common(3000))}

# Save answer vocabulary
with open('PKL/vocabs/answer_vocab.pkl', 'wb') as f:
    pickle.dump(answer_vocab, f)

print(f"Created answer vocabulary with {len(answer_vocab)} answers")
# Convert training answers to indices using the answer vocabulary
train_answer_indices = [answer_vocab.get(ans, -1) for ans in answers]

# Save training answer indices
with open('PKL/val ans/train_answer_indices.pkl', 'wb') as f:
    pickle.dump(train_answer_indices, f)

print(f"Saved {len(train_answer_indices)} training answer indices")


Created answer vocabulary with 3000 answers
Saved 443757 training answer indices


In [5]:
'''PREPROCESSING VALIDATION DATASET OF ANNOTATIONS'''

import json
import pickle
from collections import Counter

# Load validation annotations
with open(r'D:\Projects\asfdgfhjghk\dataset\v2_Annotations_Val_mscoco\v2_mscoco_val2014_annotations.json', 'r') as f:
    val_annotations = json.load(f)['annotations']

# Load answer vocabulary built from training
with open('PKL/vocabs/answer_vocab.pkl', 'rb') as f:
    answer_vocab = pickle.load(f)

val_answers = []
for ann in val_annotations:
    ans_counts = Counter(a['answer'].lower() for a in ann['answers'])
    most_common = ans_counts.most_common(1)[0][0]
    val_answers.append(most_common)

# Convert validation answers to indices (use -1 or some value for unknown answers)
val_answer_indices = [answer_vocab.get(ans, -1) for ans in val_answers]

# Save validation answer indices
with open('PKL/val ans/val_answer_indices.pkl', 'wb') as f:
    pickle.dump(val_answer_indices, f)

print(f"Processed {len(val_answer_indices)} validation answers")

Processed 214354 validation answers


## Preprocessing of image dataset

In [1]:
import os
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import pickle
import json
from tqdm import tqdm
import time

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load question data
with open(r'D:\Projects\asfdgfhjghk\dataset\v2_Questions_Train_mscoco\v2_OpenEnded_mscoco_train2014_questions.json',
          'r') as f:
    questions = json.load(f)['questions']

# Get unique image IDs (as a list)
image_ids = list({q['image_id'] for q in questions})

# Define transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load ResNet model (remove final classification layer)
resnet = models.resnet50(pretrained=True).to(device)
resnet.eval()
resnet = torch.nn.Sequential(*list(resnet.children())[:-1])

# Image directory
image_dir = r'D:\Projects\asfdgfhjghk\dataset\train2014\train2014'
image_features = {}
batch_size = 64

# Verify directory
if not os.path.exists(image_dir):
    print(f"Error: Directory not found: {image_dir}")
    exit(1)
else:
    print(f"Directory found: {image_dir}")
    print(f"Number of files: {len(os.listdir(image_dir))}")

# Process images in batches
start_time = time.time()
batch_images = []
batch_image_ids = []

for i, image_id in enumerate(tqdm(image_ids, desc="Processing train images")):
    image_filename = f"COCO_train2014_{str(image_id).zfill(12)}.jpg"
    image_path = os.path.join(image_dir, image_filename)

    if not os.path.exists(image_path):
        print(f"Image not found: {image_path}")
        continue

    try:
        image = Image.open(image_path).convert('RGB')
        image = transform(image)
        batch_images.append(image)
        batch_image_ids.append(image_id)
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        continue

    if len(batch_images) == batch_size or i == len(image_ids) - 1:
        if batch_images:
            try:
                images = torch.stack(batch_images).to(device)
                with torch.no_grad():
                    features = resnet(images)
                    features = features.squeeze(-1).squeeze(-1).cpu().numpy()
                for img_id, feature in zip(batch_image_ids, features):
                    image_features[img_id] = feature
                del images, features
                torch.cuda.empty_cache()
            except RuntimeError as e:
                print(f"GPU error: {e}")
        batch_images = []
        batch_image_ids = []

# Save features
with open('train_image_features.pkl', 'wb') as f:
    pickle.dump(image_features, f)

end_time = time.time()
print(f"Extracted features for {len(image_features)} train images")
print(f"Total time: {end_time - start_time:.2f} seconds")


Using device: cuda




Directory found: D:\Projects\asfdgfhjghk\dataset\train2014\train2014
Number of files: 82783


Processing train images: 100%|██████████| 82783/82783 [22:56<00:00, 60.13it/s]  


Extracted features for 82783 train images
Total time: 1377.78 seconds


In [2]:
import os
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import pickle
import json
from tqdm import tqdm
import time

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load question data
with open(r'D:\Projects\asfdgfhjghk\dataset\v2_Questions_Val_mscoco\v2_OpenEnded_mscoco_val2014_questions.json',
          'r') as f:
    questions = json.load(f)['questions']

# Get unique image IDs (as a list)
image_ids = list({q['image_id'] for q in questions})

# Define transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load ResNet model (remove final classification layer)
resnet = models.resnet50(pretrained=True).to(device)
resnet.eval()
resnet = torch.nn.Sequential(*list(resnet.children())[:-1])

# Image directory
image_dir = r'D:\Projects\asfdgfhjghk\dataset\val2014\val2014'
image_features = {}
batch_size = 64

# Verify directory
if not os.path.exists(image_dir):
    print(f"Error: Directory not found: {image_dir}")
    exit(1)
else:
    print(f"Directory found: {image_dir}")
    print(f"Number of files: {len(os.listdir(image_dir))}")

# Process images in batches
start_time = time.time()
batch_images = []
batch_image_ids = []

for i, image_id in enumerate(tqdm(image_ids, desc="Processing val images")):
    image_filename = f"COCO_val2014_{str(image_id).zfill(12)}.jpg"
    image_path = os.path.join(image_dir, image_filename)

    if not os.path.exists(image_path):
        print(f"Image not found: {image_path}")
        continue

    try:
        image = Image.open(image_path).convert('RGB')
        image = transform(image)
        batch_images.append(image)
        batch_image_ids.append(image_id)
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        continue

    if len(batch_images) == batch_size or i == len(image_ids) - 1:
        if batch_images:
            try:
                images = torch.stack(batch_images).to(device)
                with torch.no_grad():
                    features = resnet(images)
                    features = features.squeeze(-1).squeeze(-1).cpu().numpy()
                for img_id, feature in zip(batch_image_ids, features):
                    image_features[img_id] = feature
                del images, features
                torch.cuda.empty_cache()
            except RuntimeError as e:
                print(f"GPU error: {e}")
        batch_images = []
        batch_image_ids = []

# Save features
with open('val_image_features.pkl', 'wb') as f:
    pickle.dump(image_features, f)

end_time = time.time()
print(f"Extracted features for {len(image_features)} val images")
print(f"Total time: {end_time - start_time:.2f} seconds")


Using device: cuda
Directory found: D:\Projects\asfdgfhjghk\dataset\val2014\val2014
Number of files: 40504


Processing val images: 100%|██████████| 40504/40504 [10:30<00:00, 64.22it/s]


Extracted features for 40504 val images
Total time: 631.07 seconds


In [33]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import json
import pickle
from tqdm import tqdm

# --- Dataset ---
class InMemoryVQADataset(Dataset):
    def __init__(self, meta, seqs, ans, feats):
        self.samples = []
        for i, q in enumerate(meta):
            img_id = q['image_id']
            label = ans[i]
            if label < 0 or img_id not in feats:
                continue
            feat_tensor = torch.tensor(feats[img_id], dtype=torch.float32)
            seq_tensor = torch.tensor(seqs[i], dtype=torch.long)
            lbl_tensor = torch.tensor(label, dtype=torch.long)
            self.samples.append((feat_tensor, seq_tensor, lbl_tensor))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

In [34]:
paths = {
    'train_json': r'D:\Projects\asfdgfhjghk\dataset\v2_Questions_Train_mscoco\v2_OpenEnded_mscoco_train2014_questions.json',
    'train_seq': r'D:\Projects\asfdgfhjghk\PKL\q files\train_question_sequences.pkl',
    'train_ans': r'D:\Projects\asfdgfhjghk\PKL\val ans\train_answer_indices.pkl',
    'train_img': r'D:\Projects\asfdgfhjghk\train_image_features.pkl',
    'val_json': r'D:\Projects\asfdgfhjghk\dataset\v2_Questions_Val_mscoco\v2_OpenEnded_mscoco_val2014_questions.json',
    'val_seq': r'D:\Projects\asfdgfhjghk\PKL\q files\val_question_sequences.pkl',
    'val_ans': r'D:\Projects\asfdgfhjghk\PKL\val ans\val_answer_indices.pkl',
    'val_img': r'D:\Projects\asfdgfhjghk\val_image_features.pkl',
}

# Load training data
with open(paths['train_json'], 'r') as f:
    train_meta = json.load(f)['questions']
train_seqs = pickle.load(open(paths['train_seq'], 'rb'))
train_ans = pickle.load(open(paths['train_ans'], 'rb'))
train_feats = pickle.load(open(paths['train_img'], 'rb'))

# Load validation data
with open(paths['val_json'], 'r') as f:
    val_meta = json.load(f)['questions']
val_seqs = pickle.load(open(paths['val_seq'], 'rb'))
val_ans = pickle.load(open(paths['val_ans'], 'rb'))
val_feats = pickle.load(open(paths['val_img'], 'rb'))

# Create datasets and dataloaders
train_ds = InMemoryVQADataset(train_meta, train_seqs, train_ans, train_feats)
val_ds = InMemoryVQADataset(val_meta, val_seqs, val_ans, val_feats)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, num_workers=0, pin_memory=True)
val_loader = DataLoader(val_ds, batch_size=64, shuffle=False, num_workers=0, pin_memory=True)

print(f"Train samples: {len(train_ds)}")
print(f"Val samples: {len(val_ds)}")


Train samples: 412840
Val samples: 197882


In [35]:
class ImprovedVQAModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, img_feat_size, num_classes,
                 lstm_layers=2, num_heads=8, dropout=0.3):
        super().__init__()

        # Text Encoder
        self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.lstm = nn.LSTM(embed_size, hidden_size,
                           num_layers=lstm_layers,
                           bidirectional=True,
                           batch_first=True,
                           dropout=dropout)

        # Image Encoder
        self.img_encoder = nn.Sequential(
            nn.Linear(img_feat_size, hidden_size*2),
            nn.ReLU(),
            nn.LayerNorm(hidden_size*2),
            nn.Dropout(dropout)
        )

        # Attention
        self.cross_attn = nn.MultiheadAttention(
            embed_dim=hidden_size*2,  # Fixed dimension
            num_heads=num_heads,
            dropout=dropout,
            batch_first=True
        )

        # Classifier
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size*4, hidden_size*2),  # Fixed input dimension
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size*2, num_classes)
        )

    def forward(self, img, ques):
        # Text
        emb = self.embed(ques)
        lstm_out, _ = self.lstm(emb)

        # Image
        img_feat = self.img_encoder(img).unsqueeze(1)  # [batch, 1, hidden_size*2]

        # Cross Attention
        attn_out, _ = self.cross_attn(
            query=img_feat,
            key=lstm_out,
            value=lstm_out
        )

        # Combine features
        text_pool = lstm_out.mean(dim=1)  # [batch, hidden_size*2]
        img_pool = attn_out.squeeze(1)    # [batch, hidden_size*2]
        combined = torch.cat([text_pool, img_pool], dim=1)  # [batch, hidden_size*4]

        return self.classifier(combined)

In [37]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class EarlyStopping:
    def __init__(self, patience=3, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.best_loss = float('inf')
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss, model, epoch):
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            # Save best model
            torch.save(model.state_dict(), 'vqa_model_best.pth')
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
                print(f"Early stopping triggered at epoch {epoch + 1}")

# --- Training Setup ---
print(f"Using device: {device}")
vocab_size = 10000
num_answers = 3000
model = ImprovedVQAModel(
    vocab_size=10000,
    embed_size=300,
    hidden_size=512,
    img_feat_size=2048,  # Typical size for ResNet features
    num_classes=3129,    # Common VQA answer space size
    lstm_layers=2,
    num_heads=8,
    dropout=0.3,
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=5e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.5)
early_stopping = EarlyStopping(patience=3, min_delta=0.001)

# --- Training Loop ---
epochs = 100
for epoch in range(epochs):
    model.train()
    running_loss = 0
    correct = 0
    total = 0
    for img_feats, questions, labels in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
        img_feats, questions, labels = img_feats.to(device), questions.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(img_feats, questions)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * img_feats.size(0)
        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
    scheduler.step()
    train_loss = running_loss / total
    train_acc = correct / total
    print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")

    model.eval()
    val_loss = 0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for img_feats, questions, labels in val_loader:
            img_feats, questions, labels = img_feats.to(device), questions.to(device), labels.to(device)
            outputs = model(img_feats, questions)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * img_feats.size(0)
            preds = outputs.argmax(dim=1)
            val_correct += (preds == labels).sum().item()
            val_total += labels.size(0)
    val_loss = val_loss / val_total
    val_acc = val_correct / val_total
    print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    # Save checkpoint
    torch.save(model.state_dict(), f'vqa_model_epoch_{epoch+1}.pth')

    # Early stopping
    early_stopping(val_loss, model, epoch)
    if early_stopping.early_stop:
        print("Loading best model from early stopping")
        model.load_state_dict(torch.load('vqa_model_best.pth'))
        break

Using device: cuda


Epoch 1/100: 100%|██████████| 6451/6451 [03:47<00:00, 28.36it/s]


Train Loss: 2.8670, Train Acc: 0.2989
Val Loss: 2.3815, Val Acc: 0.3400


Epoch 2/100: 100%|██████████| 6451/6451 [03:45<00:00, 28.63it/s]


Train Loss: 2.2494, Train Acc: 0.3517
Val Loss: 2.1278, Val Acc: 0.3748


Epoch 3/100: 100%|██████████| 6451/6451 [03:39<00:00, 29.43it/s]


Train Loss: 2.0243, Train Acc: 0.3780
Val Loss: 2.0226, Val Acc: 0.3916


Epoch 4/100: 100%|██████████| 6451/6451 [03:36<00:00, 29.77it/s]


Train Loss: 1.8524, Train Acc: 0.4024
Val Loss: 1.9628, Val Acc: 0.4054


Epoch 5/100: 100%|██████████| 6451/6451 [03:35<00:00, 29.87it/s]


Train Loss: 1.7700, Train Acc: 0.4152
Val Loss: 1.9521, Val Acc: 0.4110


Epoch 6/100: 100%|██████████| 6451/6451 [03:36<00:00, 29.80it/s]


Train Loss: 1.7079, Train Acc: 0.4267
Val Loss: 1.9474, Val Acc: 0.4146


Epoch 7/100: 100%|██████████| 6451/6451 [03:35<00:00, 29.87it/s]


Train Loss: 1.6144, Train Acc: 0.4451
Val Loss: 1.9735, Val Acc: 0.4225


Epoch 8/100: 100%|██████████| 6451/6451 [03:35<00:00, 29.94it/s]


Train Loss: 1.5697, Train Acc: 0.4546
Val Loss: 1.9821, Val Acc: 0.4241


Epoch 9/100: 100%|██████████| 6451/6451 [03:35<00:00, 29.92it/s]


Train Loss: 1.5332, Train Acc: 0.4630
Val Loss: 2.0121, Val Acc: 0.4263
Early stopping triggered at epoch 9
Loading best model from early stopping


In [31]:
import pickle
with open(r'D:\Projects\asfdgfhjghk\PKL\vocabs\answer_vocab.pkl', 'rb') as f:
    answer_vocab = pickle.load(f)
print(f"Number of answers: {len(answer_vocab)}")
print(f"Sample answers: {list(answer_vocab.items())[:5]}")
# Check answer distribution in annotations
import json
from collections import Counter
with open(r'D:\Projects\asfdgfhjghk\dataset\v2_Annotations_Train_mscoco\v2_mscoco_train2014_annotations.json', 'r') as f:
    annotations = json.load(f)['annotations']
answers = [Counter(a['answer'].lower() for a in ann['answers']).most_common(1)[0][0] for ann in annotations[:10000]]
print(f"Top 5 answers in data: {Counter(answers).most_common(5)}")

Number of answers: 3000
Sample answers: [('no', 0), ('yes', 1), ('2', 2), ('1', 3), ('white', 4)]
Top 5 answers in data: [('yes', 1890), ('no', 1804), ('1', 280), ('2', 245), ('white', 189)]
