In [None]:
import os
import json
import h5py
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [None]:
!pip install tensorboard
!pip install loguru

In [None]:
class VQADataset(Dataset):
    """Map-style dataset over preprocessed question/answer and image-feature HDF5 files.

    Every array of the selected split is read fully into memory in ``__init__``
    (the ``[:]`` reads below), so ``__getitem__`` only indexes in-memory arrays.

    Args:
        ques_h5: Path to the HDF5 file with the question arrays (``ques_*``,
            ``ques_len_*``, ``ques_id_*``, ``img_pos_*``) and the answers.
        img_h5: Path to the HDF5 file with the ``images_*`` feature arrays.
        json_file: Path to the JSON metadata file; only its ``ix_to_word``
            entry is used, to derive ``vocab_size``.
        split: ``'train'`` selects the ``*_train`` arrays; any other value
            selects the ``*_test`` arrays.
    """

    def __init__(self, ques_h5, img_h5, json_file, split='train'):
        # The HDF5 handles are kept open on the instance.
        self.h5_img = h5py.File(img_h5, 'r')
        self.h5_ques = h5py.File(ques_h5, 'r')
        # Context manager so the JSON file handle is closed right after parsing.
        with open(json_file) as json_fh:
            self.json_data = json.load(json_fh)
        # NOTE(review): if word indices in `ques_*` are 1-based, an embedding of
        # size `vocab_size` is one row too small - confirm against the prepro script.
        self.vocab_size = len(self.json_data['ix_to_word'])

        dataset_key = 'train' if split == 'train' else 'test'
        self.img_pos = self.h5_ques[f'img_pos_{dataset_key}'][:]
        self.fv_ims = self.h5_img[f'images_{dataset_key}'][:]
        self.ques = self.h5_ques[f'ques_{dataset_key}'][:]
        self.ques_len = self.h5_ques[f'ques_len_{dataset_key}'][:]
        self.ques_id = self.h5_ques[f'ques_id_{dataset_key}'][:]
        # The two splits store their labels under differently named keys.
        if dataset_key == 'test':
            self.answer = self.h5_ques[f'ans_{dataset_key}'][:]
        else:
            self.answer = self.h5_ques['answers'][:]

    def __len__(self):
        """Return the number of questions in the split."""
        return len(self.ques_id)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors.

        Keys: ``'image'`` (float features of the image the question refers to),
        ``'question'`` (long token ids), ``'question_len'`` (long) and
        ``'answer'`` (long label).
        """
        # NOTE(review): assumes `img_pos` holds 0-based row indices into
        # `fv_ims` - confirm against the preprocessing script.
        return {
            'image': torch.tensor(self.fv_ims[self.img_pos[idx]], dtype=torch.float),
            'question': torch.tensor(self.ques[idx], dtype=torch.long),
            'question_len': torch.tensor(self.ques_len[idx], dtype=torch.long),
            'answer': torch.tensor(self.answer[idx], dtype=torch.long)
        }

# Baseline LSTM+CNN

In [None]:
class QuestionEmbedding(nn.Module):
    """Word-embedding front end: lookup, then dropout, then tanh.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_size: Width of each embedding vector.
        dropout: Dropout probability applied to the looked-up embeddings.
    """

    def __init__(self, vocab_size, embedding_size, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.dropout = nn.Dropout(dropout)
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Map token ids ``(batch, seq_len)`` to ``(batch, seq_len, embedding_size)``."""
        looked_up = self.embedding(x)
        return self.tanh(self.dropout(looked_up))

class QuestionEncoder(nn.Module):
    """Encode embedded questions with an LSTM and return the final hidden state.

    Args:
        embedding_size: Width of the input embeddings.
        lstm_size: Hidden size of the LSTM (and of the returned encoding).
        num_layers: Number of stacked LSTM layers.
        dropout: Inter-layer LSTM dropout; only applied when ``num_layers > 1``.
    """

    def __init__(self, embedding_size, lstm_size, num_layers, dropout=0.5):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=lstm_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )

    def forward(self, embeddings, lengths):
        """Return the last layer's final hidden state, one row per question.

        Args:
            embeddings: Padded embeddings of shape ``(batch, seq_len, embedding_size)``.
            lengths: Tensor of shape ``(batch,)`` with the true length of each question.

        Returns:
            Tensor of shape ``(batch, lstm_size)``, rows in the same order as the input.
        """
        # Packing lets the LSTM stop at each question's true length.
        # enforce_sorted=False: the batch no longer has to arrive sorted by
        # length; PyTorch sorts internally and restores the input order in `hn`.
        packed = nn.utils.rnn.pack_padded_sequence(
            embeddings, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (hn, _) = self.lstm(packed)
        # hn: [num_layers, batch, lstm_size] -> keep the top layer only
        return hn[-1]  # (batch, lstm_size)

In [None]:
import torch
import torch.nn as nn

class MultimodalNet(nn.Module):
    """Fuse a question encoding and an image feature vector into answer scores.

    Both inputs are projected to a common width with a linear layer followed
    by tanh, passed through dropout, multiplied element-wise and fed to a
    linear classifier.

    Args:
        q_dim: Width of the question encoding.
        i_dim: Width of the image feature vector.
        common_embedding_size: Width of the shared projection space.
        noutput: Number of output scores.
        dropout: Dropout probability applied to both projections.
    """

    def __init__(self, q_dim, i_dim, common_embedding_size, noutput, dropout=0.5):
        super().__init__()

        # Question branch: linear projection + tanh
        self.q_proj = nn.Sequential(nn.Linear(q_dim, common_embedding_size), nn.Tanh())
        # Image branch: linear projection + tanh
        self.i_proj = nn.Sequential(nn.Linear(i_dim, common_embedding_size), nn.Tanh())

        # One dropout module shared by both branches
        self.dropout = nn.Dropout(dropout)

        # Classifier over the fused representation
        self.out = nn.Linear(common_embedding_size, noutput)

    def forward(self, q, i):
        """Return scores of shape ``(batch, noutput)`` for question ``q`` and image ``i``."""
        q_feat = self.q_proj(q)
        i_feat = self.i_proj(i)

        # Element-wise product of the two dropped-out projections
        fused = self.dropout(q_feat) * self.dropout(i_feat)

        return self.out(fused)


In [None]:
class VQAModel(nn.Module):
    """Baseline VQA model: question embedding -> LSTM encoder -> multimodal fusion.

    Args:
        vocab_size: Size of the word-embedding table.
        embedding_size: Width of the word embeddings.
        lstm_size: Hidden size of the question LSTM.
        num_layers: Number of LSTM layers.
        image_feat_dim: Width of the image feature vector.
        common_embedding_size: Width of the shared question/image space.
        noutput: Number of answer scores.
        dropout: Dropout probability passed to all three sub-modules.
    """

    def __init__(self, vocab_size, embedding_size, lstm_size, num_layers,
                 image_feat_dim, common_embedding_size, noutput, dropout=0.5):
        super().__init__()
        self.embedding_net = QuestionEmbedding(vocab_size, embedding_size, dropout)
        self.encoder = QuestionEncoder(embedding_size, lstm_size, num_layers, dropout)
        self.multimodal = MultimodalNet(
            lstm_size, image_feat_dim, common_embedding_size, noutput, dropout
        )

    def forward(self, question, lengths, image):
        """Score answers for a batch.

        Args:
            question: Token ids, ``(batch, seq_len)``.
            lengths: True question lengths, ``(batch,)``.
            image: Image features passed unchanged to the fusion network.

        Returns:
            Scores of shape ``(batch, noutput)``.
        """
        embedded = self.embedding_net(question)       # (batch, seq_len, embedding_size)
        encoded = self.encoder(embedded, lengths)     # (batch, lstm_size)
        return self.multimodal(encoded, image)        # (batch, noutput)

In [None]:
def collate_fn(batch):
    """Collate dataset samples into batch tensors ordered by question length.

    Args:
        batch: List of sample dicts with the keys ``'question'``,
            ``'question_len'``, ``'image'`` and ``'answer'``.

    Returns:
        Tuple ``(questions, lengths, images, answers)`` whose rows are ordered
        by descending question length. The input list is left untouched.
    """
    # Sort into a new list (longest question first) instead of reordering the
    # caller's list in place.
    ordered = sorted(batch, key=lambda x: x['question_len'], reverse=True)

    # Assemble the per-sample values into batch tensors
    questions = torch.stack([item['question'] for item in ordered], dim=0)
    lengths = torch.tensor([item['question_len'] for item in ordered])
    images = torch.stack([item['image'] for item in ordered], dim=0)
    answers = torch.tensor([item['answer'] for item in ordered])

    return questions, lengths, images, answers


In [None]:
def compute_accuracy(outputs, labels):
    """Return the fraction of rows in ``outputs`` whose argmax equals ``labels``.

    Args:
        outputs: Score tensor of shape ``(batch, num_classes)``.
        labels: Label tensor of shape ``(batch,)``.
    """
    predicted = outputs.argmax(dim=1)
    num_correct = predicted.eq(labels).sum().item()
    batch_size = labels.size(0)
    return num_correct / batch_size

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.amp import GradScaler, autocast
from torch.utils.data import DataLoader, random_split
from tqdm import tqdm
from loguru import logger
from torch.utils.tensorboard import SummaryWriter  # Import TensorBoard

# Configure Loguru so the log stays readable on Kaggle: remove the handlers
# registered so far and send INFO-and-above messages through tqdm.write.
logger.remove()
logger.add(lambda msg: tqdm.write(msg, end=""), level="INFO")


def train():
    """Train the baseline LSTM+CNN ``VQAModel`` with early stopping.

    Loads the preprocessed question/image HDF5 files and the JSON metadata
    from the hard-coded Kaggle paths below, splits the training data 80/20
    into train/validation subsets and runs up to ``max_iters`` epochs.
    Per-epoch loss and accuracy are written to TensorBoard under ``logs/``.
    Whenever the validation loss improves, the weights are saved to
    ``model/best_model.pth``; training stops after ``patience`` consecutive
    epochs without improvement. The weights held at the end are saved to
    ``model/lstm_final.pth``.

    Returns:
        None.
    """
    # Parameters
    input_img_h5 = '/kaggle/input/meta-data/data_img.h5'
    input_ques_h5 = '/kaggle/input/meta-data/cocoqa_data_prepro.h5'
    input_json = '/kaggle/input/meta-data/cocoqa_data_prepro.json'
    learning_rate = 5e-5
    batch_size = 16
    max_iters = 10000  # upper bound on the number of epochs (full passes over train_loader)
    input_encoding_size = 200
    rnn_size = 512
    rnn_layers = 2
    common_embedding_size = 1024
    noutput = 19
    checkpoint_path = 'model/'
    log_dir = 'logs/'  # Directory for the TensorBoard logs
    seed = 42
    patience = 5  # Early stopping patience

    # Set the random seed and pick the device
    torch.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Mixed precision (autocast + gradient scaling) is only switched on for CUDA;
    # on CPU both become pass-throughs instead of being requested for 'cuda'.
    use_amp = device.type == 'cuda'
    logger.info(f"Using device: {device}")

    # Create the checkpoint and log directories if they do not exist yet
    os.makedirs(checkpoint_path, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)

    # Initialise the TensorBoard writer
    writer = SummaryWriter(log_dir)

    # try/finally so the writer is flushed and closed even if training fails
    try:
        # Load dataset
        logger.info("Loading dataset...")
        full_dataset = VQADataset(input_ques_h5, input_img_h5, input_json, split='train')
        # Train/validation split (80/20)
        train_size = int(0.8 * len(full_dataset))
        val_size = len(full_dataset) - train_size
        train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

        logger.info(f"Total dataset size: {len(full_dataset)}")
        logger.info(f"Train dataset size: {train_size}")
        logger.info(f"Validation dataset size: {val_size}")

        # DataLoaders. The training loader reshuffles every epoch (as the
        # co-attention trainer does); collate_fn still sorts each batch by length.
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

        # Build the model
        logger.info("Building model...")
        model = VQAModel(
            vocab_size=full_dataset.vocab_size,
            embedding_size=input_encoding_size,
            lstm_size=rnn_size,
            num_layers=rnn_layers,
            image_feat_dim=4096,
            common_embedding_size=common_embedding_size,
            noutput=noutput,
            dropout=0.5
        ).to(device)

        # Loss and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-4)
        scaler = GradScaler(device=device.type, enabled=use_amp)

        # Early stopping state
        best_val_loss = float('inf')
        patience_counter = 0

        logger.info("🚀 Starting training...")

        for iteration in range(max_iters):
            model.train()
            running_train_loss = 0.0
            running_train_acc = 0.0

            # Training loop
            for batch_idx, (questions, lengths, images, answers) in enumerate(tqdm(train_loader, desc=f"Epoch {iteration+1}")):
                questions, lengths, images, answers = (questions.to(device),
                                                       lengths.to(device),
                                                       images.to(device),
                                                       answers.to(device))

                optimizer.zero_grad()

                with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                    outputs = model(questions, lengths, images)
                    loss = criterion(outputs, answers)

                scaler.scale(loss).backward()

                # Gradient clipping (gradients must be unscaled first)
                scaler.unscale_(optimizer)
                nn.utils.clip_grad_norm_(model.parameters(), 10)

                scaler.step(optimizer)
                scaler.update()

                running_train_loss += loss.item()
                running_train_acc += compute_accuracy(outputs, answers)  # per-batch accuracy

            # Average loss & accuracy over the batches of this epoch
            train_loss = running_train_loss / len(train_loader)
            train_acc = running_train_acc / len(train_loader)
            writer.add_scalar("Loss/Train", train_loss, iteration)
            writer.add_scalar("Accuracy/Train", train_acc, iteration)

            # Validation loop
            model.eval()
            running_val_loss = 0.0
            running_val_acc = 0.0
            with torch.no_grad():
                for questions, lengths, images, answers in val_loader:
                    questions, lengths, images, answers = (questions.to(device),
                                                           lengths.to(device),
                                                           images.to(device),
                                                           answers.to(device))

                    with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                        outputs = model(questions, lengths, images)
                        loss = criterion(outputs, answers)

                    running_val_loss += loss.item()
                    running_val_acc += compute_accuracy(outputs, answers)

            # Average validation loss & accuracy over the batches
            val_loss = running_val_loss / len(val_loader)
            val_acc = running_val_acc / len(val_loader)
            writer.add_scalar("Loss/Validation", val_loss, iteration)
            writer.add_scalar("Accuracy/Validation", val_acc, iteration)

            logger.info(f"📊 Iter {iteration+1} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

            # Early stopping check
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                best_model_path = os.path.join(checkpoint_path, 'best_model.pth')
                torch.save(model.state_dict(), best_model_path)
                logger.success(f"✅ New best model saved at {best_model_path} (Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f})")
            else:
                patience_counter += 1
                logger.warning(f"⏳ Early stopping counter: {patience_counter}/{patience}")
                if patience_counter >= patience:
                    logger.error("⛔ Early stopping triggered! Training stopped.")
                    break

        # Save the final weights (the last epoch, not necessarily the best one)
        final_ckpt = os.path.join(checkpoint_path, 'lstm_final.pth')
        torch.save(model.state_dict(), final_ckpt)
        logger.success(f"🏆 Final model saved to {final_ckpt}")
    finally:
        # Close the TensorBoard writer
        writer.close()


In [None]:
# Runs `train` as currently defined; in notebook order that is the baseline
# LSTM+CNN trainer from the cell above.
train()

# Alternating Co-Attention

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as fn
from torch.nn.utils.rnn import pad_packed_sequence
class CoattentionNet(nn.Module):
    """
    Predicts an answer to a question about an image using Alternating Co-Attention only.

    Word-, phrase- and sentence-level question features are each co-attended
    with the image features; the three attended (question, image) pairs are
    then fused level by level into answer logits.

    Args:
        num_embeddings: Size of the word-embedding table.
        num_classes: Number of answer classes (width of the returned logits).
        embed_dim: Shared feature width. The image features are viewed as
            ``embed_dim`` channels, so it must equal the image channel count
            (512 with the default).
    """
    def __init__(self, num_embeddings, num_classes, embed_dim=512):
        super().__init__()

        self.embed = nn.Embedding(num_embeddings, embed_dim)

        # With these padding/dilation values all three convolutions keep the
        # sequence length T, so their outputs can be stacked and max-pooled.
        self.unigram_conv = nn.Conv1d(embed_dim, embed_dim, 1, stride=1, padding=0)
        self.bigram_conv  = nn.Conv1d(embed_dim, embed_dim, 2, stride=1, padding=1, dilation=2)
        self.trigram_conv = nn.Conv1d(embed_dim, embed_dim, 3, stride=1, padding=2, dilation=2)
        self.max_pool = nn.MaxPool2d((3, 1))
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=embed_dim, num_layers=3, dropout=0.4)
        self.tanh = nn.Tanh()

        # Alternating Co-Attention parameters (shared by all three levels)
        self.W_x = nn.Linear(embed_dim, embed_dim)
        self.W_g = nn.Linear(embed_dim, embed_dim)
        self.w_hx = nn.Linear(embed_dim, 1)

        # Fully connected layers for answer prediction
        self.W_w = nn.Linear(embed_dim, embed_dim)
        self.W_p = nn.Linear(embed_dim * 2, embed_dim)
        self.W_s = nn.Linear(embed_dim * 2, embed_dim)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, question, length, image):
        """Return answer logits of shape ``[B, num_classes]``.

        Args:
            question: Padded token ids, ``[B, T]``.
            length: True question lengths, ``[B]``.
            image: Image features that can be viewed as ``[B, embed_dim, N]``.
                NOTE(review): assumes a channel-first memory layout; if the
                stored features are ``[B, H, W, C]`` this view scrambles them -
                confirm against the feature-extraction script.
        """
        # Embedding & convs
        words = self.embed(question).permute(0, 2, 1)  # [B, D, T]

        unigrams = torch.unsqueeze(self.tanh(self.unigram_conv(words)), 2)
        bigrams  = torch.unsqueeze(self.tanh(self.bigram_conv(words)), 2)
        trigrams = torch.unsqueeze(self.tanh(self.trigram_conv(words)), 2)

        words = words.permute(0, 2, 1)  # [B, T, D] back for co-attn later

        # Phrase-level features: max over the three n-gram responses.
        # Squeeze only the pooled axis; a bare squeeze() would also drop the
        # batch axis when B == 1 (or the time axis when T == 1).
        phrase = self.max_pool(torch.cat((unigrams, bigrams, trigrams), 2)).squeeze(2)  # [B, D, T]
        phrase = phrase.permute(0, 2, 1)  # [B, T, D] for LSTM

        # Pack LSTM input
        packed = nn.utils.rnn.pack_padded_sequence(phrase, length.cpu(), batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed)
        # Padded back only up to the longest question in the batch
        sentence, _ = pad_packed_sequence(packed_output, batch_first=True)  # [B, max(length), D]

        # Channel count follows embed_dim (512 by default) instead of a literal
        image = image.view(image.shape[0], self.embed.embedding_dim, -1)  # [B, D, N]

        # Co-attention at each level
        v_word, q_word = self.alternating_co_attention(words, image)
        v_phrase, q_phrase = self.alternating_co_attention(phrase, image)
        v_sent, q_sent = self.alternating_co_attention(sentence, image)

        # Fusion: each level is concatenated with the fused result of the level below
        h_w = self.tanh(self.W_w(q_word + v_word))
        h_p = self.tanh(self.W_p(torch.cat(((q_phrase + v_phrase), h_w), dim=1)))
        h_s = self.tanh(self.W_s(torch.cat(((q_sent + v_sent), h_p), dim=1)))

        logits = self.fc(h_s)
        return logits


    def alternating_co_attention(self, Q, V):
        """
        Q: [B, T, D]
        V: [B, D, N] → transposed to [B, N, D] below

        Returns ``(attended_v, final_q)``, both [B, D]. Padded question
        positions are not masked out of the softmax.
        """
        # Ensure V shape is compatible
        V = V.permute(0, 2, 1)  # [B, N, D]

        # Step 1: Attend to the question
        H_q = self.tanh(self.W_x(Q))  # [B, T, D]
        a_q = fn.softmax(self.w_hx(H_q), dim=1)  # [B, T, 1]
        attended_q = torch.sum(a_q * Q, dim=1)   # [B, D]

        # Step 2: Attend to the image using attended question
        H_v = self.tanh(self.W_x(V) + self.W_g(attended_q).unsqueeze(1))  # [B, N, D]
        a_v = fn.softmax(self.w_hx(H_v), dim=1)  # [B, N, 1]
        attended_v = torch.sum(a_v * V, dim=1)   # [B, D]

        # Step 3: Attend back to the question using attended image
        H_q_final = self.tanh(self.W_x(Q) + self.W_g(attended_v).unsqueeze(1))  # [B, T, D]
        a_q_final = fn.softmax(self.w_hx(H_q_final), dim=1)  # [B, T, 1]
        final_q = torch.sum(a_q_final * Q, dim=1)            # [B, D]

        return attended_v, final_q



In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import h5py
import json
from torch.utils.data import Dataset, DataLoader, random_split
from torch.amp import GradScaler, autocast
from tqdm import tqdm
from loguru import logger
from torch.utils.tensorboard import SummaryWriter  # TensorBoard logging

# Logging configuration: remove the handlers registered so far and send
# INFO-and-above messages through tqdm.write.
logger.remove()
logger.add(lambda msg: tqdm.write(msg, end=""), level="INFO")

def train():
    """Train ``CoattentionNet`` with early stopping on validation loss.

    Loads the question data and the attention image features from the
    hard-coded Kaggle paths below, splits the training data 80/20 into
    train/validation subsets and runs up to ``max_iters`` epochs. Per-epoch
    loss and accuracy are written to TensorBoard under ``logs_att/``.
    Whenever the validation loss improves, the weights are saved to
    ``model_att/best_model.pth``; training stops after ``patience``
    consecutive epochs without improvement. The weights held at the end are
    saved to ``model_att/final_coattention.pth``.

    Returns:
        None.
    """
    # Parameters
    input_img_h5 = '/kaggle/input/cnn-image/data_img_att.h5'
    input_ques_h5 = '/kaggle/input/meta-data/cocoqa_data_prepro.h5'
    input_json = '/kaggle/input/meta-data/cocoqa_data_prepro.json'
    learning_rate = 1e-5
    batch_size = 16
    max_iters = 10000  # upper bound on the number of epochs (full passes over train_loader)
    embed_dim = 512
    noutput = 19
    checkpoint_path = 'model_att/'
    log_dir = 'logs_att/'
    seed = 42
    patience = 5  # Early stopping patience

    # Set random seed and device
    torch.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Mixed precision (autocast + gradient scaling) is only switched on for CUDA;
    # on CPU both become pass-throughs instead of being requested for 'cuda'.
    use_amp = device.type == 'cuda'
    logger.info(f"Using device: {device}")

    # Create directories if not exist
    os.makedirs(checkpoint_path, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)

    # TensorBoard writer
    writer = SummaryWriter(log_dir)

    # try/finally so the writer is flushed and closed even if training fails
    try:
        # Load dataset
        logger.info("Loading dataset...")
        full_dataset = VQADataset(input_ques_h5, input_img_h5, input_json, split='train')

        # Train-validation split (80/20)
        train_size = int(0.8 * len(full_dataset))
        val_size = len(full_dataset) - train_size
        train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

        logger.info(f"Total dataset: {len(full_dataset)}")
        logger.info(f"Train dataset: {train_size}")
        logger.info(f"Validation dataset: {val_size}")

        # DataLoader (default collation: each batch is a dict keyed like the dataset samples)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # Initialize model
        logger.info("Building Co-Attention model...")
        model = CoattentionNet(
            num_embeddings=full_dataset.vocab_size,
            num_classes=noutput,
            embed_dim=embed_dim
        ).to(device)

        # Loss and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-4)
        scaler = GradScaler(device=device.type, enabled=use_amp)

        # Early Stopping
        best_val_loss = float('inf')
        patience_counter = 0

        logger.info("🚀 Starting training...")

        for iteration in range(max_iters):
            model.train()
            running_train_loss = 0.0
            running_train_acc = 0.0

            for batch in tqdm(train_loader, desc=f"Epoch {iteration+1}"):
                images = batch['image'].to(device)
                questions = batch['question'].to(device)
                answers = batch['answer'].to(device)
                # The dataset stores the lengths under 'question_len' (there is no 'lengths' key)
                lengths = batch['question_len'].to(device)
                optimizer.zero_grad()

                with autocast(device_type=device.type, enabled=use_amp):
                    outputs = model(questions, lengths, images)
                    loss = criterion(outputs, answers)

                scaler.scale(loss).backward()

                # Gradient clipping (gradients must be unscaled first)
                scaler.unscale_(optimizer)
                nn.utils.clip_grad_norm_(model.parameters(), 10)

                scaler.step(optimizer)
                scaler.update()

                running_train_loss += loss.item()
                running_train_acc += compute_accuracy(outputs, answers)

            # Compute average loss & accuracy over the batches
            train_loss = running_train_loss / len(train_loader)
            train_acc = running_train_acc / len(train_loader)
            writer.add_scalar("Loss/Train", train_loss, iteration)
            writer.add_scalar("Accuracy/Train", train_acc, iteration)

            # Validation Loop
            model.eval()
            running_val_loss = 0.0
            running_val_acc = 0.0
            with torch.no_grad():
                for batch in val_loader:
                    images = batch['image'].to(device)
                    questions = batch['question'].to(device)
                    answers = batch['answer'].to(device)
                    lengths = batch['question_len'].to(device)

                    with autocast(device_type=device.type, enabled=use_amp):
                        # Same (question, length, image) argument order as in training
                        outputs = model(questions, lengths, images)
                        loss = criterion(outputs, answers)

                    running_val_loss += loss.item()
                    running_val_acc += compute_accuracy(outputs, answers)

            val_loss = running_val_loss / len(val_loader)
            val_acc = running_val_acc / len(val_loader)
            writer.add_scalar("Loss/Validation", val_loss, iteration)
            writer.add_scalar("Accuracy/Validation", val_acc, iteration)

            logger.info(f"📊 Iter {iteration+1} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

            # Early Stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                best_model_path = os.path.join(checkpoint_path, 'best_model.pth')
                torch.save(model.state_dict(), best_model_path)
                logger.success(f"✅ Best model saved at {best_model_path} (Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f})")
            else:
                patience_counter += 1
                logger.warning(f"⏳ Early stopping counter: {patience_counter}/{patience}")
                if patience_counter >= patience:
                    logger.error("⛔ Early stopping triggered! Training stopped.")
                    break

        # Save final model (the last epoch, not necessarily the best one)
        final_ckpt = os.path.join(checkpoint_path, 'final_coattention.pth')
        torch.save(model.state_dict(), final_ckpt)
        logger.success(f"🏆 Final model saved to {final_ckpt}")
    finally:
        # Close TensorBoard writer
        writer.close()


In [None]:
# Runs `train` as currently defined; in notebook order the co-attention
# trainer above has replaced the baseline `train` of the same name.
train()

# TensorBoard

In [None]:
!pip install pyngrok
!ngrok authtoken 2ucmJoehETJPeF9zgDhTXhs7moj_4hGJMpihS1GcLN8AghBZC

In [None]:
from pyngrok import ngrok
import os
import threading

# Kill any existing TensorBoard processes (pkill matches on the full command line)
os.system("pkill -f tensorboard")

# Start TensorBoard in a separate thread
def run_tensorboard():
    """Serve the ``logs_att`` directory on port 6006; os.system blocks until TensorBoard exits."""
    os.system("tensorboard --logdir=logs_att --host 0.0.0.0 --port 6006")

# Daemon thread, so the blocking call above does not hold up this cell
threading.Thread(target=run_tensorboard, daemon=True).start()

# Create a public URL for TensorBoard by tunnelling port 6006 through ngrok
tb_url = ngrok.connect(6006).public_url
print(f"Open TensorBoard: {tb_url}")


# Evaluate

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm
from loguru import logger

def evaluate_model(model, dataset, batch_size=16, collate_fn=None, best_model_path=None, ignore_label=19):
    """
    Evaluate a VQA model on a dataset, skipping samples with an ignored label.

    Logs the number of evaluated samples, the mean cross-entropy loss and the
    accuracy; both metrics are averaged per sample over the kept samples.

    Args:
        model: PyTorch model called as ``model(questions, lengths, images)``
            (VQAModel, CoattentionNet, etc.). It is moved to the evaluation device.
        dataset: A torch.utils.data.Dataset instance for evaluation.
        batch_size: Batch size for evaluation.
        collate_fn: collate_fn for the DataLoader. It has to produce
            ``(questions, lengths, images, answers)`` tuples, because that is
            how each batch is unpacked here.
        best_model_path: Path of a state-dict checkpoint to load into ``model``
            first. With ``None`` the model is evaluated with its current weights.
        ignore_label: Samples whose answer equals this label are excluded.

    Returns:
        None. Results are reported through the logger only.
    """
    logger.info("🔍 Starting evaluation...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Mixed precision is only enabled on CUDA; on CPU autocast is a pass-through.
    use_amp = device.type == 'cuda'
    # Load best checkpoint (skipped when no path is given)
    if best_model_path is not None:
        model.load_state_dict(torch.load(best_model_path, map_location=device, weights_only=True))
    model.to(device)
    model.eval()

    test_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    criterion = nn.CrossEntropyLoss()

    running_test_loss = 0.0
    running_test_acc = 0.0

    num_valid_samples = 0

    with torch.no_grad():
        for questions, lengths, images, answers in tqdm(test_loader, desc="Evaluating"):
            questions, lengths, images, answers = (questions.to(device),
                                                   lengths.to(device),
                                                   images.to(device),
                                                   answers.to(device))

            valid_mask = answers != ignore_label
            if valid_mask.sum() == 0:
                continue  # Skip if no valid samples

            # Boolean indexing keeps the batch order (and so the length ordering)
            questions = questions[valid_mask]
            lengths = lengths[valid_mask]
            images = images[valid_mask]
            answers = answers[valid_mask]

            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                outputs = model(questions, lengths, images)
                loss = criterion(outputs, answers)

            # Weight the batch means by the batch size so the totals are per sample
            running_test_loss += loss.item() * answers.size(0)
            running_test_acc += compute_accuracy(outputs, answers) * answers.size(0)
            num_valid_samples += answers.size(0)

    logger.info(f"Number of valid test: {num_valid_samples} / {len(dataset)}")
    if num_valid_samples == 0:
        # Nothing to average over; avoid a division by zero
        logger.warning("No valid samples to evaluate.")
        return

    # Final average over valid samples
    test_loss = running_test_loss / num_valid_samples
    test_acc = running_test_acc / num_valid_samples
    logger.info(f"Test Loss: {test_loss:.4f} | Test Accuracy: {test_acc:.4f}")


In [None]:
# Device on which the evaluation cells below place their models: CUDA when available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Kaggle input paths for the test split. Note that the image features come from
# data_img.h5, the file the baseline trainer reads.
input_img_h5 = '/kaggle/input/meta-data/data_img.h5'
input_ques_h5 = '/kaggle/input/meta-data/cocoqa_data_prepro.h5'
input_json = '/kaggle/input/meta-data/cocoqa_data_prepro.json'

# Any split other than 'train' makes VQADataset read the *_test arrays.
test_dataset = VQADataset(input_ques_h5, input_img_h5, input_json, split='test')

## Baseline Model Eval

In [None]:
# Hyper-parameters must match the ones the baseline checkpoint was trained with,
# otherwise load_state_dict fails on mismatching tensor shapes.
input_encoding_size = 200
rnn_size = 512
rnn_layers = 2
common_embedding_size = 1024
noutput=19

# Build the model; evaluate_model loads the checkpoint and switches to eval mode
model = VQAModel(
    vocab_size=test_dataset.vocab_size,
    embedding_size=input_encoding_size,
    lstm_size=rnn_size,
    num_layers=rnn_layers,
    image_feat_dim=4096,
    common_embedding_size=common_embedding_size,
    noutput=noutput,
    dropout=0.5
).to(device)

# Baseline checkpoint read from an attached Kaggle input
best_model_path = '/kaggle/input/vqa-best/pytorch/default/1/baseline_model.pth'
evaluate_model(
    model=model,
    dataset=test_dataset,
    batch_size=16,
    collate_fn=collate_fn,
    best_model_path = best_model_path
)


## Attention Model Eval

In [None]:
embed_dim = 512

# The co-attention trainer reads its image features from data_img_att.h5, while
# `test_dataset` is built from data_img.h5 (the baseline's features). The
# model's view(B, 512, -1) would reshape those silently, so the mismatch would
# not raise. Evaluate on the test split of the matching feature file.
# NOTE(review): assumes data_img_att.h5 contains an 'images_test' array, like data_img.h5.
att_test_dataset = VQADataset(
    input_ques_h5,
    '/kaggle/input/cnn-image/data_img_att.h5',
    input_json,
    split='test',
)

model = CoattentionNet(
    num_embeddings=att_test_dataset.vocab_size,
    num_classes=noutput,
    embed_dim=embed_dim
).to(device)

# Co-attention checkpoint read from an attached Kaggle input
best_model_path = '/kaggle/input/vqa-best/pytorch/default/1/attention_model.pth'
evaluate_model(
    model=model,
    dataset=att_test_dataset,
    batch_size=16,
    collate_fn=collate_fn,
    best_model_path=best_model_path
)