# 1. Introduction

Step 1: Loading the Dataset

In [2]:
from datasets import load_dataset

# Load the VQA-RAD dataset
dataset = load_dataset("flaviagiammarino/vqa-rad")

# Split into training and validation sets
train_dataset = dataset['train']
val_dataset = dataset['test']  # Assuming there is a validation split

Found cached dataset parquet (/Users/hemang/.cache/huggingface/datasets/flaviagiammarino___parquet/flaviagiammarino--vqa-rad-d04980c9c3579419/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

Step 2: Feature Extraction

In [3]:
import torch
from torchvision import models, transforms
from transformers import AutoTokenizer, AutoModel

# Check if MPS (Metal Performance Shaders) is available and set the device
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Load and configure ResNet-50 for image feature extraction
resnet50 = models.resnet50(pretrained=True).to(device)
resnet50.eval()

# Define image transformations
image_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def extract_image_features(image):
    image = image_transform(image).unsqueeze(0).to(device)
    with torch.no_grad():
        features = resnet50(image)
    return features.squeeze().cpu()

# Load and configure RoBERTa for question feature extraction
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
roberta = AutoModel.from_pretrained("roberta-large").to(device)
roberta.eval()

def extract_question_features(question):
    inputs = tokenizer(question, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = roberta(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().cpu()

  Referenced from: <5AA8DD3D-A2CC-31CA-8060-88B4E9C18B09> /Users/hemang/anaconda3/lib/python3.10/site-packages/torchvision/image.so
  warn(
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Step 3: Applying Feature Extraction

In [6]:
def extract_and_store_features(dataset):
    def extract_features(example):
        image_features = extract_image_features(example['image'])
        question_features = extract_question_features(example['question'])
        return {
            'image_features': image_features.tolist(),
            'question_features': question_features.tolist(),
            'label': example['answer']  # Assuming the label is in the 'answer' field
        }
    
    # Apply the feature extraction
    dataset = dataset.map(extract_features, batched=False)
    return dataset

# Apply extraction to both the train and validation datasets
train_dataset = extract_and_store_features(train_dataset)
val_dataset = extract_and_store_features(val_dataset)

Map:   0%|          | 0/1793 [00:00<?, ? examples/s]

Loading cached processed dataset at /Users/hemang/.cache/huggingface/datasets/flaviagiammarino___parquet/flaviagiammarino--vqa-rad-d04980c9c3579419/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-a790d1ce3bc06154.arrow


Step 4: Feature Fusion

In [7]:
import torch

def fuse_features(example):
    try:
        # Ensure tensors are properly created
        image_features_tensor = torch.tensor(example['image_features'])
        question_features_tensor = torch.tensor(example['question_features'])
        
        # Perform the feature fusion
        fused_features = torch.cat((image_features_tensor, question_features_tensor), dim=0)
        
        return {'fused_features': fused_features.tolist(), 'label': example['label']}
    except Exception as e:
        print(f"Error during feature fusion: {e}")
        return {'fused_features': [], 'label': example['label']}  # Return empty features in case of error

# Apply feature fusion to the train and validation datasets
train_dataset = train_dataset.map(fuse_features, batched=False)
val_dataset = val_dataset.map(fuse_features, batched=False)

# Optionally, remove the individual feature columns
train_dataset = train_dataset.remove_columns(['image_features', 'question_features'])
val_dataset = val_dataset.remove_columns(['image_features', 'question_features'])

# Check a sample from the datasets
print(train_dataset[0])
print(val_dataset[0])

Map:   0%|          | 0/1793 [00:00<?, ? examples/s]

Map:   0%|          | 0/451 [00:00<?, ? examples/s]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=566x555 at 0x34543DFC0>, 'question': 'are regions of the brain infarcted?', 'answer': 'yes', 'label': 'yes', 'fused_features': [-0.5867313146591187, 0.018025249242782593, -2.461169719696045, -2.492278575897217, -2.0478804111480713, 1.9735158681869507, -1.9311087131500244, -1.4993816614151, -1.4912445545196533, -1.7459769248962402, -1.1166762113571167, -1.999479055404663, -3.031773090362549, -1.2133901119232178, -3.186807155609131, -1.6172490119934082, -2.607880115509033, -2.807157278060913, -2.205174207687378, -1.7052876949310303, -3.6695377826690674, -3.243039608001709, -2.967146396636963, -3.021780014038086, -1.672594666481018, 0.02579115331172943, 1.0887867212295532, 1.8711848258972168, 0.3509728014469147, 0.4729272723197937, -2.0176358222961426, -1.7485578060150146, -1.6530791521072388, -1.1572685241699219, 1.1299530267715454, 0.10846003890037537, -0.10578027367591858, -1.4121290445327759, 3.0118134021759033, 0.217467

Inspect Unique Labels

In [9]:
# Unique labels in training and validation datasets
train_labels = set(train_dataset['label'])
val_labels = set(val_dataset['label'])

print("Training labels:", train_labels)
print("Validation labels:", val_labels)

# Find labels in validation that are not in training
missing_labels = val_labels - train_labels
print("Labels in validation set but not in training set:", missing_labels)


Training labels: {'pineal region', 'air fluid level', 'the aorta', 'almost entire right side', 'cancer', 'imaging artifacts', 'right lenticular nucleus', 'right subclavian vein', 'breasts', 'vasculature', 'more acute means more inflammation-leading to enhancement?', 'l2', 'upper lobes', 'hepatocellular carcioma', 'horsehoe kidney', 'temporal and lateral occipital lobes', 'just one', 'not sure', 'hemorrhage', 'the aorta and the inferior vena cava', 'atherosclerotic calcification', 'yes', 'all three vascular distributions', 'cystic duct is more tortuous', 'cardiac region', 'semi-upright position', 'brain', 'hydropneumothorax', 'respiratory system', 'gastrointestinal', 'central hyperintensity and surrounding hypointensity', 'flair', 'right sided pleural effusion', 'medial rectus', 'right upper lobe', 'in the midline', 'lateral ventricles', 'right lobe', 'left temporal lobe', 'large bowel', 'acute stroke', 'abscess', 'chest x-ray', 'right sylvian fissure', 'scoliosis', 'gi', 'below the 7th

Update Label Mapping

In [11]:
# Update label mapping to include all labels from both training and validation sets
all_labels = train_labels.union(val_labels)
label_to_index = {label: idx for idx, label in enumerate(all_labels)}

# Print updated mapping
print("Updated label mapping:", label_to_index)


Updated label mapping: {'mri-dwi': 0, 'pineal region': 1, 'sharp costophrenic angles': 2, 'left apical pneumothorax': 3, 'right lung': 4, 'left thalamus and basal ganglia': 5, 'air fluid level': 6, 'caudate, putamen, left parietal': 7, 'the aorta': 8, 'almost entire right side': 9, 'cancer': 10, 'imaging artifacts': 11, 'right lenticular nucleus': 12, 'sinusitis': 13, 't2 weighted mri': 14, 'right vs left sided pathology': 15, 'enlarged': 16, 'plain film': 17, 'axial': 18, 'right subclavian vein': 19, 'spleen': 20, 'breasts': 21, 'mri-flair': 22, 'periappendiceal fluid and fat stranding': 23, 'vasculature': 24, 'more acute means more inflammation-leading to enhancement?': 25, 'l2': 26, 'left mid lung': 27, 'cardiovascular': 28, 'the diaphragm': 29, 'upper lobes': 30, 'the pancreatic head': 31, 'hepatocellular carcioma': 32, 'hypodense': 33, 'basal ganglia (caudate and putamen)': 34, 'horsehoe kidney': 35, 'temporal and lateral occipital lobes': 36, 'just one': 37, 'cavum vergae': 38, '

5. Prepare DataLoaders

In [14]:
from torch.utils.data import DataLoader, TensorDataset

def prepare_dataloader(dataset, label_to_index, batch_size=32, shuffle=True):
    # Convert labels from strings to integers using the updated mapping
    labels = torch.tensor([label_to_index.get(label, -1) for label in dataset['label']])
    
    # Ensure there are no invalid labels (e.g., labels not seen in training)
    if (labels < 0).any():
        # Filter out invalid labels
        valid_indices = (labels >= 0).nonzero(as_tuple=True)[0]
        labels = labels[valid_indices]
        features = torch.tensor(dataset['fused_features'])[valid_indices]
    else:
        features = torch.tensor(dataset['fused_features'])
    
    # Create a TensorDataset
    tensor_dataset = TensorDataset(features, labels)
    
    # Create a DataLoader
    dataloader = DataLoader(tensor_dataset, batch_size=batch_size, shuffle=shuffle)
    
    return dataloader

# Prepare DataLoaders for training and validation datasets
train_loader = prepare_dataloader(train_dataset, label_to_index)
val_loader = prepare_dataloader(val_dataset, label_to_index, shuffle=False)

6. Define the Model Architecture

In [25]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class VQAModel(nn.Module):
    def __init__(self, image_feature_dim, question_feature_dim, hidden_dim, output_dim):
        super(VQAModel, self).__init__()
        
        # Image feature processing
        self.image_fc = nn.Linear(image_feature_dim, hidden_dim)
        
        # Question feature processing
        self.question_fc = nn.Linear(question_feature_dim, hidden_dim)
        
        # LSTM layer for sequence modeling
        self.lstm = nn.LSTM(hidden_dim * 2, hidden_dim, batch_first=True)
        
        # Final output layer
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        
        # Dropout layer for regularization
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, image_features, question_features):
        # Process image features
        image_features = F.relu(self.image_fc(image_features))
        
        # Process question features
        question_features = F.relu(self.question_fc(question_features))
        
        # Concatenate image and question features
        combined_features = torch.cat((image_features.unsqueeze(1), question_features.unsqueeze(1)), dim=1)
        
        # Sequence modeling with LSTM
        lstm_out, _ = self.lstm(combined_features)
        
        # Use the output from the last LSTM cell
        lstm_out = lstm_out[:, -1, :]
        
        # Apply dropout
        lstm_out = self.dropout(lstm_out)
        
        # Final output layer
        output = self.fc_out(lstm_out)
        
        return output

# Parameters for the model
image_feature_dim = 2048  # Output dimension of ResNet-50
question_feature_dim = 1024  # Example: hidden size of RoBERTa
hidden_dim = 512
output_dim = len(label_to_index)  # Number of unique labels

# Initialize the model
model = VQAModel(image_feature_dim, question_feature_dim, hidden_dim, output_dim).to(device)

# Define the loss function
criterion = nn.CrossEntropyLoss()

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [26]:
# Parameters for the model
image_feature_dim = 2048  # Output dimension of ResNet-50
question_feature_dim = 1024  # Example: Use the hidden size of RoBERTa
hidden_dim = 512
output_dim = len(label_to_index)  # Number of unique labels

# Check dimensions
total_feature_dim = image_feature_dim + question_feature_dim
print(f"Total feature dimension: {total_feature_dim}")

Total feature dimension: 3072


Print Tensor Shapes

In [20]:
print(fused_features.shape)  # Should be (batch_size, total_feature_dim)
print(fused_features[:, :image_feature_dim].shape)  # Should be (batch_size, image_feature_dim)
print(fused_features[:, image_feature_dim:].shape)  # Should be (batch_size, text_feature_dim)

torch.Size([32, 2024])
torch.Size([32, 1024])
torch.Size([32, 1000])


7. Training and Evaluation Loop

In [36]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    correct = 0
    total = 0
    
    for fused_features, labels in train_loader:
        fused_features, labels = fused_features.to(device), labels.to(device)
        
        # Ensure correct slicing based on dimensions
        image_features = fused_features[:, :image_feature_dim]
        question_features = fused_features[:, image_feature_dim:]
        
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(image_features, question_features)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()
    
    train_accuracy = 100. * correct / total
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss/len(train_loader):.4f}, Accuracy: {train_accuracy:.2f}%')
    
    # Validation phase
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    
    with torch.no_grad():
        for fused_features, labels in val_loader:
            fused_features, labels = fused_features.to(device), labels.to(device)
            
            # Ensure correct slicing based on dimensions
            image_features = fused_features[:, :image_feature_dim]
            question_features = fused_features[:, image_feature_dim:]
            
            outputs = model(image_features, question_features)
            loss = criterion(outputs, labels)
            
            val_loss += loss.item()
            _, predicted = outputs.max(1)
            val_total += labels.size(0)
            val_correct += predicted.eq(labels).sum().item()
    
    val_accuracy = 100. * val_correct / val_total
    print(f'Validation Loss: {val_loss/len(val_loader):.4f}, Validation Accuracy: {val_accuracy:.2f}%')

RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x2024 and 2048x512)