# 1. Introduction

## Step 1: Loading the Dataset

In [21]:
from datasets import load_dataset

# Fetch the VQA-RAD dataset from the Hugging Face Hub (or reuse the local cache)
# and keep a handle on its training split.
VQA_RAD_DATASET_ID = "flaviagiammarino/vqa-rad"
dataset = load_dataset(VQA_RAD_DATASET_ID)
train_dataset = dataset["train"]

# Quick look at the schema and at one raw example
print(train_dataset.column_names)
print(train_dataset[0])

Found cached dataset parquet (/Users/hemang/.cache/huggingface/datasets/flaviagiammarino___parquet/flaviagiammarino--vqa-rad-d04980c9c3579419/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

['image', 'question', 'answer']
{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=566x555 at 0x15A8A5030>, 'question': 'are regions of the brain infarcted?', 'answer': 'yes'}


## Step 2: Feature Extraction

In [22]:
import torch
from torchvision import models, transforms
from transformers import AutoTokenizer, AutoModel

# Pretrained ResNet-50 used as the image encoder, switched to inference mode
resnet50 = models.resnet50(pretrained=True)
resnet50.eval()
# NOTE(review): the classification head is left in place, so a forward pass
# returns the 1000 class scores rather than pooled backbone features —
# confirm this is what the downstream model expects.

# Per-channel statistics used to normalise the image tensor
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Preprocessing pipeline: to tensor -> resize to 224x224 -> normalise
image_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

def extract_image_features(image):
    """Run one image through the module-level ResNet-50 and return its output.

    Args:
        image: A PIL image, or any other object ``image_transform`` accepts.
            PIL images whose ``mode`` is not ``"RGB"`` (grayscale, RGBA,
            palette, ...) are converted to RGB first, because the 3-channel
            ``Normalize`` step and the network's first convolution reject any
            other channel count. Objects without a string ``mode`` attribute
            are handed to ``image_transform`` untouched.

    Returns:
        torch.Tensor: The network output for this image with all size-1 axes
        (including the batch axis added here) squeezed away.
    """
    # Only PIL-style objects expose a string `mode`; checking the type keeps
    # arrays and other inputs on their original code path.
    mode = getattr(image, "mode", None)
    if isinstance(mode, str) and mode != "RGB":
        image = image.convert("RGB")

    batch = image_transform(image).unsqueeze(0)  # add the batch axis the model expects
    with torch.no_grad():  # inference only; no autograd bookkeeping needed
        features = resnet50(batch)
    return features.squeeze()

# RoBERTa-large tokenizer and encoder for the question text; the encoder is
# switched to inference mode.
ROBERTA_CHECKPOINT = "roberta-large"
tokenizer = AutoTokenizer.from_pretrained(ROBERTA_CHECKPOINT)
roberta = AutoModel.from_pretrained(ROBERTA_CHECKPOINT)
roberta.eval()

def extract_question_features(question):
    """Encode a question with the module-level RoBERTa model.

    The question is tokenized (truncated to at most 128 tokens), passed
    through ``roberta`` without gradient tracking, and the final-layer hidden
    state at sequence position 0 is returned.

    Args:
        question: The question text to encode.

    Returns:
        torch.Tensor: The position-0 hidden state with size-1 axes squeezed
        away (a 1-D vector when a single question is passed).
    """
    encoded = tokenizer(
        question,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )
    with torch.no_grad():
        hidden_states = roberta(**encoded).last_hidden_state
    # Keep only the first token's representation as the sentence summary
    first_token = hidden_states[:, 0, :]
    return first_token.squeeze()

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Step 3: Applying Feature Extraction

In [23]:
# Helper that adds image and question feature columns to a dataset split
def extract_and_store_features(dataset):
    """Return ``dataset`` with per-example image and question features added.

    Each example's ``'image'`` is passed to ``extract_image_features`` and its
    ``'question'`` to ``extract_question_features``; the results are stored as
    plain lists under ``'image_features'`` and ``'question_features'``.

    Args:
        dataset: An object exposing ``map(function, batched=...)`` whose
            examples have ``'image'`` and ``'question'`` entries.

    Returns:
        Whatever ``dataset.map`` returns for the per-example extractor.
    """
    def _featurize_row(example):
        # Image first, then question — same evaluation order as the columns below
        return {
            'image_features': extract_image_features(example['image']).tolist(),
            'question_features': extract_question_features(example['question']).tolist(),
        }

    # One example at a time: both extractors work on a single item
    return dataset.map(_featurize_row, batched=False)

# Run feature extraction over the training split. `train_dataset` is rebound to
# the mapped result, which the feature-fusion cell below reads from.
train_dataset = extract_and_store_features(train_dataset)

# Print the first example to confirm the 'image_features' and
# 'question_features' entries are present
print(train_dataset[0])

Map:   0%|          | 0/1793 [00:00<?, ? examples/s]



KeyboardInterrupt: 

## Step 4: Feature Fusion

In [None]:
def fuse_features(example):
    """Concatenate an example's image and question feature vectors.

    Args:
        example: Mapping with ``'image_features'`` and ``'question_features'``
            entries, each convertible by ``torch.tensor``.

    Returns:
        dict: ``{'fused_features': [...]}`` holding the image features followed
        by the question features, joined along the first axis, as a plain list.
    """
    image_vec = torch.tensor(example['image_features'])
    question_vec = torch.tensor(example['question_features'])
    # Image features come first, question features second
    fused = torch.cat([image_vec, question_vec], dim=0)
    return {'fused_features': fused.tolist()}

# Fuse the features of every training example, then drop the per-modality
# columns so only the combined vector remains next to the original fields.
train_dataset = (
    train_dataset
    .map(fuse_features, batched=False)
    .remove_columns(['image_features', 'question_features'])
)

# Print one example to confirm the fused vector is present
print(train_dataset[0])
