# Notebook 1: RoBERTa for Classification

This notebook will handle binary classification (yes/no) using RoBERTa for question embedding and ResNet-50 for image feature extraction.

### Data Loading and Preprocessing:
- Load the VQA-RAD dataset.
- Preprocess the radiology images using OpenCV and resize them.
- Tokenize the questions using the RoBERTa tokenizer.

### Feature Extraction:

- Extract image features using ResNet-50 (pre-trained on ImageNet).
- Tokenize and extract question embeddings using RoBERTa from Hugging Face.

### Feature Fusion:

- Fuse the image features and question embeddings using NumPy.

### Model Definition (Binary Classification):

- Define a simple model that concatenates the fused features and passes them through a dense softmax classifier to predict yes/no answers.

### Training and Evaluation:
- Train the model using categorical cross-entropy and an optimizer (e.g., Adam).
- Evaluate the model on the validation set.

In [16]:
import numpy as np
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dropout, Flatten
from tensorflow.keras.regularizers import l2
from datasets import load_dataset


# Load the dataset
dataset = load_dataset("flaviagiammarino/vqa-rad")

# Initialize ResNet50 model for image feature extraction
resnet_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze all layers except the last 10
for layer in resnet_model.layers[:-10]:
    layer.trainable = False

# Initialize RoBERTa-large tokenizer and model
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
roberta_model = TFRobertaModel.from_pretrained('roberta-large')

# Image preprocessing and augmentation
def preprocess_image(img):
    img = tf.image.resize(img, (224, 224))
    img = tf.image.random_flip_left_right(img)  # Augmentation: Flip
    img = tf.image.random_brightness(img, 0.2)  # Augmentation: Brightness adjustment
    img = np.expand_dims(img, axis=0)
    return img

# Feature extraction: image (ResNet50) and text (RoBERTa)
def process_image_text(sample):
    img = sample['image']
    img_array = preprocess_image(img)
    
    # Extract image features using ResNet50
    img_features = resnet_model.predict(img_array)
    img_features = img_features.reshape((img_features.shape[0], -1))  # Flatten the image features
    
    # Tokenize and encode the question using RoBERTa
    question = sample['question']
    inputs = roberta_tokenizer.encode_plus(question, max_length=512, return_attention_mask=True, return_tensors='tf')
    outputs = roberta_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
    text_embeddings = outputs.last_hidden_state[:, 0, :]  # Extract the CLS token

    # Fuse the image features and text embeddings
    combined_features = tf.concat([img_features, text_embeddings], axis=1)
    return combined_features

# Process dataset and convert to features
def extract_features(dataset_split):
    features = []
    for sample in dataset_split:
        features.append(process_image_text(sample))
    return np.array(features)

# Extract features for train and test datasets
train_features = extract_features(dataset['train'])
test_features = extract_features(dataset['test'])

# Convert labels ("yes"/"no" to binary)
train_labels = np.array([1 if answer == 'yes' else 0 for answer in dataset['train']['answer']])
test_labels = np.array([1 if answer == 'yes' else 0 for answer in dataset['test']['answer']])
train_labels_cat = to_categorical(train_labels, num_classes=2)
test_labels_cat = to_categorical(test_labels, num_classes=2)

# Define the VQA classification model
class VQAModel(tf.keras.Model):
    def __init__(self):
        super(VQAModel, self).__init__()
        self.dense1 = tf.keras.layers.Dense(512, activation='relu', kernel_regularizer=l2(0.01))
        self.dropout = Dropout(0.5)  # Dropout for regularization
        self.flatten = Flatten()  # Flatten the input before the final dense layer
        self.dense2 = tf.keras.layers.Dense(2, activation='softmax')  # Binary output (yes/no)

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dropout(x)  # Apply dropout during training
        x = self.flatten(x)  # Flatten to ensure output shape matches the target shape
        return self.dense2(x)

# Initialize and compile the model with a smaller learning rate
model = VQAModel()
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

# Train the model with 20 epochs
model.fit(train_features, train_labels_cat, epochs=20, batch_size=32, validation_data=(test_features, test_labels_cat))

# Evaluate the model
loss, accuracy = model.evaluate(test_features, test_labels_cat)
print(f'Test Loss: {loss:.3f}')
print(f'Test Accuracy: {accuracy:.3f}')

# Predict and display classification report
predictions = model.predict(test_features)
predicted_classes = np.argmax(predictions, axis=1)

from sklearn.metrics import classification_report, confusion_matrix
print('Classification Report:')
print(classification_report(test_labels, predicted_classes))
print('Confusion Matrix:')
print(confusion_matrix(test_labels, predicted_classes))

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'roberta.embeddings.position_ids', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 637ms/step


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53

In [17]:
print(f'Test Loss: {loss:.3f}')
print(f'Test Accuracy: {accuracy:.3f}')

Test Loss: 1.037
Test Accuracy: 0.703


tf.squeeze(outputs, axis=1) in the VQAModel.call() function to match the target's shape.

In [18]:
# Save the model
model.save('vqa_model_roberta.h5')


