In [1]:
# 1. Library and Dataset Loading
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing import image as keras_image
from tensorflow.keras.applications.resnet50 import preprocess_input as preprocess_resnet_input
from tensorflow.keras.utils import to_categorical
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("flaviagiammarino/vqa-rad")

# Print dataset info
print("Dataset loaded successfully.")
print(f"Number of training samples: {len(dataset['train'])}")
print(f"Number of test samples: {len(dataset['test'])}")

Found cached dataset parquet (/Users/hemang/.cache/huggingface/datasets/flaviagiammarino___parquet/flaviagiammarino--vqa-rad-d04980c9c3579419/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

Dataset loaded successfully.
Number of training samples: 1793
Number of test samples: 451


In [2]:
#2. Preprocessing
# Data transformation
def preprocess_image(image):
    # Resize and normalize image
    image = image.resize((224, 224))
    image_array = np.array(image)
    image_array = preprocess_resnet_input(image_array)
    return image_array

# Sample preprocessing
sample_image = dataset['train'][0]['image']
preprocessed_image = preprocess_image(sample_image)
print(f"Sample image size before preprocessing: {sample_image.size}")
print(f"Sample image tensor shape after preprocessing: {preprocessed_image.shape}")

Sample image size before preprocessing: (566, 555)
Sample image tensor shape after preprocessing: (224, 224, 3)


In [3]:
#3. Tokenizing
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize_text(text):
    inputs = tokenizer(text, return_tensors="tf", padding=True, truncation=True)
    return inputs['input_ids'], inputs['attention_mask']

# Sample tokenizing
sample_question = dataset['train'][0]['question']
input_ids, attention_mask = tokenize_text(sample_question)
print(f"Sample input IDs: {input_ids}")
print(f"Sample attention mask: {attention_mask}")

Sample input IDs: [[   0 1322 3806    9    5 2900 4047  271 3894  196  116    2]]
Sample attention mask: [[1 1 1 1 1 1 1 1 1 1 1 1]]


In [4]:
#4. Feature Extraction
# Define the models
resnet_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')
roberta_model = TFRobertaModel.from_pretrained('roberta-base')

def extract_features(image, text):
    # Extract image features
    image_features = resnet_model.predict(np.expand_dims(image, axis=0))
    
    # Reshape to match the expected input of the model (if needed)
    image_features = image_features.reshape((image_features.shape[0], -1))
    
    # Extract text features
    text_features = roberta_model(input_ids=text[0], attention_mask=text[1]).pooler_output

    return image_features, text_features


# Sample feature extraction
image_features, text_features = extract_features(preprocessed_image, (input_ids, attention_mask))
print(f"Image features shape: {image_features.shape}")
print(f"Text features shape: {text_features.shape}")

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.
2024-09-07 18:57:47.509345: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Image features shape: (1, 2048)
Text features shape: (1, 768)


In [5]:
#5. Feature Fusion
def fuse_features(image_features, text_features):
    combined_features = tf.concat([image_features, text_features], axis=1)
    return combined_features

# Sample feature fusion
fused_features = fuse_features(image_features, text_features)
print(f"Fused features shape: {fused_features.shape}")

Fused features shape: (1, 2816)


In [6]:
#6. Update Label Mapping and Identifying Unique Labels
# Assuming answer labels are already in the dataset
answers = [item['answer'] for item in dataset['train']]
unique_labels = list(set(answers))
num_classes = len(unique_labels)

# Create label to index mapping
label_to_index = {label: idx for idx, label in enumerate(unique_labels)}
index_to_label = {idx: label for label, idx in label_to_index.items()}

# Convert answers to indices
def convert_answers_to_indices(answers):
    return np.array([label_to_index[answer] for answer in answers])

# Sample conversion
sample_answers = [dataset['train'][0]['answer']]
answer_indices = convert_answers_to_indices(sample_answers)
print(f"Unique labels: {unique_labels}")
print(f"Answer indices: {answer_indices}")


Unique labels: ['suprasellar', 'mri-dwi', 't5', 'lateral ventricles', 'pa', 'xray - plain film', '4', 'lower lung fields', 'large bowel', 'ribs', 'hyperintensity of the left basal ganglia', 'cartilage is not well viewed by x rays', 'micronodular', 'right lung', 'infarcts', 'base', '6.5 x 6.2 x 8.8cm', 'loculated', 'lung markings present all the way laterally to the ribs', 'maybe', 'tumors, gallstones', 'upper lobes', 'nephroblastomatosis', 'chest xray', 'right sided aortic arch', 'ring-enhancing', 'portal vein occlusion', 't2 weighted', 'small subdural hematoma with cerebral edema', 'hypodense', 'the pancreas', 'sternal wires', 'nodules', 'omental caking', 'right lobe', 'in the bowel', 'right frontal lobe', '12', 'double arch', 'below the 7th rib in the right lung.', 'the brain', 'ultrasound', 'right mca', 'cardiovascular', 'the aorta and the inferior vena cava', 'underneath the right hemidiaphragm', 'cystic duct is more tortuous', 'abnormal hyperintensity in the right occipital lobe',

In [7]:
#7. Data Loader
# Create a TensorFlow dataset
def create_tf_dataset(dataset, batch_size=16):
    def gen():
        for item in dataset:
            image = preprocess_image(item['image'])
            text = tokenize_text(item['question'])
            image_features, text_features = extract_features(image, text)
            label = label_to_index[item['answer']]
            yield (image_features, text_features), label

    dataset = tf.data.Dataset.from_generator(gen, 
                                             output_signature=(
                                                 (tf.TensorSpec(shape=(1, 2048), dtype=tf.float32),
                                                  tf.TensorSpec(shape=(1, 768), dtype=tf.float32)),
                                                 tf.TensorSpec(shape=(), dtype=tf.int32)
                                             )
                                            )
    return dataset.batch(batch_size)

train_dataset_tf = create_tf_dataset(dataset['train'])
test_dataset_tf = create_tf_dataset(dataset['test'])

In [8]:
#8. Model Definition
# Model Definition
class VQARADModel(tf.keras.Model):
    def __init__(self, num_classes):
        super(VQARADModel, self).__init__()
        self.fc_image = tf.keras.layers.Dense(512, activation='relu')  # Project image features to match text features
        self.fc_text = tf.keras.layers.Dense(512, activation='relu')   # Project text features to match image features
        self.fc_final = tf.keras.layers.Dense(num_classes, activation='softmax')

    def call(self, inputs):
        image_features, text_features = inputs

        # Process features
        image_features = self.fc_image(image_features)
        text_features = self.fc_text(text_features)

        # Concatenate along the last axis
        combined_features = tf.concat([image_features, text_features], axis=1)
        output = self.fc_final(combined_features)

        return output

: 

In [9]:
#9. Training and Evaluation

# Instantiate and Compile the Model
num_classes = len(unique_labels)
model = VQARADModel(num_classes=num_classes)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Fit the Model
history = model.fit(train_dataset_tf, epochs=10, validation_data=test_dataset_tf)

# Print training history
print("Training completed.")
print(f"Final training accuracy: {history.history['accuracy'][-1]}")
print(f"Final validation accuracy: {history.history['val_accuracy'][-1]}")

Epoch 1/10


In [None]:
#10. Validation
# Evaluate the model on the test dataset
test_loss, test_accuracy = model.evaluate(test_dataset_tf)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")