# RoBERTa

In [None]:
import numpy as np
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2  # Import l2 regularizer
from datasets import load_dataset
from collections import Counter

# Load the VQA-RAD dataset by its Hugging Face identifier.
dataset = load_dataset("flaviagiammarino/vqa-rad")

# Headless ResNet50 with ImageNet weights, taking 224x224 three-channel inputs;
# used below as the image feature extractor.
resnet_model = ResNet50(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3),
)

# Mark every layer except the final ten as non-trainable.
for layer in resnet_model.layers[:-10]:
    layer.trainable = False

# Pretrained RoBERTa-large tokenizer and encoder used for the questions.
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
roberta_model = TFRobertaModel.from_pretrained('roberta-large')

# Function to build vocabulary from the training set questions
def build_vocabulary(dataset_split):
    """Count tokenizer tokens over the questions of a dataset split.

    Args:
        dataset_split: Iterable of samples, each providing a 'question' entry.

    Returns:
        A Counter mapping every token emitted by
        ``roberta_tokenizer.tokenize`` to its number of occurrences.
    """
    return Counter(
        token
        for record in dataset_split
        for token in roberta_tokenizer.tokenize(record['question'])
    )

# Build vocabulary using the training set.
# In the visible code the resulting Counter is only used to report how many
# distinct tokenizer tokens occur in the training questions.
vocab = build_vocabulary(dataset['train'])
print(f"Vocabulary Size: {len(vocab)}")

# Image preprocessing and augmentation
def preprocess_image(img, *, augment=True):
    """Turn one image into a ResNet50-ready batch.

    Args:
        img: A PIL image, or an array-like of pixel values on the 0-255
            scale shaped (H, W) or (H, W, C).
        augment: When True (the default, matching the previous behaviour)
            apply a random horizontal flip and brightness jitter. Pass
            False to get deterministic output, e.g. for evaluation data.

    Returns:
        float32 NumPy array of shape (1, 224, 224, 3) for grayscale, RGB or
        RGBA input, normalised with the ResNet50 ``preprocess_input``.
    """
    # Local import: the file only imports the ResNet50 constructor.
    from tensorflow.keras.applications.resnet50 import preprocess_input

    # Normalise the channel layout first: ResNet50 was built with
    # input_shape=(224, 224, 3) and tf.image.resize rejects 2-D input.
    if hasattr(img, 'convert'):
        img = img.convert('RGB')
    pixels = np.asarray(img, dtype=np.float32)
    if pixels.ndim == 2:
        pixels = pixels[..., np.newaxis]
    if pixels.shape[-1] == 1:
        pixels = np.repeat(pixels, 3, axis=-1)
    pixels = pixels[..., :3]  # drop an alpha channel if present

    img = tf.image.resize(pixels, (224, 224))
    if augment:
        img = tf.image.random_flip_left_right(img)  # Augmentation: Flip
        # The brightness delta is added to the raw pixel values. These are on
        # the 0-255 scale here, so the 0.2 (20%) jitter must be scaled or it
        # is effectively a no-op.
        img = tf.image.random_brightness(img, 0.2 * 255.0)
        img = tf.clip_by_value(img, 0.0, 255.0)

    # Copy into a writable array; preprocess_input may modify NumPy input
    # in place.
    batch = np.expand_dims(np.array(img, dtype=np.float32), axis=0)
    # ImageNet weights expect the ResNet-specific input normalisation.
    return preprocess_input(batch)

# Feature extraction: image (ResNet50) and text (RoBERTa)
def process_image_text(sample):
    """Encode one sample into a fused image + question feature vector.

    Args:
        sample: Mapping with an 'image' entry (handed to
            ``preprocess_image``) and a 'question' string.

    Returns:
        1-D NumPy array: the flattened ResNet50 output followed by the
        RoBERTa hidden state at the first token position.
    """
    img_array = preprocess_image(sample['image'])

    # Call the model directly instead of predict(): for a single small batch
    # inside a Python loop this avoids predict()'s per-call overhead and
    # progress-bar output. training=False keeps BatchNormalization in
    # inference mode, as predict() did.
    img_features = resnet_model(img_array, training=False).numpy().flatten()

    # Tokenize and encode the question using RoBERTa. truncation=True makes
    # max_length effective; without it over-long inputs are not truncated.
    inputs = roberta_tokenizer.encode_plus(
        sample['question'],
        max_length=512,
        truncation=True,
        return_attention_mask=True,
        return_tensors='tf',
    )
    outputs = roberta_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
    # Hidden state at position 0 (the CLS-style start token), flattened.
    text_embeddings = outputs.last_hidden_state[:, 0, :].numpy().flatten()

    # Fuse the image features and text embeddings
    return np.concatenate([img_features, text_embeddings])

# Process dataset and convert to features
def extract_features(dataset_split):
    """Run ``process_image_text`` over a split and stack the results.

    Args:
        dataset_split: Iterable of samples accepted by ``process_image_text``.

    Returns:
        NumPy array built from the per-sample feature vectors, in iteration
        order.
    """
    return np.array([process_image_text(record) for record in dataset_split])

# Restrict both splits to yes/no questions before extracting anything.
# The classifier below is binary; without this filter every answer that is
# not exactly 'yes' (including open-ended answers) would be labelled as "no".
def _is_yes_no(answer):
    """Return True when the answer is 'yes' or 'no', ignoring case and surrounding whitespace."""
    return answer.strip().lower() in ('yes', 'no')


# input_columns passes only the answer string to the predicate.
train_split = dataset['train'].filter(_is_yes_no, input_columns=['answer'])
test_split = dataset['test'].filter(_is_yes_no, input_columns=['answer'])

# Extract features for the filtered train and test splits
train_features = extract_features(train_split)
test_features = extract_features(test_split)

# Convert labels ("yes"/"no" to binary); rows stay aligned with the features
# because both come from the same filtered split.
train_labels = np.array([1 if answer.strip().lower() == 'yes' else 0 for answer in train_split['answer']])
test_labels = np.array([1 if answer.strip().lower() == 'yes' else 0 for answer in test_split['answer']])
train_labels_cat = to_categorical(train_labels, num_classes=2)
test_labels_cat = to_categorical(test_labels, num_classes=2)

# Classifier head over the fused feature vector (Keras Functional API).
# The input width is taken from the extracted training features.
input_layer = Input(shape=(train_features.shape[1],))
hidden = Dense(512, activation='relu', kernel_regularizer=l2(0.01))(input_layer)  # L2-regularised layer
hidden = Dropout(0.5)(hidden)
output_layer = Dense(2, activation='softmax')(hidden)  # two-way softmax: yes / no

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for 20 epochs; the test split is passed as validation data.
model.fit(
    train_features,
    train_labels_cat,
    epochs=20,
    batch_size=32,
    validation_data=(test_features, test_labels_cat),
)

# Score the trained model on the test split.
loss, accuracy = model.evaluate(test_features, test_labels_cat)
print(f'Test Loss: {loss:.3f}')
print(f'Test Accuracy: {accuracy:.3f}')

# Turn the softmax outputs into class indices for the sklearn reports.
predictions = model.predict(test_features)
predicted_classes = predictions.argmax(axis=1)

from sklearn.metrics import classification_report, confusion_matrix
print('Classification Report:')
print(classification_report(test_labels, predicted_classes))
print('Confusion Matrix:')
print(confusion_matrix(test_labels, predicted_classes))

# Persist the classifier in the native Keras format.
model.save('vqa_model_roberta.keras')

# Repeat the headline metrics after the reports.
print(f'Test Loss: {loss:.3f}')
print(f'Test Accuracy: {accuracy:.3f}')

In [None]:
# Save the model in both the HDF5 (.h5) and native Keras (.keras) formats
model.save('vqa_model_roberta.h5')
model.save('vqa_model_roberta.keras')

# Import load_model from tensorflow.keras, the same package used to build and
# save the model. A separately installed standalone `keras` can be a
# different version and fail to read the file.
from tensorflow.keras.models import load_model

# Reload the saved model and print its architecture as a sanity check
model = load_model('vqa_model_roberta.keras')
model.summary()

# Fine Tune

In [None]:
import numpy as np
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import LearningRateScheduler
from datasets import load_dataset
import matplotlib.pyplot as plt

# Load the dataset
dataset = load_dataset("flaviagiammarino/vqa-rad")

# Initialize RoBERTa-large tokenizer and model
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
roberta_model = TFRobertaModel.from_pretrained('roberta-large')

# Mark all RoBERTa layers as trainable.
# NOTE(review): in the visible code this model is only called inside
# process_image_text, where its output is converted to NumPy, and only the
# Dense classifier defined further down is passed to fit(). The flag therefore
# does not appear to cause any RoBERTa weights to be updated — confirm whether
# end-to-end fine-tuning was intended.
roberta_model.trainable = True

# Initialize ResNet50 model for image feature extraction
resnet_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Mark all ResNet50 layers as trainable.
# NOTE(review): same caveat as for RoBERTa — the model is only used via
# predict() for feature extraction in the visible code.
resnet_model.trainable = True

# Function to preprocess and augment the image
def preprocess_image(img, *, augment=True):
    """Turn one image into a ResNet50-ready batch.

    Args:
        img: A PIL image, or an array-like of pixel values on the 0-255
            scale shaped (H, W) or (H, W, C).
        augment: When True (the default, matching the previous behaviour)
            apply a random horizontal flip and brightness jitter. Pass
            False to get deterministic output, e.g. for evaluation data.

    Returns:
        float32 NumPy array of shape (1, 224, 224, 3) for grayscale, RGB or
        RGBA input, normalised with the ResNet50 ``preprocess_input``.
    """
    # Local import: the file only imports the ResNet50 constructor.
    from tensorflow.keras.applications.resnet50 import preprocess_input

    # Normalise the channel layout first: ResNet50 was built with
    # input_shape=(224, 224, 3) and tf.image.resize rejects 2-D input.
    if hasattr(img, 'convert'):
        img = img.convert('RGB')
    pixels = np.asarray(img, dtype=np.float32)
    if pixels.ndim == 2:
        pixels = pixels[..., np.newaxis]
    if pixels.shape[-1] == 1:
        pixels = np.repeat(pixels, 3, axis=-1)
    pixels = pixels[..., :3]  # drop an alpha channel if present

    img = tf.image.resize(pixels, (224, 224))
    if augment:
        img = tf.image.random_flip_left_right(img)  # Augmentation: Flip
        # The brightness delta is added to the raw pixel values. These are on
        # the 0-255 scale here, so the 0.2 (20%) jitter must be scaled or it
        # is effectively a no-op.
        img = tf.image.random_brightness(img, 0.2 * 255.0)
        img = tf.clip_by_value(img, 0.0, 255.0)

    # Copy into a writable array; preprocess_input may modify NumPy input
    # in place.
    batch = np.expand_dims(np.array(img, dtype=np.float32), axis=0)
    # ImageNet weights expect the ResNet-specific input normalisation.
    return preprocess_input(batch)

# Feature extraction: image (ResNet50) and text (RoBERTa)
def process_image_text(sample):
    """Encode one sample into a fused image + question feature vector.

    Args:
        sample: Mapping with an 'image' entry (handed to
            ``preprocess_image``) and a 'question' string.

    Returns:
        1-D NumPy array: the flattened ResNet50 output followed by the
        RoBERTa hidden state at the first token position.
    """
    img_array = preprocess_image(sample['image'])

    # Call the model directly instead of predict(): for a single small batch
    # inside a Python loop this avoids predict()'s per-call overhead and
    # progress-bar output. training=False keeps BatchNormalization in
    # inference mode, as predict() did.
    img_features = resnet_model(img_array, training=False).numpy().flatten()

    # Tokenize and encode the question using RoBERTa. truncation=True makes
    # max_length effective; without it over-long inputs are not truncated.
    inputs = roberta_tokenizer.encode_plus(
        sample['question'],
        max_length=512,
        truncation=True,
        return_attention_mask=True,
        return_tensors='tf',
    )
    outputs = roberta_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
    # Hidden state at position 0 (the CLS-style start token), flattened.
    text_embeddings = outputs.last_hidden_state[:, 0, :].numpy().flatten()

    # Fuse the image features and text embeddings
    return np.concatenate([img_features, text_embeddings])

# Process dataset and convert to features
def extract_features(dataset_split):
    """Run ``process_image_text`` over a split and stack the results.

    Args:
        dataset_split: Iterable of samples accepted by ``process_image_text``.

    Returns:
        NumPy array built from the per-sample feature vectors, in iteration
        order.
    """
    return np.array([process_image_text(record) for record in dataset_split])

# Restrict both splits to yes/no questions before extracting anything.
# The classifier below is binary; without this filter every answer that is
# not exactly 'yes' (including open-ended answers) would be labelled as "no".
def _is_yes_no(answer):
    """Return True when the answer is 'yes' or 'no', ignoring case and surrounding whitespace."""
    return answer.strip().lower() in ('yes', 'no')


# input_columns passes only the answer string to the predicate.
train_split = dataset['train'].filter(_is_yes_no, input_columns=['answer'])
test_split = dataset['test'].filter(_is_yes_no, input_columns=['answer'])

# Extract features for the filtered train and test splits
train_features = extract_features(train_split)
test_features = extract_features(test_split)

# Convert labels ("yes"/"no" to binary); rows stay aligned with the features
# because both come from the same filtered split.
train_labels = np.array([1 if answer.strip().lower() == 'yes' else 0 for answer in train_split['answer']])
test_labels = np.array([1 if answer.strip().lower() == 'yes' else 0 for answer in test_split['answer']])
train_labels_cat = to_categorical(train_labels, num_classes=2)
test_labels_cat = to_categorical(test_labels, num_classes=2)

# Two-hidden-layer classifier over the fused feature vector (Functional API).
# The input width is taken from the extracted training features.
input_layer = Input(shape=(train_features.shape[1],))
hidden = Dense(1024, activation='relu', kernel_regularizer=l2(0.01))(input_layer)
hidden = BatchNormalization()(hidden)
hidden = Dropout(0.5)(hidden)
hidden = Dense(512, activation='relu', kernel_regularizer=l2(0.01))(hidden)
hidden = Dropout(0.5)(hidden)
output_layer = Dense(2, activation='softmax')(hidden)  # two-way softmax: yes / no

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Learning rate scheduler
def scheduler(epoch, lr):
    """Hold the learning rate for the first 10 epochs, then decay it.

    Args:
        epoch: Epoch index passed in by the LearningRateScheduler callback.
        lr: Current learning rate.

    Returns:
        ``lr`` unchanged while ``epoch < 10``; afterwards
        ``lr * exp(-0.1)`` as a plain Python float.
    """
    # Local import keeps the block self-contained; math.exp avoids building a
    # TensorFlow tensor (and its float32 rounding) for a scalar constant.
    import math

    if epoch < 10:
        return lr
    return float(lr * math.exp(-0.1))

# Wrap the schedule function in the Keras callback.
lr_scheduler = LearningRateScheduler(scheduler)

# Fit for 20 epochs with the schedule attached; the test split is passed as
# validation data.
history = model.fit(
    train_features,
    train_labels_cat,
    epochs=20,
    batch_size=32,
    validation_data=(test_features, test_labels_cat),
    callbacks=[lr_scheduler],
)

# Score the trained model on the test split.
loss, accuracy = model.evaluate(test_features, test_labels_cat)
print(f'Test Loss: {loss:.3f}')
print(f'Test Accuracy: {accuracy:.3f}')

# Side-by-side learning curves: accuracy on the left, loss on the right.
plt.figure(figsize=(12, 4))
for panel_index, train_key, val_key, quantity in (
    (1, 'accuracy', 'val_accuracy', 'Accuracy'),
    (2, 'loss', 'val_loss', 'Loss'),
):
    plt.subplot(1, 2, panel_index)
    plt.plot(history.history[train_key], label=f'Training {quantity}')
    plt.plot(history.history[val_key], label=f'Validation {quantity}')
    plt.title(f'Model {quantity}')
    plt.xlabel('Epoch')
    plt.ylabel(quantity)
    plt.legend()

plt.show()

# Persist this model in the native Keras format.
model.save('vqa_model_roberta_finetuned.keras')

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, Flatten, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

# Define the VQA classification model using Functional API.
# Relies on train_features / train_labels_cat / test_features /
# test_labels_cat already existing from an earlier cell.
input_layer = Input(shape=(train_features.shape[1],))
x = Dense(512, activation='relu', kernel_regularizer=l2(0.01))(input_layer)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)  # Dropout for regularization
x = Dense(256, activation='relu', kernel_regularizer=l2(0.01))(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)  # Dropout for regularization
output_layer = Dense(2, activation='softmax')(x)  # Binary output (yes/no)

model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model with a smaller learning rate
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train for up to 20 epochs. Early stopping restores the weights with the best
# val_loss, so the monitored data must not be the test split. Otherwise the
# checkpoint is selected on test data and the reported test metrics are
# optimistically biased. Hold out part of the training data instead.
# Keras takes the validation_split samples from the end of the arrays,
# before shuffling.
history = model.fit(train_features, train_labels_cat, epochs=20, batch_size=32,
                    validation_split=0.2,
                    callbacks=[early_stopping])

# Evaluate the model on the untouched test split
loss, accuracy = model.evaluate(test_features, test_labels_cat)
print(f'Test Loss: {loss:.3f}')
print(f'Test Accuracy: {accuracy:.3f}')

# Plot the learning curves (validation curves refer to the held-out part of
# the training data)
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Learning Curve - Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Learning Curve - Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()