In [None]:
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor
from datasets import load_dataset
import random

In [None]:
# Load the ROCO-radiology dataset
ds = load_dataset("mdwiratathya/ROCO-radiology")

# Randomly sample 15% of the training split.
# A dedicated, seeded generator makes the subset identical on every
# Restart & Run All without touching the global `random` state.
SAMPLE_FRACTION = 0.15
SAMPLE_SEED = 42
train_split = ds['train']
sampled_size = int(len(train_split) * SAMPLE_FRACTION)
sample_indices = random.Random(SAMPLE_SEED).sample(range(len(train_split)), sampled_size)

# Keep the subset as a dataset selection instead of a Python list of rows, so
# rows are read when they are iterated rather than all being materialized here.
# Iterating `sampled_data` still yields one dict per row.
sampled_data = train_split.select(sample_indices)

In [None]:
# Checkpoint shared by the processor and the model
model_id = "meta-llama/Llama-3.2-11B-Vision"

# Processor: prepares the image + text inputs for the model
processor = AutoProcessor.from_pretrained(model_id)

# Model: bfloat16 weights, with device placement left to device_map="auto"
model = MllamaForConditionalGeneration.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.bfloat16
)

In [None]:
def preprocess_image(img, size=(224, 224), verbose=True):
    """Convert an image to an RGB PIL image, optionally resized.

    Args:
        img: A ``PIL.Image.Image``, or any object accepted by
            ``Image.fromarray`` (for example a NumPy array).
        size: ``(width, height)`` to resize to. Defaults to ``(224, 224)``.
            The resize goes straight to this size, so the aspect ratio is
            not preserved. Pass ``None`` to keep the original resolution.
        verbose: When true (the default), print the original image size.

    Returns:
        PIL.Image.Image: The image in RGB mode, resized to ``size`` unless
        ``size`` is ``None``.
    """
    # Convert to PIL Image if not already
    if not isinstance(img, Image.Image):
        img = Image.fromarray(img)

    # Report the size before any resizing
    if verbose:
        width, height = img.size
        print(f"Original image size: {width}x{height}")

    # Convert the image to RGB if necessary (e.g. grayscale or palette images)
    if img.mode != 'RGB':
        img = img.convert('RGB')

    # Resize only when a target size was requested
    if size is not None:
        img = img.resize(size)
    return img

# Loop through the sampled data and generate descriptions.
# Each decoded text is also appended to `generated_descriptions`, so the
# results stay available after the loop instead of existing only as printed
# output.
generated_descriptions = []
for sample in sampled_data:
    img = sample['image']  # Get the image
    caption = sample['caption']  # Get the caption

    # Preprocess the image
    img_preprocessed = preprocess_image(img)

    # Prepare prompt
    # NOTE(review): the dataset caption is part of the prompt, so the decoded
    # text below starts with that caption followed by the model's
    # continuation - confirm this is intended.
    prompt = f"<|image|><|begin_of_text|>{caption}"

    # Process image and prompt for model input
    inputs = processor(images=img_preprocessed, text=prompt, return_tensors="pt").to(model.device)

    # Generate output from the model (at most 30 new tokens)
    output = model.generate(**inputs, max_new_tokens=30)

    # Decode the output; skip_special_tokens drops markers such as
    # <|image|> and <|begin_of_text|> from the readable text
    description = processor.decode(output[0], skip_special_tokens=True)
    generated_descriptions.append(description)

    print(f"Generated Description: {description}")

In [None]:
# Write the processor files to their own directory
processor.save_pretrained(save_directory="./llama_trained_processor")

# Write the model (weights + config) to disk
model.save_pretrained(save_directory="./llama_trained_model")

In [None]:
# Print the model's configuration object (model.config), not a layer-by-layer summary
print(model.config)

In [None]:
# Relocate the model onto the CPU first, so the save below runs from there
model = model.to('cpu')

# Write the processor files
processor.save_pretrained(save_directory="./llama_trained_processor")

# Write the model (weights + config)
model.save_pretrained(save_directory="./llama_trained_model")

In [None]:
# Write the processor files (no sharding option is passed here)
processor.save_pretrained(save_directory="./llama_trained_processor")

# Write the model, passing max_shard_size="1GB" to request smaller shards
model.save_pretrained(
    save_directory="./llama_trained_model",
    max_shard_size="1GB",
)

# Load the saved model and processor

In [None]:
from transformers import MllamaForConditionalGeneration, AutoProcessor

# Load the processor
processor = AutoProcessor.from_pretrained("./llama_trained_processor")

# Load the model.
# torch_dtype="auto" asks the loader to use the dtype recorded with the saved
# checkpoint rather than the library default, and device_map="auto" lets the
# loader place the weights on the available device(s), matching how the model
# was loaded originally. The string form is used so this cell does not depend
# on `torch` having been imported in the current kernel.
model = MllamaForConditionalGeneration.from_pretrained(
    "./llama_trained_model",
    torch_dtype="auto",
    device_map="auto",
)

In [None]:
from google.colab import files
from PIL import Image
import io

# Ask the user for an image file; the result maps each filename to its raw bytes
uploaded = files.upload()
if not uploaded:
    # A cancelled or empty upload would otherwise fail with an opaque IndexError
    raise ValueError("No file was uploaded; please select an image to describe.")

# Use the first uploaded file
image_bytes = next(iter(uploaded.values()))
image = Image.open(io.BytesIO(image_bytes))
# preprocess image
img_preprocessed = preprocess_image(image)

# Image-only prompt: the model generates text with no caption prefix
prompt = "<|image|><|begin_of_text|>"

# Process image and prompt for model input
inputs = processor(images=img_preprocessed, text=prompt, return_tensors="pt").to(model.device)
# Generate output from the model (at most 30 new tokens)
output = model.generate(**inputs, max_new_tokens=30)
# Decode the output; skip_special_tokens keeps markers such as <|image|> and
# <|begin_of_text|> out of the text that is later shown to the user
description = processor.decode(output[0], skip_special_tokens=True)

In [None]:
import textwrap

import matplotlib.pyplot as plt

# Show the uploaded image with the generated description as its title.
# The title is wrapped onto several lines so a long description does not
# become a single line that runs far past the image.
fig, ax = plt.subplots()
ax.imshow(image)
ax.set_title(textwrap.fill(f"Generated Description: {description}", width=60))
plt.show()