In [None]:
import os
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

# --- Configuration -----------------------------------------------------------
# Dataset layout: <dataset_dir>/resized holds the images and
# <dataset_dir>/captions.csv holds the reference captions.
dataset_dir = 'RISCM'
images_dir = os.path.join(dataset_dir, 'resized')
captions_path = os.path.join(dataset_dir, 'captions.csv')
model_name = "google/paligemma-3b-pt-224"
# Hugging Face access token for the gated model. Read it from the HF_TOKEN
# environment variable instead of pasting a secret into the notebook; when the
# variable is unset or blank, None is used so the libraries can fall back to
# the credentials stored by `huggingface-cli login`.
hf_token = (os.environ.get("HF_TOKEN") or "").strip() or None
# Prefer the GPU when one is available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Read the caption table and normalise its header: the column names are
# stripped of surrounding whitespace so they can be addressed by plain names.
df = pd.read_csv(captions_path).rename(columns=str.strip)

def load_image(image_filename):
    """Load an image from ``images_dir`` and return it in RGB mode.

    Args:
        image_filename: File name of the image inside ``images_dir``.
            Leading and trailing whitespace is stripped before the path
            is built.

    Returns:
        A PIL image converted to ``'RGB'``.
    """
    image_path = os.path.join(images_dir, image_filename.strip())
    # Image.open keeps the underlying file open; the context manager closes it
    # as soon as the converted image has been produced, instead of leaving the
    # handle open until garbage collection.
    with Image.open(image_path) as source:
        return source.convert('RGB')

image_index = 5  # Select image by index

# Preview the selected sample together with its reference captions.
# iloc accepts positions in [-len(df), len(df)); anything outside that range
# is reported instead of being skipped silently.
if -len(df) <= image_index < len(df):
    row = df.iloc[image_index]
    img = load_image(row['image'])
    # Captions are read from the columns caption_1 .. caption_5. An empty CSV
    # cell is returned by pandas as NaN (a float), so missing captions are
    # skipped rather than failing on .strip().
    captions = [
        str(row[f'caption_{i}']).strip()
        for i in range(1, 6)
        if pd.notna(row[f'caption_{i}'])
    ]

    plt.figure(figsize=(3, 3))
    plt.imshow(img)
    plt.axis('off')
    plt.title("\n".join(captions), fontsize=10)
    plt.tight_layout()
    plt.show()
else:
    print(f"image_index {image_index} is out of range for a dataset of {len(df)} rows")

In [None]:
# Processor: prepares the text prompt and the image for the model.
processor = AutoProcessor.from_pretrained(model_name, token=hf_token)

# Model: load the checkpoint first, then move it onto the selected device.
model = AutoModelForImageTextToText.from_pretrained(model_name, token=hf_token)
model = model.to(device)

In [None]:
row = df.iloc[image_index]
img = load_image(row['image'])

# Show the image that is about to be captioned.
plt.figure(figsize=(3, 3))
plt.imshow(img)
plt.axis('off')
plt.title(f"Image: {row['image']}", fontsize=10)
plt.tight_layout()
plt.show()

# Prompt handed to the processor together with the image.
input_text = "<image>Describe this image in detail\n"
inputs = processor(text=input_text, images=img, return_tensors="pt", padding="longest", do_convert_rgb=True).to(device)
# Match the inputs to the dtype of the model weights.
inputs = inputs.to(dtype=model.dtype)
# Number of prompt tokens; used below to separate the prompt from the answer.
prompt_length = inputs["input_ids"].shape[-1]

# Generate output using the model (no gradients are needed for inference).
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=64)
# generate() returns the prompt tokens followed by the newly generated ones;
# decode only the new tokens so the printed caption does not echo the prompt.
result = processor.decode(output[0][prompt_length:], skip_special_tokens=True)
print(f"\nPaligemma output: {result}")
