In [None]:
import requests
from PIL import Image
from transformers import Idefics2Processor, Idefics2ForConditionalGeneration
from accelerate import Accelerator
import torch

# Local directory where downloaded checkpoint files are cached.
CACHE_DIR = "./cache"
# Single source of truth for the checkpoint so the processor and the model
# can never be loaded from different repositories by accident.
MODEL_ID = "HuggingFaceM4/idefics2-8b"

# Let accelerate pick the compute device for the current environment.
device = Accelerator().device

processor = Idefics2Processor.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR)
# NOTE(review): no dtype is requested here, so the library default applies.
# For an 8B-parameter model, half precision on GPU may be needed to fit in
# memory -- confirm against the installed transformers version.
model = Idefics2ForConditionalGeneration.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR)
# Assign the result instead of leaving `model.to(device)` as the cell's last
# expression, which would make the notebook print the module's entire repr.
model = model.to(device)

In [None]:
# Two sample images for the model to compare.
url_1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
url_2 = "http://images.cocodataset.org/val2017/000000219578.jpg"

# `requests` waits indefinitely unless a timeout is given explicitly.
REQUEST_TIMEOUT_S = 30
# Upper bound on the number of tokens generated for the answer.
MAX_NEW_TOKENS = 500


def load_image(url, timeout=REQUEST_TIMEOUT_S):
    """Download `url` and return it as a fully decoded PIL image.

    Args:
        url: HTTP(S) address of the image file.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A `PIL.Image.Image` whose pixel data is already loaded, so it no
        longer depends on the (closed) HTTP response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    # The context manager releases the connection even if decoding fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail here with a clear HTTP error rather than letting PIL choke
        # on the bytes of an error page.
        response.raise_for_status()
        image = Image.open(response.raw)
        # Force decoding while the response stream is still open.
        image.load()
    return image


image_1 = load_image(url_1)
image_2 = load_image(url_2)
images = [image_1, image_2]

# One user turn: a text question followed by two image placeholders, in the
# same order as the entries of `images`.
messages = [{
    "role": "user",
    "content": [
        {"type": "text", "text": "What’s the difference between these two images?"},
        {"type": "image"},
        {"type": "image"},
    ],
}]

# At inference time, pass `add_generation_prompt=True` so the template ends
# with the assistant prefix and the model completes the prompt.
text = processor.apply_chat_template(messages, add_generation_prompt=True)
print(text)  # 'User: What’s the difference between these two images?<image><image><end_of_utterance>\nAssistant:'

inputs = processor(images=images, text=text, return_tensors="pt").to(model.device)
# Keep token ids and decoded text under separate names so neither variable
# changes type halfway through the cell.
generated_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
# NOTE(review): the decoded string presumably contains the prompt as well as
# the answer -- confirm before parsing it downstream.
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("Generated text:", generated_text)  # Result from model