In [None]:
import requests
from PIL import Image
from transformers import Idefics2Processor, Idefics2ForConditionalGeneration, BitsAndBytesConfig
from accelerate import Accelerator
import torch

CACHE_DIR = "./cache"
device = Accelerator().device

# The idefics2-8b model loaded on an RTX 3090 with 23GB of memory is insufficient 
# and needs to be loaded at a lower level, such as 4-bit, to run.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_use_double_quant=True, 
    bnb_4bit_quant_type="nf4", 
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_enable_fp32_cpu_offload=True
)

processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b", cache_dir=CACHE_DIR)
model = Idefics2ForConditionalGeneration.from_pretrained("HuggingFaceM4/idefics2-8b", cache_dir=CACHE_DIR, quantization_config=bnb_config)
model.to(device)

  from .autonotebook import tqdm as notebook_tqdm
Chat templates should be in a 'chat_template.jinja' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.
Loading checkpoint shards: 100%|██████████| 7/7 [01:27<00:00, 12.44s/it]


Idefics2ForConditionalGeneration(
  (model): Idefics2Model(
    (vision_model): Idefics2VisionTransformer(
      (embeddings): Idefics2VisionEmbeddings(
        (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
        (position_embedding): Embedding(4900, 1152)
      )
      (encoder): Idefics2Encoder(
        (layers): ModuleList(
          (0-26): 27 x Idefics2EncoderLayer(
            (self_attn): Idefics2VisionAttention(
              (k_proj): Linear4bit(in_features=1152, out_features=1152, bias=True)
              (v_proj): Linear4bit(in_features=1152, out_features=1152, bias=True)
              (q_proj): Linear4bit(in_features=1152, out_features=1152, bias=True)
              (out_proj): Linear4bit(in_features=1152, out_features=1152, bias=True)
            )
            (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
            (mlp): Idefics2VisionMLP(
              (activation_fn): PytorchGELUTanh()
              

In [2]:
url_1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
url_2 = "http://images.cocodataset.org/val2017/000000219578.jpg"

image_1 = Image.open(requests.get(url_1, stream=True).raw)
image_2 = Image.open(requests.get(url_2, stream=True).raw)
images = [image_1, image_2]

messages = [{
    "role": "user",
    "content": [
        {"type": "text", "text": "What’s the difference between these two images?"},
        {"type": "image"},
        {"type": "image"},
    ],
}]

# at inference time, one needs to pass `add_generation_prompt=True` in order to make sure the model completes the prompt
text = processor.apply_chat_template(messages, add_generation_prompt=True)
print(text) # 'User: What’s the difference between these two images?<image><image><end_of_utterance>\nAssistant:'

inputs = processor(images=images, text=text, return_tensors="pt").to(model.device)
generated_text = model.generate(**inputs, max_new_tokens=500)
generated_text = processor.batch_decode(generated_text, skip_special_tokens=True)[0]
print("Generated text:", generated_text) # Result from model

User: What’s the difference between these two images?<image><image><end_of_utterance>
Assistant:
Generated text: User: What’s the difference between these two images? 
Assistant: The cats and dogs are snuggling together on the couch.
