In [1]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image

# Name of the compute device this notebook targets (CUDA GPU).
DEVICE = "cuda"


import torch, random, numpy as np
from transformers import set_seed

def set_all_seeds(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG, PyTorch (CPU and
    every CUDA device), and finally calls ``transformers.set_seed``. It also
    switches cuDNN to deterministic kernels and disables the cuDNN autotuner.

    Args:
        seed: Integer seed passed to every generator. Defaults to 42.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    set_seed(seed)
    torch.backends.cudnn.deterministic = True
    # The autotuner may select different convolution algorithms from run to
    # run, so force it off in case an earlier cell or library enabled it.
    torch.backends.cudnn.benchmark = False

# Seed everything with 9 rather than the function's default of 42.
set_all_seeds(9)

from safetensors import safe_open

In [8]:
# Rust: cargo commands for running the smol_vlm package
# cargo run -p smol_vlm --features cuda -- -i .vscode/angela-porter-2021-jan-25.jpg -p "Can you describe the image?" --sample-length 500
# cargo run -p smol_vlm --features cuda -- -p "A real-valued function f defined on the real line is called an even function if f(-t) = f(t) for each real number t. Prove that the set of even functions defined on the real line with the operations of addition and scalar multiplication defined in Example 3 is a vector space." --sample-length 200
# cargo run -p smol_vlm --features cuda -- -p "Can you describe the image?" --sample-length 500
# cargo run -p smol_vlm --features cuda -- -p "What is life?" --sample-length 500

# Three local test images (absolute paths on the author's machine).
img = [
    '/home/ahc/Documents/kornia-rs/.vscode/featured-stovetop-burgers-recipe-300x300.jpg',
    '/home/ahc/Documents/kornia-rs/.vscode/witness-raw-beauty-majestic-mountain-landscape-meticulously-captured-to-showcase-intricate-details-nature-s-artistry-349750953.jpg',
    '/home/ahc/Documents/kornia-rs/.vscode/0965b93f250920c1ee8a9a2b0ba0c291.jpg',
]

# Open all three files in list order; the chat message below only has
# placeholders for two of them.
image1, image2, image3 = (Image.open(path) for path in img)

for idx, picture in enumerate((image1, image2, image3), start=1):
    print(f"Image {idx} size: {picture.size}")

# One user turn: two image placeholders followed by the text question.
user_content = [
    {"type": "image"},
    {"type": "image"},
    # {"type": "image"},
    {"type": "text", "text": "Can you describe the two images?"},
]
messages = [{"role": "user", "content": user_content}]

processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")

Image 1 size: (300, 300)
Image 2 size: (800, 450)
Image 3 size: (474, 474)


In [3]:
# Load SmolVLM-Instruct in bfloat16 directly onto the device named by DEVICE.
# The Flash Attention option is left commented out.
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    torch_dtype=torch.bfloat16,
    # _attn_implementation="flash_attention_2",  # Commented out Flash Attention
    device_map=DEVICE,  # reuse the notebook-wide constant instead of a second "cuda" literal
)
model.eval();  # evaluation mode; the trailing ';' suppresses the model repr

In [9]:
# Prepare inputs: render the chat messages into a prompt string, then
# tokenize it together with the two images referenced by the placeholders.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image1, image2], return_tensors="pt")
# Follow the model's own device rather than repeating a hard-coded "cuda".
inputs = inputs.to(model.device)

print(inputs["input_ids"])
# Generate outputs
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=500,
        # repetition_penalty=1.1,  # Apply repeat penalty
        output_scores=True,           # Return the scores for each generated token
        return_dict_in_generate=True, # Return detailed output object
        do_sample=False,  # Use greedy decoding (highest logit)
    )

# Token ids of the first (only) sequence in the batch, shown as the cell output.
outputs.sequences[0]

tensor([[    1, 11126,    42,  ...,  9519,  9531,    42]], device='cuda:0')


tensor([    1, 11126,    42,  ..., 20232,    30, 49154], device='cuda:0')

In [10]:
# Turn the full output sequence (prompt plus generated tokens) back into text.
generated_ids = outputs.sequences[0]
processor.tokenizer.decode(generated_ids, skip_special_tokens=True)

'User:<row_1_col_1><row_1_col_2><row_1_col_3><row_1_col_4>\n<row_2_col_1><row_2_col_2><row_2_col_3><row_2_col_4>\n<row_3_col_1><row_3_col_2><row_3_col_3><row_3_col_4>\n<row_4_col_1><row_4_col_2><row_4_col_3><row_4_col_4>\n\n<global-img><row_1_col_1><row_1_col_2><row_1_col_3><row_1_col_4>\n<row_2_col_1><row_2_col_2><row_2_col_3><row_2_col_4>\n<row_3_col_1><row_3_col_2><row_3_col_3><row_3_col_4>\n\n<global-img>Can you describe the two images?\nAssistant: The first image shows a close-up of a burger with lettuce, tomato, and cheese on a sesame seed bun. The burger is on a white plate with a bowl of pickles and a wooden table. The second image shows a mountain range with snow-capped peaks and green valleys.'

In [None]:
# NOTE(review): this appears to be a hand-pasted decode result from a
# three-image prompt, kept as a bare string for reference — confirm its origin.
# The response text describes only a single image.
"""
'User: Can you describe these three images?
      <row_1_col_1><row_1_col_2><row_1_col_3><row_1_col_4>
    \n<row_2_col_1><row_2_col_2><row_2_col_3><row_2_col_4>
    \n<row_3_col_1><row_3_col_2><row_3_col_3><row_3_col_4>
    \n<row_4_col_1><row_4_col_2><row_4_col_3><row_4_col_4>
    \n\n<global-img>
    
      <row_1_col_1><row_1_col_2><row_1_col_3><row_1_col_4>
    \n<row_2_col_1><row_2_col_2><row_2_col_3><row_2_col_4>
    \n<row_3_col_1><row_3_col_2><row_3_col_3><row_3_col_4>
    \n\n<global-img>
    
      <row_1_col_1><row_1_col_2><row_1_col_3><row_1_col_4>
    \n<row_2_col_1><row_2_col_2><row_2_col_3><row_2_col_4>
    \n<row_3_col_1><row_3_col_2><row_3_col_3><row_3_col_4>
    \n<row_4_col_1><row_4_col_2><row_4_col_3><row_4_col_4>
    \n\n<global-img>
\nAssistant: The image is an anime-style drawing of a pair of feet wearing Nike brand sneakers.
The feet are in the air, and the person is wearing blue jeans. The sky is blue with white clouds,
and the sun is shining.'
"""

"\n'User: Can you describe these three images?\n      <row_1_col_1><row_1_col_2><row_1_col_3><row_1_col_4>\n    \n<row_2_col_1><row_2_col_2><row_2_col_3><row_2_col_4>\n    \n<row_3_col_1><row_3_col_2><row_3_col_3><row_3_col_4>\n    \n<row_4_col_1><row_4_col_2><row_4_col_3><row_4_col_4>\n    \n\n<global-img><row_1_col_1><row_1_col_2><row_1_col_3><row_1_col_4>\n\n    \n<row_2_col_1><row_2_col_2><row_2_col_3><row_2_col_4>\n    \n<row_3_col_1><row_3_col_2><row_3_col_3><row_3_col_4>\n    \n\n<global-img><row_1_col_1><row_1_col_2><row_1_col_3><row_1_col_4>\n\n    \n<row_2_col_1><row_2_col_2><row_2_col_3><row_2_col_4>\n    \n<row_3_col_1><row_3_col_2><row_3_col_3><row_3_col_4>\n<row_4_col_1><row_4_col_2><row_4_col_3><row_4_col_4>\n\n<global-img>\n\nAssistant: The image is an anime-style drawing of a pair of feet wearing Nike brand sneakers.\nThe feet are in the air, and the person is wearing blue jeans. The sky is blue with white clouds,\nand the sun is shining.'\n"