In [None]:
import requests
import torch
import json
import os
from tqdm import tqdm
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

# Nominal accelerator name. Nothing in this setup section reads it any more;
# it is kept as a module-level name in case other code refers to it.
device = "cuda"

# Hugging Face Hub identifier of the checkpoint to load.
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# `torch_dtype` loads the weights directly in bfloat16 and `device_map="auto"`
# lets Accelerate decide where each module lives.  Because of that, the model
# must not be moved or re-cast afterwards: `.to(...)` / `.bfloat16()` on a
# dispatched model is redundant at best and, when the weights are spread over
# several devices or offloaded, triggers Accelerate warnings or errors.
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Inference only: switch off training-time behaviour such as dropout.
model.eval()

# Processor loaded from the same checkpoint id as the model.
processor = AutoProcessor.from_pretrained(model_id)


def run_inference(image_path: str, question: str) -> str:
    """Ask the vision-language model a question about a single image.

    Args:
        image_path: Path of the image file; opened with ``PIL.Image.open``.
        question: Question text placed in the user turn after the image.

    Returns:
        The first sequence returned by ``model.generate`` decoded with
        ``processor.decode`` using its default settings.  The sequence is
        decoded as-is (no slicing, no special-token filtering).
    """
    messages = [
        {"role": "user", "content": [
            {"type": "image", "image": image_path},
            {"type": "text", "text": question},
            {"type": "text", "text": "Give your answer in a crisp manner. Do not add any preamble or postamble."}
        ]}
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

    # The context manager closes the underlying file handle.  The processor
    # call stays inside the block because PIL reads pixel data lazily and
    # needs the file to still be open at that point.
    with Image.open(image_path) as image:
        inputs = processor(
            image,
            prompt,
            # The chat template is expected to have already emitted the
            # begin-of-text token; letting the tokenizer add special tokens
            # again would duplicate it (the model card's usage example passes
            # this flag for the same reason).
            add_special_tokens=False,
            return_tensors="pt",
        ).to(model.device)

    # Short answers only: generation is capped at 30 new tokens.
    output = model.generate(**inputs, max_new_tokens=30)

    return processor.decode(output[0])