In [None]:
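# Environment setup. Run in a shell, or uncomment and use `%pip` so the
# packages land in the kernel's environment.
# pip install transformers==4.56.2
# pip install vllm==0.9.1 blobfile flash-attn --no-build-isolation
# NOTE: the source install below replaces the pinned transformers release
# above if both are run; use one or the other.
# pip install git+https://github.com/huggingface/transformers.git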

In [1]:
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, BitsAndBytesConfig

# Hub id of the checkpoint to load. CACHE_DIR is defined for convenience but
# is not passed to any call in this cell; hand it to `from_pretrained` as
# `cache_dir=CACHE_DIR` to keep the downloaded weights next to the notebook.
model_path = "moonshotai/Kimi-VL-A3B-Thinking"
CACHE_DIR = "./cache"

# 4-bit NF4 weights with double quantization; compute runs in bfloat16.
# The fp32 CPU-offload flag is set so that `device_map="auto"` may place
# modules on the CPU when they do not fit on the GPU.
bnb_config = BitsAndBytesConfig(
    llm_int8_enable_fp32_cpu_offload=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# `trust_remote_code=True` executes modelling code shipped in the Hub repo,
# so only use it with a repository you trust.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype="auto",
)

# The processor is loaded from the same repo as the model.
processor = AutoProcessor.from_pretrained(
    model_path,
    trust_remote_code=True,
)


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [2]:
# Images shown to the model, in prompt order.
# NOTE(review): these are absolute, machine-specific paths; adjust them when
# running the notebook on another machine.
image_paths = [
    "/root/VLM-high-resolution/output_visualization_box.png",
    "/root/VLM-high-resolution/output_visualization_points.png",
]
images = [Image.open(path) for path in image_paths]

# A single user turn: one image entry per path, followed by the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image_path} for image_path in image_paths
        ] + [{"type": "text", "text": "Please infer step by step who this manuscript belongs to and what it records"}],
    },
]

# Render the conversation through the chat template, then let the processor
# combine that prompt with the images and move the batch to the model's device.
text = processor.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
inputs = processor(images=images, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device)

# Seed the torch RNG so the sampled response is repeatable across
# Restart & Run All on the same setup.
SEED = 0
torch.manual_seed(SEED)

# `temperature` only takes effect in sampling modes, so sampling is enabled
# explicitly instead of relying on the checkpoint's default generation config.
generated_ids = model.generate(
    **inputs,
    max_new_tokens=32768,
    do_sample=True,
    temperature=0.8,
)

# Drop the prompt tokens from each sequence so only newly generated tokens
# are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
response = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(response)


◁think▷Okay, let's try to figure out who this manuscript belongs to and what it's recording. The user provided two images of a Corgi lying on grass. Both images are similar, so maybe the task is to determine the owner and the content of the manuscript based on these images.

First, I need to look for any clues in the images. The Corgi is wearing a red collar. The collar might have a tag or something that could indicate ownership. But the images don't show any clear markings like a name tag or ID. Maybe the collar's color or design could hint at something. Alternatively, the background might have some info. The setting is a grassy field with trees in the distance, which is a common place for photos. But how does that help?

Wait, the user mentioned a "manuscript." Maybe the term here is being used in a non-traditional sense, like a journal or a log. So perhaps this is a daily diary entry or a record of an event involving the Corgi. The two images might be from different days or differen