In [None]:
!pip list

In [None]:
!pip install huggingface_hub transformers accelerate

In [1]:
import torch
import json
import requests
from PIL import Image
from transformers import AutoModelForCausalLM

In [2]:
# Hugging Face Hub repository id of the checkpoint to load.
MODEL_PATH = "AIDC-AI/Ovis2.5-2B"

# Token limits, forwarded to model.generate() by the inference cell.
max_new_tokens = 3072
thinking_budget = 2048

# Thinking-mode switches (both off). `enable_thinking` is passed to both
# preprocess_inputs() and generate(); `enable_thinking_budget` to generate().
enable_thinking = False
enable_thinking_budget = False

# Load the weights in bfloat16. trust_remote_code=True allows the modelling
# code shipped with the checkpoint repository to be executed.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH, trust_remote_code=True, torch_dtype=torch.bfloat16
)
# Move the model to the current CUDA device (requires a GPU).
model = model.cuda()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# List the working directory to confirm the input image (bird.jpg) is present.
!dir

bird.jpg  Ovis2.5-2B.ipynb


In [4]:
# Path of the input image, relative to the notebook's working directory.
img = "bird.jpg"

In [5]:
# Open the image via the `img` path set in the previous cell instead of
# repeating the literal. Image.open() is lazy and keeps the file handle open;
# load() decodes the pixels now so Pillow can release the handle
# (it does so for single-frame files such as JPEG).
image = Image.open(img)
image.load()

# Single-turn chat: one user message carrying the image plus a text instruction.
messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": "Describe the image in 100 words"},
    ],
}]

# preprocess_inputs is provided by the model's remote code. This cell relies
# only on it returning token ids plus two image-related values that may be None.
input_ids, pixel_values, grid_thws = model.preprocess_inputs(
    messages=messages,
    add_generation_prompt=True,
    enable_thinking=enable_thinking,
)

# Move the inputs to whichever device holds the model's weights rather than
# hard-coding .cuda(); with the model on the GPU this is the same device.
device = next(model.parameters()).device
input_ids = input_ids.to(device)
pixel_values = pixel_values.to(device) if pixel_values is not None else None
grid_thws = grid_thws.to(device) if grid_thws is not None else None

# Inference only: make sure no autograd state is recorded while generating.
# NOTE(review): generate() may already disable gradients internally; the
# explicit context is harmless in that case.
with torch.no_grad():
    outputs = model.generate(
        inputs=input_ids,
        pixel_values=pixel_values,
        grid_thws=grid_thws,
        enable_thinking=enable_thinking,
        enable_thinking_budget=enable_thinking_budget,
        max_new_tokens=max_new_tokens,
        thinking_budget=thinking_budget,
    )

# Decode the first (only) returned sequence, dropping special tokens.
response = model.text_tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


A hummingbird hovers near a vibrant flower, feeding on nectar. The bird's iridescent green feathers and long beak are prominent. The flower features bright orange and yellow petals, with some dried white ones at the bottom. The background is softly blurred, creating a warm, yellowish hue that highlights the subjects. Another similar flower is partially visible on the left.


In [1]:
# Print the NVIDIA driver/GPU status table (GPU model, memory usage, utilisation).
!nvidia-smi

Thu Sep 18 17:08:09 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3080 Ti     Off | 00000000:01:00.0 Off |                  N/A |
|  0%   27C    P8               5W / 350W |      3MiB / 12288MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [10]:
# Snapshot the model configuration as a plain dict, then render it as
# 2-space-indented JSON so the nested sections are easy to read.
cfg = model.config.to_dict()
cfg_json = json.dumps(cfg, indent=2)
print(cfg_json)

{
  "return_dict": true,
  "output_hidden_states": false,
  "output_attentions": false,
  "torchscript": false,
  "torch_dtype": "bfloat16",
  "use_bfloat16": false,
  "tf_legacy_loss": false,
  "pruned_heads": {},
  "tie_word_embeddings": true,
  "chunk_size_feed_forward": 0,
  "is_encoder_decoder": false,
  "is_decoder": false,
  "cross_attention_hidden_size": null,
  "add_cross_attention": false,
  "tie_encoder_decoder": false,
  "max_length": 20,
  "min_length": 0,
  "do_sample": false,
  "early_stopping": false,
  "num_beams": 1,
  "num_beam_groups": 1,
  "diversity_penalty": 0.0,
  "temperature": 1.0,
  "top_k": 50,
  "top_p": 1.0,
  "typical_p": 1.0,
  "repetition_penalty": 1.0,
  "length_penalty": 1.0,
  "no_repeat_ngram_size": 0,
  "encoder_no_repeat_ngram_size": 0,
  "bad_words_ids": null,
  "num_return_sequences": 1,
  "output_scores": false,
  "return_dict_in_generate": false,
  "forced_bos_token_id": null,
  "forced_eos_token_id": null,
  "remove_invalid_values": false,
  