In [1]:
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
import torch
from accelerate import dispatch_model, infer_auto_device_map
import gc

# Report every CUDA device visible to this process.
gpu_count = torch.cuda.device_count()
print(f"Available GPUs: {gpu_count}")
for gpu_index in range(gpu_count):
    gpu_name = torch.cuda.get_device_name(gpu_index)
    print(f"GPU {gpu_index}: {gpu_name}")

# Start from a clean slate before the model is loaded: release cached CUDA
# memory, then run a garbage-collection pass.
torch.cuda.empty_cache()
gc.collect()

# Single checkpoint id shared by the processor and the model so the two can
# never drift apart.
MODEL_ID = "llava-hf/llava-v1.6-mistral-7b-hf"

processor = LlavaNextProcessor.from_pretrained(MODEL_ID)

# Cap every visible GPU at 13GB. The budget is built from the GPUs that are
# actually present instead of hard-coding devices 0 and 1, so the cell also
# works on machines with a different GPU count.
per_gpu_memory = {
    gpu_index: "13GB" for gpu_index in range(torch.cuda.device_count())
}

model = LlavaNextForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,  # load the weights in half precision
    low_cpu_mem_usage=True,
    device_map="auto",  # let the library place the layers on the available devices
    # With no GPU visible the budget is empty; pass None so the library
    # falls back to its own default instead of an empty mapping.
    max_memory=per_gpu_memory or None,
)

# Alternative: Manual device mapping if auto doesn't work well
# model = LlavaNextForConditionalGeneration.from_pretrained(
#     "llava-hf/llava-v1.6-mistral-7b-hf",
#     torch_dtype=torch.float16,
#     low_cpu_mem_usage=True,
# )
# device_map = infer_auto_device_map(model, max_memory={0: "13GB", 1: "13GB"})
# model = dispatch_model(model, device_map=device_map)

# Show where each weight-carrying submodule ended up after dispatch.
print("Model device map:")
for module_name, submodule in model.named_modules():
    # Modules without a weight (or with weight set to None) are skipped.
    weight = getattr(submodule, "weight", None)
    if not hasattr(weight, "device"):
        continue
    print(f"{module_name}: {weight.device}")

from PIL import Image
import requests

url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"

# Fail fast on a stalled connection or an HTTP error instead of handing an
# error page to PIL.
image_response = requests.get(url, stream=True, timeout=30)
image_response.raise_for_status()
image = Image.open(image_response.raw)

# Mistral-style instruction prompt; <image> marks where the image is inserted.
prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"

# Pass images/text by keyword: the processor expects images before text, and
# the positional (prompt, image) order only worked through a deprecated
# automatic swap. The tensors are then moved to the model's device.
inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

# Greedy decoding of at most 100 new tokens; the EOS token doubles as padding.
generation_kwargs = {
    "max_new_tokens": 100,
    "do_sample": False,
    "pad_token_id": processor.tokenizer.eos_token_id,
}

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    output = model.generate(**inputs, **generation_kwargs)

# Decode the first (only) sequence of the batch and drop special tokens.
first_sequence = output[0]
response = processor.decode(first_sequence, skip_special_tokens=True)
print(response)

# Per-GPU memory report: currently allocated / peak allocated, both divided
# by 1024**3.
bytes_per_unit = 1024**3
for gpu_index in range(torch.cuda.device_count()):
    allocated = torch.cuda.memory_allocated(gpu_index) / bytes_per_unit
    peak_allocated = torch.cuda.max_memory_allocated(gpu_index) / bytes_per_unit
    print(f"GPU {gpu_index} memory: {allocated:.2f} GB / {peak_allocated:.2f} GB")

2025-08-25 06:42:32.190104: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756104152.513277      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756104152.606106      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Available GPUs: 2
GPU 0: Tesla T4
GPU 1: Tesla T4


preprocessor_config.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/176 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Model device map:
model.vision_tower.vision_model.embeddings.patch_embedding: cuda:0
model.vision_tower.vision_model.embeddings.position_embedding: cuda:0
model.vision_tower.vision_model.pre_layrnorm: cuda:0
model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj: cuda:0
model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj: cuda:0
model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj: cuda:0
model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj: cuda:0
model.vision_tower.vision_model.encoder.layers.0.layer_norm1: cuda:0
model.vision_tower.vision_model.encoder.layers.0.mlp.fc1: cuda:0
model.vision_tower.vision_model.encoder.layers.0.mlp.fc2: cuda:0
model.vision_tower.vision_model.encoder.layers.0.layer_norm2: cuda:0
model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj: cuda:0
model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj: cuda:0
model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj: cuda:0
mode

You may have used the wrong order for inputs. `images` should be passed before `text`. The `images` and `text` inputs will be swapped. This behavior will be deprecated in transformers v4.47.


[INST]  
What is shown in this image? [/INST] The image appears to be a radar chart, which is a type of multivariate chart that displays values for multiple variables represented on axes starting from the same point. This particular radar chart is showing the performance of different models or systems across various metrics.

The axes represent different metrics or benchmarks, such as MM-Vet, MM-Vet, MM-Vet, MM-Vet, MM-Vet, MM-V
GPU 0 memory: 6.95 GB / 7.43 GB
GPU 1 memory: 7.16 GB / 7.61 GB


In [2]:
# Run the same captioning pipeline on a second image, reusing the processor
# and model loaded in the previous cell.
url = "https://static.toiimg.com/thumb/msid-113367147,width-1280,height-720,resizemode-72/113367147.jpg"

# Fail fast on a stalled connection or an HTTP error instead of handing an
# error page to PIL.
image_response = requests.get(url, stream=True, timeout=30)
image_response.raise_for_status()
image = Image.open(image_response.raw)

# Mistral-style instruction prompt; <image> marks where the image is inserted.
prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"

# Pass images/text by keyword: the processor expects images before text, and
# the positional (prompt, image) order relies on a deprecated automatic swap.
# The tensors are then moved to the model's device.
inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

# Inference only, so gradient tracking is switched off. Decoding is greedy,
# limited to 100 new tokens, with the EOS token used as padding.
with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=False,
        pad_token_id=processor.tokenizer.eos_token_id,
    )

# Decode the first (only) sequence of the batch and drop special tokens.
response = processor.decode(output[0], skip_special_tokens=True)
print(response)

# Per-GPU memory report: currently allocated / peak allocated, both divided
# by 1024**3.
bytes_per_unit = 1024**3
for gpu_index in range(torch.cuda.device_count()):
    allocated = torch.cuda.memory_allocated(gpu_index) / bytes_per_unit
    peak_allocated = torch.cuda.max_memory_allocated(gpu_index) / bytes_per_unit
    print(f"GPU {gpu_index} memory: {allocated:.2f} GB / {peak_allocated:.2f} GB")

[INST]  
What is shown in this image? [/INST] The image depicts an illustration of a classroom scene. A teacher is standing in front of a group of students, holding a stick or a rod, which is often used as a teaching tool in some educational settings. The students appear to be listening attentively to the teacher. The teacher's expression suggests they might be explaining something or giving instructions. The classroom setting is typical, with desks and chairs visible, and the students are dressed in casual attire. The image is likely meant
GPU 0 memory: 6.95 GB / 7.43 GB
GPU 1 memory: 7.16 GB / 7.61 GB
