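# Qwen-VL-Chat Test (9.6B params)

Chat-optimized vision-language model (VLM) with bilingual support and good performance.

**Model:** `Qwen/Qwen-VL-Chat`  
**Size:** 9.6B parameters  
**License:** Tongyi Qianwen License Agreement (custom terms, not a standard permissive license)  
**Features:** Chat-optimized, bilingual (English/Chinese), zero-shot object detection  
**Requirements:** ~20GB disk; ~20GB RAM/VRAM in half precision (about twice that in float32, which the CPU fallback below uses)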


In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
from vlm_utils import get_device_info, load_test_images, display_image, print_section, print_subsection

# Ask the project helper which compute device to use. Per the recorded output
# below it also prints the chosen device and the PyTorch version; later cells
# read `device.type` to decide where to load the model.
device = get_device_info()


Using device: mps
PyTorch version: 2.9.1
Using Apple Silicon MPS (Metal Performance Shaders)


## Load Test Images


In [2]:
# Collect the images to run the model on. Later cells index and iterate this
# collection and read `.name` on each entry, so entries are presumably
# pathlib.Path objects — confirm against vlm_utils.load_test_images.
image_files = load_test_images()


Found 1 image(s) to test:
  - sample_image.jpg


## Load Qwen-VL-Chat Model


In [3]:
print("Loading Qwen-VL-Chat...")
model_id = "Qwen/Qwen-VL-Chat"
# Pin one exact repository commit so the tokenizer and the model (both loaded
# with trust_remote_code=True) always come from the same snapshot.
model_revision = "462bad4a9c44c1e216c6b29f9469e0563b4f413d"

# Determine target device and dtype.
# Note: Qwen-VL-Chat works best on CPU or CUDA, MPS has some issues
if device.type == 'mps':
    print("⚠️  Using CPU for better stability (Qwen-VL has some MPS quirks)")
    actual_device = torch.device('cpu')
    model_dtype = torch.float32
else:
    actual_device = device
    # Half precision only when CUDA is available; otherwise stay in float32.
    use_float16 = torch.cuda.is_available()
    model_dtype = torch.float16 if use_float16 else torch.float32

# NOTE(review): the recorded run of this cell stopped in one of the two
# from_pretrained calls below with "ValueError: Unrecognized model in
# Qwen/Qwen-VL-Chat. Should have a `model_type` key in its config.json ...".
# Confirm that the pinned revision's config.json carries `model_type` /
# `auto_map` and that the installed transformers version can load it.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
    revision=model_revision,
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    device_map=actual_device,
    dtype=model_dtype,
    revision=model_revision,
).eval()  # switch to inference mode before it is used for generation
print("✓ Qwen-VL-Chat loaded!")


Loading Qwen-VL-Chat...
⚠️  Using CPU for better stability (Qwen-VL has some MPS quirks)


ValueError: Unrecognized model in Qwen/Qwen-VL-Chat. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: aimv2, aimv2_vision_model, albert, align, altclip, apertus, arcee, aria, aria_text, audio-spectrogram-transformer, autoformer, aya_vision, bamba, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, bitnet, blenderbot, blenderbot-small, blip, blip-2, blip_2_qformer, bloom, blt, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, cohere2, cohere2_vision, colpali, colqwen2, conditional_detr, convbert, convnext, convnextv2, cpmant, csm, ctrl, cvt, d_fine, dab-detr, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deepseek_v2, deepseek_v3, deepseek_vl, deepseek_vl_hybrid, deformable_detr, deit, depth_anything, depth_pro, deta, detr, dia, diffllama, dinat, dinov2, dinov2_with_registers, dinov3_convnext, dinov3_vit, distilbert, doge, donut-swin, dots1, dpr, dpt, edgetam, edgetam_video, edgetam_vision_model, efficientformer, efficientloftr, efficientnet, electra, emu3, encodec, encoder-decoder, eomt, ernie, ernie4_5, ernie4_5_moe, ernie_m, esm, evolla, exaone4, falcon, falcon_h1, falcon_mamba, fastspeech2_conformer, fastspeech2_conformer_with_hifigan, flaubert, flava, flex_olmo, florence2, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, gemma3, gemma3_text, gemma3n, gemma3n_audio, gemma3n_text, gemma3n_vision, git, glm, glm4, glm4_moe, glm4v, glm4v_moe, glm4v_moe_text, glm4v_text, glpn, got_ocr2, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gpt_oss, gptj, gptsan-japanese, granite, granite_speech, granitemoe, granitemoehybrid, granitemoeshared, granitevision, graphormer, grounding-dino, groupvit, helium, hgnet_v2, hiera, hubert, hunyuan_v1_dense, hunyuan_v1_moe, ibert, idefics, idefics2, 
idefics3, idefics3_vision, ijepa, imagegpt, informer, instructblip, instructblipvideo, internvl, internvl_vision, jamba, janus, jetmoe, jukebox, kosmos-2, kosmos-2.5, kyutai_speech_to_text, layoutlm, layoutlmv2, layoutlmv3, led, levit, lfm2, lfm2_vl, lightglue, lilt, llama, llama4, llama4_text, llava, llava_next, llava_next_video, llava_onevision, longcat_flash, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, metaclip_2, mgp-str, mimi, minimax, ministral, mistral, mistral3, mixtral, mlcd, mllama, mm-grounding-dino, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, modernbert, modernbert-decoder, moonshine, moshi, mpnet, mpt, mra, mt5, musicgen, musicgen_melody, mvp, nat, nemotron, nezha, nllb-moe, nougat, nystromformer, olmo, olmo2, olmo3, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, ovis2, owlv2, owlvit, paligemma, parakeet_ctc, parakeet_encoder, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, perception_encoder, perception_lm, persimmon, phi, phi3, phi4_multimodal, phimoe, pix2struct, pixtral, plbart, poolformer, pop2piano, prompt_depth_anything, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_5_omni, qwen2_5_vl, qwen2_5_vl_text, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, qwen2_vl_text, qwen3, qwen3_moe, qwen3_next, qwen3_omni_moe, qwen3_vl, qwen3_vl_moe, qwen3_vl_moe_text, qwen3_vl_text, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rt_detr_v2, rwkv, sam, sam2, sam2_hiera_det_model, sam2_video, sam2_vision_model, sam_hq, sam_hq_vision_model, sam_vision_model, seamless_m4t, seamless_m4t_v2, seed_oss, segformer, seggpt, sew, sew-d, shieldgemma2, siglip, siglip2, siglip2_vision_model, siglip_vision_model, smollm3, smolvlm, smolvlm_vision, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, 
splinter, squeezebert, stablelm, starcoder2, superglue, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, t5gemma, table-transformer, tapas, textnet, time_series_transformer, timesfm, timesformer, timm_backbone, timm_wrapper, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, vaultgemma, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vitpose, vitpose_backbone, vits, vivit, vjepa2, voxtral, voxtral_encoder, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xcodec, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xlstm, xmod, yolos, yoso, zamba, zamba2, zoedepth

## Define Inference Function


In [None]:
def describe_image(image_path, prompt="Describe this image in detail."):
    """Ask Qwen-VL-Chat a question about one image and return its reply.

    Args:
        image_path: Location of the image; it is converted with ``str()``
            before being handed to the tokenizer.
        prompt: Text instruction sent together with the image.

    Returns:
        The first element of the pair returned by ``model.chat`` (the
        model's response).
    """
    # Qwen-VL-Chat expects the image reference and the text as a list of
    # segments, which the tokenizer turns into its <img>path</img> format.
    segments = [
        {'image': str(image_path)},
        {'text': prompt},
    ]
    query = tokenizer.from_list_format(segments)

    # Each call starts a fresh conversation; the returned history is not kept.
    reply, _ = model.chat(tokenizer, query=query, history=None)
    return reply


## Test on All Images


In [None]:
# Run the default description prompt over every test image.
for image_path in image_files:
    print_section(f"Image: {image_path.name}")

    display_image(image_path)

    print_subsection("🔍 Qwen-VL-Chat Description:")
    try:
        # Only the model call is guarded, so unrelated errors are not masked.
        desc = describe_image(image_path)
    except Exception as e:
        # Best-effort demo loop: report the failure and move on to the next
        # image instead of aborting the whole cell.
        print(f"Error: {e}")
    else:
        print(desc)


## Custom Prompts

Try asking specific questions about an image.


In [None]:
# Ask a few targeted questions about the first test image.
# Nothing runs when no images were found.
if image_files:
    custom_prompts = [
        "What objects can you see in this image?",
        "What colors are prominent in this image?",
        "What is the main subject of this image?",
    ]
    test_image = image_files[0]

    # Show the image once, then print each question followed by the answer.
    print_section(f"Custom Prompts - {test_image.name}")
    display_image(test_image)

    for prompt in custom_prompts:
        print_subsection(f"Q: {prompt}")
        answer = describe_image(test_image, prompt)
        print(f"A: {answer}")
