In [1]:
# Install missing dependencies for VidEmo-3B (Qwen2.5-VL backbone)
# %pip install -q qwen-vl-utils

In [3]:
import torch
from pathlib import Path
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Project root directory (absolute path on the authoring machine).
BASE_DIR = Path("/mnt/Work/ML/Code/EmoRecVid")

# Report whether CUDA is usable and, if so, which GPU is device 0 and how
# much memory it has.
_cuda_ok = torch.cuda.is_available()

print("Imports OK")
print(f"CUDA available: {_cuda_ok}")
if _cuda_ok:
    _gpu_name = torch.cuda.get_device_name(0)
    _vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9  # bytes -> GB
    print(f"GPU: {_gpu_name}")
    print(f"VRAM: {_vram_gb:.1f} GB")


Imports OK
CUDA available: True
GPU: NVIDIA GeForce RTX 3060
VRAM: 12.5 GB


In [4]:
# Checkpoint whose weights are loaded.
MODEL_ID = "KlingTeam/VidEmo-3B"
# Base model that ships the processor config; the processor is loaded from
# here rather than from MODEL_ID.
PROCESSOR_ID = "Qwen/Qwen2.5-VL-3B-Instruct"

print(f"Loading model: {MODEL_ID}")

# bfloat16 weights, with module placement left to device_map="auto".
# attn_implementation="flash_attention_2" can be added to the call below
# when flash-attn is installed.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.bfloat16,
).eval()  # switch to evaluation mode; the notebook only runs inference

processor = AutoProcessor.from_pretrained(PROCESSOR_ID)

print("Model loaded successfully")
print(f"Model device map: {model.hf_device_map}")


Loading model: KlingTeam/VidEmo-3B


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Model loaded successfully
Model device map: {'model.visual': 0, 'model.language_model.embed_tokens': 0, 'lm_head': 0, 'model.language_model.layers.0': 0, 'model.language_model.layers.1': 0, 'model.language_model.layers.2': 0, 'model.language_model.layers.3': 0, 'model.language_model.layers.4': 0, 'model.language_model.layers.5': 0, 'model.language_model.layers.6': 0, 'model.language_model.layers.7': 0, 'model.language_model.layers.8': 0, 'model.language_model.layers.9': 0, 'model.language_model.layers.10': 0, 'model.language_model.layers.11': 'cpu', 'model.language_model.layers.12': 'cpu', 'model.language_model.layers.13': 'cpu', 'model.language_model.layers.14': 'cpu', 'model.language_model.layers.15': 'cpu', 'model.language_model.layers.16': 'cpu', 'model.language_model.layers.17': 'cpu', 'model.language_model.layers.18': 'cpu', 'model.language_model.layers.19': 'cpu', 'model.language_model.layers.20': 'cpu', 'model.language_model.layers.21': 'cpu', 'model.language_model.layers.22': 

In [5]:

def analyze_clip(
    video_path: str,
    max_new_tokens: int = 512,
    fps: float = 2.0,
    max_pixels: int = 360 * 420,
) -> str:
    """
    Run VidEmo-3B on a single utterance video clip and return a detailed
    behavioral analysis (facial expressions, body language, behavioral cues).

    Relies on the notebook-level globals ``processor``, ``model`` and
    ``process_vision_info``.

    Args:
        video_path: Location of the clip. A ``str`` or a path-like object
            (e.g. ``pathlib.Path``); it is converted with ``str()`` before
            being placed in the chat message.
        max_new_tokens: Forwarded to ``model.generate`` as ``max_new_tokens``.
        fps: Value of the ``fps`` key in the video message entry.
        max_pixels: Value of the ``max_pixels`` key in the video message
            entry. Defaults to ``360 * 420``, the value that was previously
            hard-coded.

    Returns:
        The decoded text of the first generated sequence, with the prompt
        tokens sliced off and surrounding whitespace stripped.
    """
    prompt = (
        "Watch this short video clip of a person speaking and provide a detailed analysis covering:\n"
        "1. **Facial Expressions**: Describe the movements and cues observed (e.g., brow furrowing, lip tension, eye widening, smile, grimace).\n"
        "2. **Body Language**: Describe posture, gestures, head movements, and any notable physical cues.\n"
        "3. **Behavioral Cues**: Note speech rate changes, pauses, energy level, and any other observable behavioral signals.\n"
        "4. **Overall Emotional State**: Summarise what emotion(s) are most likely being expressed and why.\n"
        "Be specific and descriptive."
    )

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    # str() so that pathlib.Path arguments are accepted too.
                    "video": str(video_path),
                    "fps": fps,
                    "max_pixels": max_pixels,
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]

    # Render the chat prompt as text and load the visual inputs separately.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
        **video_kwargs,
    ).to(model.device)

    # Sampling is disabled, so no gradients or random draws are needed here.
    with torch.inference_mode():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )

    # Drop the prompt-length prefix from every returned sequence so that only
    # the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    decoded = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    return decoded[0].strip()


# ── Run the analysis on one example clip ──────────────────────────────
# The clip path is built from BASE_DIR and passed on as a plain string.
SAMPLE_VIDEO = str(
    BASE_DIR.joinpath("utterance_clips/Session1/Ses01F_impro01/Ses01F_impro01_F000.avi")
)

print(f"Video: {SAMPLE_VIDEO}\n")
print("=" * 60)  # separator between the header and the model's answer

analysis = analyze_clip(SAMPLE_VIDEO)
print(analysis)


Video: /mnt/Work/ML/Code/EmoRecVid/utterance_clips/Session1/Ses01F_impro01/Ses01F_impro01_F000.avi



qwen-vl-utils using torchvision to read video.


The video features a young adult woman with dark hair styled in a ponytail, sitting against a plain background. She is wearing a dark sleeveless top over a reddish-brown garment, complemented by a dark vest or wrap. A watch is visible on her left wrist, and she wears a small, dark hair accessory. Her expression is neutral, perhaps slightly pensive, as she gazes off-screen to her right. Her mouth remains mostly closed, with subtle movements suggesting she is speaking softly or listening intently. Her head is tilted slightly to the right, maintaining a consistent posture throughout the short clip. The lighting is dim, casting shadows that obscure some details of her face, making it difficult to discern finer facial features like eye color or skin tone. The overall impression is one of quiet contemplation or focused attention on something outside the frame.
