In [1]:
# Install missing dependencies for Tarsier2-Recap-7b (Qwen2-VL backbone)
# %pip install -q qwen-vl-utils

In [2]:
import os
import torch
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

BASE_DIR = Path("/mnt/Work/ML/Code/EmoRecVid")
MANIFEST_PATH = BASE_DIR / "utterance_clips" / "manifest.csv"

print("Imports OK")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")


2026-02-28 06:46:01.928085: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-02-28 06:46:02.164913: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Imports OK
CUDA available: True
GPU: NVIDIA GeForce RTX 3060
VRAM: 12.5 GB


In [None]:
MODEL_ID = "omni-research/Tarsier2-Recap-7b"

print(f"Loading model: {MODEL_ID}")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    # local_files_only=True,
    # attn_implementation="flash_attention_2",   # remove if flash-attn not installed
)
model.eval()

processor = AutoProcessor.from_pretrained(MODEL_ID, local_files_only=True)

print("Model loaded successfully")
print(f"Model device map: {model.hf_device_map}")


`torch_dtype` is deprecated! Use `dtype` instead!


Loading model: omni-research/Tarsier2-Recap-7b


In [None]:

def analyze_clip(video_path: str, max_new_tokens: int = 512, fps: float = 2.0) -> str:
    """
    Run VidEmo-3B on a single utterance video clip and return a detailed
    behavioral analysis (facial expressions, body language, behavioral cues).
    """
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": video_path,
                    "fps": fps,
                    "max_pixels": 360 * 420,
                },
                {
                    "type": "text",
                    "text": (
                        "Watch this short video clip of a person speaking and provide a detailed analysis covering:\n"
                        "1. **Facial Expressions**: Describe the movements and cues observed (e.g., brow furrowing, lip tension, eye widening, smile, grimace).\n"
                        "2. **Body Language**: Describe posture, gestures, head movements, and any notable physical cues.\n"
                        "3. **Behavioral Cues**: Note speech rate changes, pauses, energy level, and any other observable behavioral signals.\n"
                        "4. **Overall Emotional State**: Summarise what emotion(s) are most likely being expressed and why.\n"
                        "Be specific and descriptive."
                    ),
                },
            ],
        }
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
        **video_kwargs,
    ).to(model.device)

    with torch.inference_mode():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )

    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0].strip()

    return response

print("analyze_clip() ready")


analyze_clip() ready


In [None]:

# Load manifest – preserve any already-completed analyses
manifest = pd.read_csv(MANIFEST_PATH)

total  = len(manifest)
done_before = manifest["vlm_analysis"].notna().sum()
print(f"Total clips : {total}")
print(f"Already done: {done_before}  |  Remaining: {total - done_before}")

# ── Batch inference helper ────────────────────────────────────────────
BATCH_SIZE = 8      # increase for throughput (reduce if OOM)
SAVE_EVERY = 20     # checkpoint to disk every N clips
FPS        = 1.0    # frames-per-second fed to the model (fewer frames = faster)


def analyze_batch(video_paths: list[str], max_new_tokens: int = 256, fps: float = FPS) -> list[str]:
    """Run the VLM on a batch of video clips and return one response per clip."""
    PROMPT_TEXT = (
        "Watch this short video clip of a person speaking and provide a detailed analysis covering:\n"
        "1. **Facial Expressions**: Describe the movements and cues observed (e.g., brow furrowing, lip tension, eye widening, smile, grimace).\n"
        "2. **Body Language**: Describe posture, gestures, head movements, and any notable physical cues.\n"
        "3. **Behavioral Cues**: Note speech rate changes, pauses, energy level, and any other observable behavioral signals.\n"
        "4. **Overall Emotional State**: Summarise what emotion(s) are most likely being expressed and why.\n"
        "Be specific and descriptive."
    )

    all_messages = [
        [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": vp, "fps": fps, "max_pixels": 360 * 420},
                    {"type": "text", "text": PROMPT_TEXT},
                ],
            }
        ]
        for vp in video_paths
    ]

    texts = [
        processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
        for msgs in all_messages
    ]

    image_inputs, video_inputs, video_kwargs = process_vision_info(
        all_messages, return_video_kwargs=True
    )

    inputs = processor(
        text=texts,
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
        **video_kwargs,
    ).to(model.device)

    with torch.inference_mode():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )

    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    responses = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return [r.strip() for r in responses]


# ── Main loop ─────────────────────────────────────────────────────────
# Only process rows that have not been analysed yet (NaN) or previously errored
pending = list(manifest[manifest["vlm_analysis"].isna()].index)
print(f"Queued for processing: {len(pending)} clips\n")

processed = 0

with tqdm(total=len(pending), desc="VLM analysis") as pbar:
    for batch_start in range(0, len(pending), BATCH_SIZE):
        batch_idx = pending[batch_start : batch_start + BATCH_SIZE]
        batch_rows = manifest.loc[batch_idx]

        video_paths = [str(BASE_DIR / row["path"]) for _, row in batch_rows.iterrows()]

        try:
            results = analyze_batch(video_paths)
            for idx, analysis in zip(batch_idx, results):
                manifest.at[idx, "vlm_analysis"] = analysis
        except Exception as e:
            for idx in batch_idx:
                manifest.at[idx, "vlm_analysis"] = f"ERROR: {e}"
            print(f"\nError on batch {batch_idx}: {e}")

        processed += len(batch_idx)
        pbar.update(len(batch_idx))

        # Periodic checkpoint
        if processed % SAVE_EVERY == 0:
            manifest.to_csv(MANIFEST_PATH, index=False)

# Final save
manifest.to_csv(MANIFEST_PATH, index=False)

done   = manifest["vlm_analysis"].notna().sum()
errors = manifest["vlm_analysis"].str.startswith("ERROR:", na=False).sum()
print(f"\nDone. {done}/{total} clips processed, {errors} errors.")
print(f"Results saved → {MANIFEST_PATH}")
manifest.head(3)


Total clips : 10039
Already done: 9937  |  Remaining: 102
Queued for processing: 102 clips



VLM analysis:   0%|          | 0/102 [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



Done. 10039/10039 clips processed, 0 errors.
Results saved → /mnt/Work/ML/Code/EmoRecVid/utterance_clips/manifest.csv


Unnamed: 0,utterance_id,emotion,speaker_gender,crop_side,start,end,path,vlm_analysis
0,Ses01F_impro01_F000,neu,F,left,6.2901,8.2357,utterance_clips/Session1/Ses01F_impro01/Ses01F...,The video features a young adult woman with da...
1,Ses01F_impro01_F001,neu,F,left,10.01,11.3925,utterance_clips/Session1/Ses01F_impro01/Ses01F...,The video features a young adult woman with da...
2,Ses01F_impro01_F002,neu,F,left,14.8872,18.0175,utterance_clips/Session1/Ses01F_impro01/Ses01F...,The video features a young adult female with s...


In [None]:

# ── Retry empty / errored clips ───────────────────────────────────────
manifest = pd.read_csv(MANIFEST_PATH)

is_error = manifest["vlm_analysis"].str.startswith("ERROR:", na=False)
is_empty = manifest["vlm_analysis"].isna()
retry_mask = is_error | is_empty

retry_idx = list(manifest[retry_mask].index)
print(f"Empty  : {is_empty.sum()}")
print(f"Errored: {is_error.sum()}")
print(f"Total to retry: {len(retry_idx)}")

processed = 0

with tqdm(total=len(retry_idx), desc="Retry VLM analysis") as pbar:
    for batch_start in range(0, len(retry_idx), BATCH_SIZE):
        batch_idx = retry_idx[batch_start : batch_start + BATCH_SIZE]
        batch_rows = manifest.loc[batch_idx]

        video_paths = [str(BASE_DIR / row["path"]) for _, row in batch_rows.iterrows()]

        try:
            results = analyze_batch(video_paths)
            for idx, analysis in zip(batch_idx, results):
                manifest.at[idx, "vlm_analysis"] = analysis
        except Exception as e:
            for idx in batch_idx:
                manifest.at[idx, "vlm_analysis"] = f"ERROR: {e}"
            print(f"\nError on batch {batch_idx}: {e}")

        processed += len(batch_idx)
        pbar.update(len(batch_idx))

        if processed % SAVE_EVERY == 0:
            manifest.to_csv(MANIFEST_PATH, index=False)

# Final save
manifest.to_csv(MANIFEST_PATH, index=False)

total   = len(manifest)
done    = manifest["vlm_analysis"].notna().sum()
errors  = manifest["vlm_analysis"].str.startswith("ERROR:", na=False).sum()
print(f"\nDone. {done}/{total} clips processed, {errors} errors remaining.")
print(f"Results saved → {MANIFEST_PATH}")
manifest[retry_mask].head(3)


Empty  : 0
Errored: 0
Total to retry: 0


Retry VLM analysis: 0it [00:00, ?it/s]


Done. 10039/10039 clips processed, 0 errors remaining.
Results saved → /mnt/Work/ML/Code/EmoRecVid/utterance_clips/manifest.csv


Unnamed: 0,utterance_id,emotion,speaker_gender,crop_side,start,end,path,vlm_analysis


In [None]:
# analyze_clip("utterance_clips/Session1/Ses01F_impro06/Ses01F_impro06_F006.avi", max_new_tokens=512, fps=1.0)

qwen-vl-utils using torchvision to read video.


"The video features a young adult woman with shoulder-length dark brown hair styled in a low ponytail, sitting against a plain off-white wall. She wears a black headband adorned with a small white flower and a dark bracelet on her left wrist. Her face is round, with a rounded chin and narrow lips. Initially, she appears pensive or neutral, with her gaze directed downwards towards her lap. Her mouth corners are slightly downturned, and her eyebrows remain neutral. She fidgets with her hands, suggesting a degree of restlessness or preoccupation. As she begins to speak, her mouth moves, and her teeth become briefly visible. Her gaze shifts upwards momentarily before returning to its downward position. Her expression remains largely unchanged, though there's a subtle shift towards a more neutral demeanor as she continues talking. Her head remains relatively still throughout, with only slight tilts and nods accompanying her speech. The lighting is dim, casting subtle shadows on her face, pa