In [1]:
# Install missing dependencies for Tarsier2-Recap-7b (Qwen2-VL backbone)
# %pip install -q qwen-vl-utils

# Install missing dependencies for FaceScanPaliGemma_Emotion
# %pip install -q "transformers>=4.42.0" Pillow opencv-python   # quote the spec: a bare >= is parsed as a shell redirect


In [2]:
import os
import torch
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# All project paths are resolved against the directory the kernel was started in.
BASE_DIR = Path.cwd()
MANIFEST_PATH = BASE_DIR / "utterance_clips" / "manifest.csv"

print("Imports OK")

# Report the accelerator once so the run log records which device produced the results.
_cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {_cuda_ok}")
if _cuda_ok:
    _gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {_gpu_props.total_memory / 1e9:.1f} GB")


2026-03-01 02:08:35.258835: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-03-01 02:08:35.474770: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Imports OK
CUDA available: True
GPU: NVIDIA GeForce RTX 3060
VRAM: 12.5 GB


In [3]:

MODEL_ID = "KlingTeam/VidEmo-3B"
PROCESSOR_ID = "Qwen/Qwen2.5-VL-3B-Instruct"   # base model that ships the processor config

print(f"Loading model: {MODEL_ID}")

# Keyword arguments for from_pretrained, collected in one place so they are easy to tweak.
# NOTE(review): this environment logs "`torch_dtype` is deprecated! Use `dtype` instead!";
# switch the key once older transformers releases no longer need to be supported.
_load_options = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
    # "attn_implementation": "flash_attention_2",   # enable only if flash-attn is installed
}
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(MODEL_ID, **_load_options)
model.eval()

# The processor is taken from PROCESSOR_ID rather than from the fine-tuned checkpoint.
processor = AutoProcessor.from_pretrained(PROCESSOR_ID)
# Decoder-only models require left-padding for correct batched generation
processor.tokenizer.padding_side = "left"

print("Model loaded successfully")


`torch_dtype` is deprecated! Use `dtype` instead!


Loading model: KlingTeam/VidEmo-3B


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


Model loaded successfully


In [4]:

# Instruction sent with every clip: one entry per prompt line, joined with newlines
# (no trailing newline after the last line).
PROMPT_TEXT = "\n".join(
    [
        "Watch this short video clip of a person speaking and describe how their emotional state evolves over time.",
        "Structure your response as a temporal progression — divide the clip into beginning, middle, and end (or more segments if the emotion changes more than once).",
        "For each segment describe:",
        "- **Facial Expressions**: Specific muscle movements (brow, lips, eyes, jaw, cheeks).",
        "- **Head & Gaze**: Tilts, nods, shakes, eye direction.",
        "- **Body Language**: Posture shifts, gestures, tension or relaxation.",
        "- **Emotion at this moment**: The most likely emotion and the visual cues supporting it.",
        "Finish with a one-sentence summary of the overall emotional arc (e.g., starts neutral → builds frustration → brief smile at end).",
        "Ground every observation in a specific visible signal.",
    ]
)


def analyze_clip(video_path: str, max_new_tokens: int = 512, fps: float = 2.0) -> str:
    """
    Analyse one utterance clip and return the model's description of how the
    speaker's emotional state changes over the clip.

    Args:
        video_path: Path of the video file placed in the chat message.
        max_new_tokens: Upper bound on generated tokens.
        fps: Frame-sampling rate written into the video message.

    Returns:
        The decoded continuation (prompt tokens removed), stripped of
        surrounding whitespace.
    """
    video_part = {
        "type": "video",
        "video": video_path,
        "fps": fps,
        "max_pixels": 360 * 420,
    }
    prompt_part = {"type": "text", "text": PROMPT_TEXT}
    conversation = [{"role": "user", "content": [video_part, prompt_part]}]

    chat_text = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs, video_kwargs = process_vision_info(
        conversation, return_video_kwargs=True
    )

    model_inputs = processor(
        text=[chat_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
        **video_kwargs,
    )
    model_inputs = model_inputs.to(model.device)

    # Greedy decoding (do_sample=False) with autograd disabled.
    with torch.inference_mode():
        output_ids = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )

    # The generated sequence starts with the prompt tokens; keep only what follows them.
    continuations = []
    for prompt_ids, full_ids in zip(model_inputs.input_ids, output_ids):
        continuations.append(full_ids[len(prompt_ids):])

    decoded = processor.batch_decode(
        continuations, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return decoded[0].strip()

print("analyze_clip() ready")


analyze_clip() ready


In [5]:

# Load manifest – preserve any already-completed analyses
manifest = pd.read_csv(MANIFEST_PATH)

# Flip to True only to deliberately throw away every stored analysis and start over.
RESET_VLM_ANALYSIS = False

# Create the column if it doesn't exist. The check must come before any read of the
# column, otherwise a fresh manifest raises KeyError.
if RESET_VLM_ANALYSIS or "vlm_analysis" not in manifest.columns:
    manifest["vlm_analysis"] = None

# read_csv types an entirely empty column as float64; object dtype is needed so that
# strings can be stored in it and the `.str` accessor works in later cells.
manifest["vlm_analysis"] = manifest["vlm_analysis"].astype("object")

# Persist so the column is present on disk for later reloads.
manifest.to_csv(MANIFEST_PATH, index=False)

In [6]:


# ── Config ────────────────────────────────────────────────────────────
BATCH_SIZE = 8      # clips per generate() call; raise for throughput, lower on OOM
SAVE_EVERY = 20     # write the manifest back to disk roughly every N clips
FPS        = 1.0    # frame-sampling rate; higher values give finer temporal resolution

# How much of the manifest already carries a stored analysis.
total = len(manifest)
done_before = manifest["vlm_analysis"].count()   # count() == number of non-missing entries
print(f"Total clips : {total}")
print(f"Already done: {done_before}  |  Remaining: {total - done_before}")


def analyze_batch(video_paths: list[str], max_new_tokens: int = 512, fps: float = FPS) -> list[str]:
    """Run the VLM on a batch of video clips and return a temporal arc analysis per clip.

    Args:
        video_paths: Paths of the clips to analyse; one single-turn conversation
            is built per path, all sharing the module-level ``PROMPT_TEXT``.
        max_new_tokens: Upper bound on generated tokens per clip.
        fps: Frame-sampling rate written into each video message.

    Returns:
        The decoded, whitespace-stripped responses in the same order as
        ``video_paths``. An empty input returns ``[]`` without calling the model.
    """
    video_paths = list(video_paths)
    if not video_paths:
        return []

    # Batched prompts differ in length, so the processor pads them. generate() reported
    # "right-padding was detected" in earlier runs of this notebook, so the padding side
    # is enforced here rather than relying on whatever state the kernel is in.
    processor.tokenizer.padding_side = "left"

    all_messages = [
        [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": vp, "fps": fps, "max_pixels": 360 * 420},
                    {"type": "text", "text": PROMPT_TEXT},
                ],
            }
        ]
        for vp in video_paths
    ]

    texts = [
        processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
        for msgs in all_messages
    ]

    image_inputs, video_inputs, video_kwargs = process_vision_info(
        all_messages, return_video_kwargs=True
    )

    # A list-valued "fps" entry is dropped before the processor call.
    # NOTE(review): presumably the processor in this environment rejects a per-video
    # list; confirm, because without it the processor falls back to its own default.
    if isinstance(video_kwargs.get("fps"), list):
        del video_kwargs["fps"]

    inputs = processor(
        text=texts,
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
        **video_kwargs,
    ).to(model.device)

    # Greedy decoding (do_sample=False) with autograd disabled.
    with torch.inference_mode():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )

    # Each output row starts with its (padded) prompt; keep only the tokens after it.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    responses = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return [r.strip() for r in responses]


# ── Main loop ─────────────────────────────────────────────────────────
# Only rows without a stored analysis are queued, so re-running this cell resumes the job.
pending = list(manifest.index[manifest["vlm_analysis"].isna()])
print(f"Queued for processing: {len(pending)} clips\n")

processed = 0

try:
    with tqdm(total=len(pending), desc="VLM analysis") as pbar:
        for batch_start in range(0, len(pending), BATCH_SIZE):
            batch_idx = pending[batch_start : batch_start + BATCH_SIZE]

            # Manifest paths are relative to BASE_DIR.
            video_paths = [
                str(BASE_DIR / rel_path) for rel_path in manifest.loc[batch_idx, "path"]
            ]

            try:
                results = analyze_batch(video_paths)
                for idx, analysis in zip(batch_idx, results):
                    manifest.at[idx, "vlm_analysis"] = analysis
            except Exception as e:
                # A failure marks the whole batch; the "ERROR:" prefix lets later
                # cells find and retry these rows.
                for idx in batch_idx:
                    manifest.at[idx, "vlm_analysis"] = f"ERROR: {e}"
                print(f"\nError on batch {batch_idx}: {e}")

            prev_processed = processed
            processed += len(batch_idx)
            pbar.update(len(batch_idx))

            # Checkpoint whenever the running count crosses a multiple of SAVE_EVERY.
            if processed // SAVE_EVERY > prev_processed // SAVE_EVERY:
                manifest.to_csv(MANIFEST_PATH, index=False)
finally:
    # Final save – also runs when the cell is interrupted, so results produced
    # since the last checkpoint are not lost.
    manifest.to_csv(MANIFEST_PATH, index=False)

done   = manifest["vlm_analysis"].notna().sum()
errors = manifest["vlm_analysis"].str.startswith("ERROR:", na=False).sum()
print(f"\nDone. {done}/{total} clips processed, {errors} errors.")
print(f"Results saved → {MANIFEST_PATH}")
manifest.head(3)



Total clips : 10039
Already done: 0  |  Remaining: 10039
Queued for processing: 10039 clips



VLM analysis:   0%|          | 0/10039 [00:00<?, ?it/s]

qwen-vl-utils using torchvision to read video.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


KeyboardInterrupt: 

In [7]:

# ── Retry empty / errored / bad-output clips (loops until all are clean) ──
manifest = pd.read_csv(MANIFEST_PATH)

# read_csv types an entirely empty column as float64, on which the `.str` accessor
# raises and into which strings cannot be stored cleanly. Make sure the column exists
# and is object-typed before it is inspected or written.
if "vlm_analysis" not in manifest.columns:
    manifest["vlm_analysis"] = None
manifest["vlm_analysis"] = manifest["vlm_analysis"].astype("object")

def _is_bad_output(val) -> bool:
    """True if a non-error value looks like a corrupted/truncated generation."""
    if pd.isna(val) or str(val).startswith("ERROR:"):
        return False
    s = str(val).strip()
    words = s.split()
    if len(words) <= 10:
        return True
    if words[0].lower() == "the" and words[-1].lower() == "the":
        return True
    return False

def get_retry_mask(df):
    """Return a boolean Series marking the rows of ``df`` that need (re)processing.

    A row is selected when its ``vlm_analysis`` value is missing, starts with
    "ERROR:", or is flagged by ``_is_bad_output``.
    """
    # An all-empty column loaded from CSV is float64, and `.str` raises on it;
    # an object-typed view keeps the string checks valid without modifying `df`.
    analyses = df["vlm_analysis"].astype("object")

    is_error = analyses.str.startswith("ERROR:", na=False)
    is_empty = analyses.isna()
    # astype(bool) keeps the mask boolean even when the column has no rows.
    is_bad = analyses.map(_is_bad_output).astype(bool)
    return is_error | is_empty | is_bad

# Upper bound on full retry passes. Without it, a clip that fails on every attempt
# (for example an unreadable file) would keep this cell looping forever.
MAX_RETRY_PASSES = 5

# `.str` and string assignment below need an object-typed column (an all-empty CSV
# column is loaded as float64).
manifest["vlm_analysis"] = manifest["vlm_analysis"].astype("object")

pass_num = 0

try:
    while True:
        retry_mask = get_retry_mask(manifest)
        retry_idx  = list(manifest[retry_mask].index)

        # Per-category counts, reported for information only.
        is_error = manifest["vlm_analysis"].str.startswith("ERROR:", na=False)
        is_empty = manifest["vlm_analysis"].isna()
        is_bad   = manifest["vlm_analysis"].map(_is_bad_output)

        print(f"\n── Pass {pass_num} ── Empty: {is_empty.sum()}  |  Errored: {is_error.sum()}  |  Bad: {is_bad.sum()}  |  Total to fix: {len(retry_idx)}")

        if len(retry_idx) == 0:
            print("All clips are clean — done!")
            break

        if pass_num >= MAX_RETRY_PASSES:
            print(f"Stopping after {pass_num} retry pass(es): {len(retry_idx)} clips still need attention.")
            break

        pass_num += 1
        processed = 0

        with tqdm(total=len(retry_idx), desc=f"Retry pass {pass_num}") as pbar:
            for batch_start in range(0, len(retry_idx), BATCH_SIZE):
                batch_idx = retry_idx[batch_start : batch_start + BATCH_SIZE]

                # Manifest paths are relative to BASE_DIR.
                video_paths = [
                    str(BASE_DIR / rel_path) for rel_path in manifest.loc[batch_idx, "path"]
                ]

                try:
                    results = analyze_batch(video_paths)
                    for idx, analysis in zip(batch_idx, results):
                        manifest.at[idx, "vlm_analysis"] = analysis
                except Exception as e:
                    # Mark the whole batch so the next pass picks it up again.
                    for idx in batch_idx:
                        manifest.at[idx, "vlm_analysis"] = f"ERROR: {e}"
                    print(f"\nError on batch {list(batch_idx)}: {e}")

                prev_processed = processed
                processed += len(batch_idx)
                pbar.update(len(batch_idx))

                # Checkpoint whenever the running count crosses a multiple of SAVE_EVERY.
                if processed // SAVE_EVERY > prev_processed // SAVE_EVERY:
                    manifest.to_csv(MANIFEST_PATH, index=False)

        # End-of-pass checkpoint.
        manifest.to_csv(MANIFEST_PATH, index=False)
finally:
    # Also runs when the cell is interrupted, so results since the last
    # checkpoint are kept.
    manifest.to_csv(MANIFEST_PATH, index=False)

# Final summary
total  = len(manifest)
done   = manifest["vlm_analysis"].notna().sum()
errors = manifest["vlm_analysis"].str.startswith("ERROR:", na=False).sum()
print(f"\nFinal: {done}/{total} clips processed, {errors} errors, {pass_num} retry pass(es).")
print(f"Results saved → {MANIFEST_PATH}")
manifest.head(3)



── Pass 0 ── Empty: 10015  |  Errored: 0  |  Bad: 0  |  Total to fix: 10015


Retry pass 1:   0%|          | 0/10015 [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


KeyboardInterrupt: 

In [None]:
# analyze_clip("utterance_clips/Session1/Ses01F_impro06/Ses01F_impro06_F006.avi", max_new_tokens=512, fps=1.0)

In [None]:

# # ── FaceScanPaliGemma_Emotion ─────────────────────────────────────────
# # Fine-tuned PaliGemma-3B on AffectNet; classifies a face image into one of:
# #   neutral | happy | sad | surprise | fear | disgust | anger | contempt
# # Model card: https://huggingface.co/NYUAD-ComNets/FaceScanPaliGemma_Emotion

# import os
# # Disable the xet/CAS backend so HF falls back to standard HTTP downloads
# os.environ["HF_HUB_DISABLE_XET_BACKEND"] = "1"

# import torch
# import pandas as pd
# import cv2
# import numpy as np
# from pathlib import Path
# from PIL import Image
# from tqdm.auto import tqdm
# from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration
# from collections import Counter

# BASE_DIR = Path("/mnt/Work/ML/Code/EmoRecVid")
# MANIFEST_PATH = BASE_DIR / "utterance_clips" / "manifest.csv"

# PG_MODEL_ID = "NYUAD-ComNets/FaceScanPaliGemma_Emotion"
# PG_PROC_ID  = "google/paligemma-3b-pt-224"
# PG_PROMPT   = "what is the emotion of the person in the image?"

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Device: {device}")

# print(f"Loading processor from {PG_PROC_ID} …")
# pg_processor = PaliGemmaProcessor.from_pretrained(PG_PROC_ID)

# print(f"Loading model from {PG_MODEL_ID} …")
# pg_model = PaliGemmaForConditionalGeneration.from_pretrained(
#     PG_MODEL_ID,
#     torch_dtype=torch.bfloat16,
# )
# pg_model.to(device)
# pg_model.eval()
# print("PaliGemma emotion model loaded.")


In [None]:

# VALID_EMOTIONS = {"neutral", "happy", "sad", "surprise", "fear", "disgust", "anger", "contempt"}

# def extract_frames(video_path: str, n_frames: int = 5) -> list[Image.Image]:
#     """Extract `n_frames` evenly-spaced frames from a video file as PIL images."""
#     cap = cv2.VideoCapture(video_path)
#     total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
#     if total == 0:
#         cap.release()
#         return []
#     indices = np.linspace(0, total - 1, min(n_frames, total), dtype=int)
#     frames = []
#     for idx in indices:
#         cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
#         ret, frame = cap.read()
#         if ret:
#             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
#             frames.append(Image.fromarray(frame_rgb))
#     cap.release()
#     return frames


# # PaliGemma requires the <image> token at the start of the prompt
# _PG_PROMPT_WITH_TOKEN = "<image> " + PG_PROMPT

# def predict_emotion_video(video_path: str, n_frames: int = 5) -> str:
#     """
#     Extract frames, run PaliGemma on each frame, and return the per-frame
#     emotion labels as a space-separated string, e.g.:
#         "neutral happy neutral sad neutral"
#     Labels that fall outside VALID_EMOTIONS are included as-is so no
#     information is lost.
#     Returns "ERROR: no frames" if the video has no readable frames.
#     """
#     frames = extract_frames(video_path, n_frames=n_frames)
#     if not frames:
#         return "ERROR: no frames"

#     per_frame_labels = []

#     for frame in frames:
#         inputs = pg_processor(
#             text=_PG_PROMPT_WITH_TOKEN,
#             images=frame,
#             padding="longest",
#             do_convert_rgb=True,
#             return_tensors="pt",
#         ).to(device)
#         inputs = inputs.to(dtype=pg_model.dtype)

#         with torch.no_grad():
#             output = pg_model.generate(**inputs, max_length=500)

#         # Trim input tokens, decode only newly generated tokens
#         new_tokens = output[0][inputs.input_ids.shape[1]:]
#         decoded = pg_processor.decode(new_tokens, skip_special_tokens=True).strip().lower()

#         # First word as the label candidate
#         label = decoded.split()[0] if decoded.split() else "unknown"
#         per_frame_labels.append(label)

#     return " ".join(per_frame_labels)


# print("predict_emotion_video() ready  (returns space-separated per-frame labels)")
# print(f"Prompt: {_PG_PROMPT_WITH_TOKEN!r}")
# print(f"Extracts 5 frames per clip – one label per frame, all stored.")


In [None]:

# # ── Quick sanity-check: sample a random clip and visualise predictions ─
# import matplotlib.pyplot as plt
# from IPython.display import display

# manifest_preview = pd.read_csv(MANIFEST_PATH)

# # Pick a random row
# sample_row = manifest_preview.sample(1).iloc[0]
# sample_path = str(BASE_DIR / sample_row["path"])

# print(f"Utterance : {sample_row['utterance_id']}")
# print(f"GT emotion: {sample_row['emotion']}")
# print(f"Video path: {sample_path}")
# print()

# # Run through predict_emotion_video – returns "label1 label2 label3 ..."
# result_str = predict_emotion_video(sample_path, n_frames=5)
# per_frame_labels = result_str.split()
# valid = [l for l in per_frame_labels if l in VALID_EMOTIONS]

# print(f"Per-frame labels : {per_frame_labels}")
# print(f"Valid labels     : {valid}")
# print(f"Dominant         : {Counter(valid).most_common(1)[0][0] if valid else 'none'}")

# # Plot frames with their labels
# frames = extract_frames(sample_path, n_frames=5)
# n = len(frames)
# fig, axes = plt.subplots(1, n, figsize=(4 * n, 4))
# if n == 1:
#     axes = [axes]

# for ax, frame, lbl in zip(axes, frames, per_frame_labels):
#     ax.imshow(frame)
#     color = "green" if lbl in VALID_EMOTIONS else "red"
#     ax.set_title(lbl, fontsize=13, color=color, fontweight="bold")
#     ax.axis("off")

# dominant = Counter(valid).most_common(1)[0][0] if valid else "none"
# fig.suptitle(
#     f"GT: {sample_row['emotion']}   |   Labels: {' '.join(per_frame_labels)}   |   Dominant: {dominant}",
#     fontsize=12, fontweight="bold", y=1.02
# )
# plt.tight_layout()
# plt.show()


In [None]:

# # ── Main inference loop ───────────────────────────────────────────────
# manifest = pd.read_csv(MANIFEST_PATH)

# # Ensure the column exists
# if "paligemma_emotion" not in manifest.columns:
#     manifest["paligemma_emotion"] = pd.NA

# # Reset all existing predictions so everything gets reprocessed
# manifest["paligemma_emotion"] = pd.NA

# total   = len(manifest)
# pending = list(manifest.index)

# print(f"Total clips to process: {total}")
# print("Output format: space-separated per-frame labels, e.g. 'neutral happy neutral sad neutral'")

# PG_N_FRAMES   = 5     # frames sampled per clip
# PG_SAVE_EVERY = 50    # checkpoint frequency

# processed = 0

# with tqdm(total=len(pending), desc="PaliGemma emotion") as pbar:
#     for idx in pending:
#         row = manifest.loc[idx]
#         video_path = str(BASE_DIR / row["path"])

#         try:
#             result = predict_emotion_video(video_path, n_frames=PG_N_FRAMES)
#             manifest.at[idx, "paligemma_emotion"] = result
#         except Exception as e:
#             manifest.at[idx, "paligemma_emotion"] = f"ERROR: {e}"
#             print(f"\nError on {row['utterance_id']}: {e}")

#         processed += 1
#         pbar.update(1)

#         if processed % PG_SAVE_EVERY == 0:
#             manifest.to_csv(MANIFEST_PATH, index=False)

# # Final save
# manifest.to_csv(MANIFEST_PATH, index=False)

# done   = manifest["paligemma_emotion"].notna().sum()
# errors = manifest["paligemma_emotion"].str.startswith("ERROR:", na=False).sum()
# print(f"\nDone. {done}/{total} clips processed, {errors} errors.")
# print(f"Results saved → {MANIFEST_PATH}")
# manifest[["utterance_id", "emotion", "paligemma_emotion"]].head(10)


In [None]:

# # ── Retry errored / empty / no-valid-emotion PaliGemma predictions ────
# manifest = pd.read_csv(MANIFEST_PATH)

# def _has_no_valid_emotion(val) -> bool:
#     """True if val is a non-error string but contains no recognised emotion word."""
#     if pd.isna(val) or str(val).startswith("ERROR:"):
#         return False
#     return not any(w in VALID_EMOTIONS for w in str(val).split())

# is_error      = manifest["paligemma_emotion"].str.startswith("ERROR:", na=False)
# is_empty      = manifest["paligemma_emotion"].isna()
# is_no_emotion = manifest["paligemma_emotion"].map(_has_no_valid_emotion)
# retry_mask    = is_error | is_empty | is_no_emotion
# retry_idx     = list(manifest[retry_mask].index)

# print(f"Empty          : {is_empty.sum()}")
# print(f"Errored        : {is_error.sum()}")
# print(f"No valid emotion: {is_no_emotion.sum()}")
# print(f"Total to retry : {len(retry_idx)}")

# processed = 0

# with tqdm(total=len(retry_idx), desc="Retry PaliGemma") as pbar:
#     for idx in retry_idx:
#         row = manifest.loc[idx]
#         video_path = str(BASE_DIR / row["path"])

#         try:
#             result = predict_emotion_video(video_path, n_frames=PG_N_FRAMES)
#             manifest.at[idx, "paligemma_emotion"] = result
#         except Exception as e:
#             manifest.at[idx, "paligemma_emotion"] = f"ERROR: {e}"
#             print(f"\nError on {row['utterance_id']}: {e}")

#         processed += 1
#         pbar.update(1)

#         if processed % PG_SAVE_EVERY == 0:
#             manifest.to_csv(MANIFEST_PATH, index=False)

# # Final save
# manifest.to_csv(MANIFEST_PATH, index=False)

# total        = len(manifest)
# done         = manifest["paligemma_emotion"].notna().sum()
# errors       = manifest["paligemma_emotion"].str.startswith("ERROR:", na=False).sum()
# still_no_emo = manifest["paligemma_emotion"].map(_has_no_valid_emotion).sum()
# print(f"\nDone. {done}/{total} clips processed.")
# print(f"  Errors remaining        : {errors}")
# print(f"  Still no valid emotion  : {still_no_emo}")
# print(f"Results saved → {MANIFEST_PATH}")
# manifest[retry_mask][["utterance_id", "emotion", "paligemma_emotion"]].head(10)
