In [1]:
import torchvision
import transformers
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor, GenerationConfig
from qwen_vl_utils import process_vision_info

# Single checkpoint identifier shared by the model weights and the processor,
# so the two can never drift apart.
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

# Load the vision-language model, letting the library choose dtype and
# device placement ("auto" for both).
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
)

# Processor matching the checkpoint above (used for chat templating,
# input preparation and decoding further down).
processor = AutoProcessor.from_pretrained(MODEL_ID)

Loading checkpoint shards: 100%|██████████| 5/5 [00:05<00:00,  1.05s/it]


In [13]:
import os
import subprocess
import torch

def _read_transcript_context(scene_data):
    """Return the audio-transcript section of the prompt, or "" if unavailable.

    Reads the file at ``scene_data["transcript_path"]`` (UTF-8). Any failure is
    reported with ``print`` and treated as "no transcript" so that captioning
    can still proceed from the video alone.
    """
    audio_context = ""
    transcript_path = scene_data.get("transcript_path")
    if not transcript_path:
        return audio_context

    try:
        with open(transcript_path, "r", encoding="utf-8") as af:
            audio_text = af.read().strip()
        if audio_text:
            audio_context = (
                f"Audio transcript for Scene {scene_data['scene_number']}:\n{audio_text}\n\n"
                "Important: When describing the scene, combine what you observe visually "
                "with the context provided in the audio transcript. Use the audio information "
                "to enrich your understanding of the events and actions happening in the scene. "
                "Create a natural description that weaves together both visual and audio elements.\n"
            )
    except Exception as e:
        # Deliberately best-effort: a missing/unreadable transcript must not
        # abort captioning, but the problem is surfaced to the user.
        print(f"Error reading audio transcript: {e}")
    return audio_context


def _build_scene_prompt(scene_data, previous_caption=None):
    """Assemble the text prompt from previous caption, scene captions and transcript."""
    previous_context = ""
    if previous_caption:
        previous_context = (
            f"Previously described:\n{previous_caption}\n\n"
            "Do not repeat the same information.\n"
        )

    captions = scene_data.get("captions")
    if captions:
        captions_text = "\n".join(f"- {caption}" for caption in captions)
        scene_context = f"Scene {scene_data['scene_number']} captions:\n{captions_text}\n"
    else:
        scene_context = ""

    audio_context = _read_transcript_context(scene_data)

    base_prompt = (
        "Describe what is happening in this scene by integrating visual observations "
        "with available audio context. Consider both what you see and any additional "
        "context from the audio transcript to provide a complete understanding of the scene. "
        "Create a natural, flowing description that explains both the what and why of "
        "the events unfolding."
    )

    return (
        f"{previous_context}"
        f"{scene_context}\n"
        f"{audio_context}\n"
        f"{base_prompt}\n"
    )


def _generate_from_messages(messages, max_new_tokens):
    """Run the notebook-level ``model``/``processor`` on ``messages``.

    Returns the list of decoded strings for the newly generated tokens only.
    All tensors are locals of this helper, so they go out of scope as soon as
    it returns.
    """
    dtype = torch.bfloat16
    # Follow the model's own placement instead of assuming the default CUDA
    # device (the model is loaded with device_map="auto").
    device = model.device

    with torch.amp.autocast(device.type, dtype=dtype):
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(device, dtype=dtype)
        # Length of the prompt, used below to drop the echoed prompt tokens.
        input_length = inputs.input_ids.shape[1]

        # Greedy decoding: no sampling, single beam.
        generation_config = GenerationConfig(
            max_new_tokens=max_new_tokens,
            do_sample=False,
            num_beams=1,
        )
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                generation_config=generation_config,
            )

        generated_ids_trimmed = generated_ids[:, input_length:]
        return processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )


def generate_scene_caption(scene_data, previous_caption=None, max_new_tokens=128):
    """Generate a caption for one video scene with the loaded Qwen2.5-VL model.

    Args:
        scene_data: Mapping describing the scene. Keys read here:
            ``"video_path"`` and ``"scene_number"`` (required), plus the
            optional ``"duration"``, ``"captions"`` and ``"transcript_path"``.
        previous_caption: Optional earlier caption; when given it is placed in
            the prompt together with an instruction not to repeat it.
        max_new_tokens: Upper bound on generated tokens. Defaults to 128 (the
            previously hard-coded value); raise it if captions are cut off
            mid-sentence.

    Returns:
        The generated caption as a stripped string ("" if nothing was generated).
    """
    # Release cached GPU memory left over from earlier calls.
    torch.cuda.empty_cache()

    # ``or 0`` also covers a present-but-null duration.
    video_length = scene_data.get("duration") or 0
    fps = 1.0
    # Sample about one frame per second, capped at 32. The lower bound avoids
    # requesting 0 frames for very short clips or scenes without a duration.
    # NOTE(review): 2 is used because qwen_vl_utils appears to round frame
    # counts down to a multiple of 2 - confirm against the installed version.
    min_frames = 2
    max_frames_value = max(min_frames, min(32, int(video_length * fps)))

    enhanced_prompt = _build_scene_prompt(scene_data, previous_caption)

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": scene_data['video_path'],
                    "max_pixels": 320 * 240,
                    "fps": fps,
                    "max_frames": max_frames_value,
                },
                {"type": "text", "text": enhanced_prompt},
            ],
        }
    ]

    print(f"Processing Scene {scene_data['scene_number']} with max_frames={max_frames_value}")

    try:
        output_list = _generate_from_messages(messages, max_new_tokens)
    finally:
        # Run the cache cleanup even when generation raises (e.g. on OOM).
        torch.cuda.empty_cache()

    output_text = ''.join(output_list)
    # Keep only the text before the first "addCriterion" marker - presumably a
    # spurious token the model sometimes emits; confirm. For an empty output
    # this expression is simply "".
    return output_text.split("addCriterion")[0].strip()

In [8]:
import json
def get_scene_data(scene_number, scenes_json_path):
    """Return the entry for ``scene_number`` from a scene-info JSON file.

    Args:
        scene_number: Value to match against each entry's ``"scene_number"``.
        scenes_json_path: Path to a JSON file containing a list of scene
            dictionaries.

    Returns:
        The first scene dictionary whose ``"scene_number"`` equals
        ``scene_number``.

    Raises:
        ValueError: If no entry matches.
    """
    # Decode explicitly as UTF-8 rather than the platform default, so
    # non-ASCII text in the file loads the same way on every machine.
    with open(scenes_json_path, 'r', encoding='utf-8') as f:
        scenes = json.load(f)

    for scene in scenes:
        # .get() lets an entry without a "scene_number" be skipped instead of
        # aborting the whole lookup with a KeyError.
        if scene.get("scene_number") == scene_number:
            return scene

    raise ValueError(f"Scene {scene_number} not found in {scenes_json_path}")
# Load the metadata record for scene 2 of the sample video.
scene_2_data = get_scene_data(
    scene_number=2,
    scenes_json_path="videos/n9nC8liwZ5Y/n9nC8liwZ5Y_scenes/scene_info.json",
)

In [14]:
# Display the metadata record loaded for scene 2.
scene_2_data

{'scene_number': 2,
 'start_frame': 744,
 'end_frame': 1330,
 'start_time': 31.03094176152061,
 'end_time': 55.47197922422367,
 'duration': 24.441037462703058,
 'video_path': 'videos/n9nC8liwZ5Y/n9nC8liwZ5Y_scenes/scene_002.mp4',
 'transcript_path': 'videos/n9nC8liwZ5Y/n9nC8liwZ5Y_scenes/scene_002.txt'}

In [15]:
# Caption scene 2 on its own; no previous caption is passed.
generate_scene_caption(scene_2_data)

Processing Scene 2 with max_frames=24


'In this scene, the setting appears to be a dimly lit, possibly outdoor or semi-outdoor environment, given the presence of a crowd in the background. The focus is on an older man, presumably Uncle Ben, who is seated and appears to be in a state of distress or exhaustion. His expression is one of concern or worry, and his eyes are wide open, suggesting he might be in a moment of realization or shock. He is dressed in a brown jacket over a red and white checkered shirt, which adds to the sense of a rugged or practical individual.\n\nThe audio transcript reveals that the younger man, Peter, is calling'