In [1]:
import torchvision
import transformers
import torch

In [2]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor, GenerationConfig
from qwen_vl_utils import process_vision_info

# Single source of truth for the checkpoint id, so the model and its processor
# are always loaded from the same checkpoint.
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

# dtype and device placement are left to the library ("auto").
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [3]:
import os
import subprocess
import torch

# Smallest frame budget ever handed to the video loader.
# NOTE(review): qwen-vl-utils appears to round the frame count down to a
# multiple of 2 and to reject a result of 0 - confirm against the installed version.
_MIN_VIDEO_FRAMES = 2


def _frame_budget(duration, fps, cap):
    """Return the ``max_frames`` value to request for a clip.

    Args:
        duration: Clip length in seconds; may be ``None``, 0 or negative when unknown.
        fps: Sampling rate, in frames per second.
        cap: Upper limit on the budget.

    Returns:
        ``int(duration * fps)`` limited to *cap* and never below
        ``_MIN_VIDEO_FRAMES``. When the duration is unknown the budget is *cap*
        itself, because the value is only passed on as an upper bound.
    """
    if not duration or duration <= 0:
        return cap
    return max(_MIN_VIDEO_FRAMES, min(cap, int(duration * fps)))


def _read_transcript(transcript_path):
    """Return the stripped text of *transcript_path*, or ``""`` if it cannot be read."""
    try:
        with open(transcript_path, "r", encoding="utf-8") as af:
            return af.read().strip()
    except (OSError, ValueError) as e:
        # Best effort: a missing or undecodable transcript must not abort
        # captioning (UnicodeDecodeError is a ValueError subclass).
        print(f"Error reading audio transcript: {e}")
        return ""


def _build_scene_prompt(scene_data, previous_caption):
    """Assemble the text prompt from the previous caption, scene captions and transcript."""
    previous_context = ""
    if previous_caption:
        previous_context = (
            f"Previously described:\n{previous_caption}\n\n"
            "Do not repeat the same information.\n"
        )

    scene_context = ""
    if scene_data.get("captions"):
        captions_text = "\n".join([f"- {caption}" for caption in scene_data["captions"]])
        scene_context = f"Scene {scene_data['scene_number']} captions:\n{captions_text}\n"

    # Build audio context if an audio transcript is available
    audio_context = ""
    transcript_path = scene_data.get("transcript_path")
    if transcript_path:
        audio_text = _read_transcript(transcript_path)
        if audio_text:
            audio_context = (
                f"Audio transcript for Scene {scene_data['scene_number']}:\n{audio_text}\n"
                "Important: When describing the scene, combine what you observe visually with the context provided in the audio transcript. "
                "Use the audio information to enrich your understanding of the events and actions happening in the scene. "
                "Create a natural description that weaves together both visual and audio elements.\n"
            )

    base_prompt = (
        "Describe what is happening in this scene by integrating visual observations with available audio context and scene context. "
        "Consider both what you see and any additional context from the audio transcript and scene context to provide a complete understanding of the scene. "
        "Important: Be concise and conversational. The generated description should not be more than 5 seconds long."
    )

    return (
        f"{previous_context}"
        f"{scene_context}\n"
        f"{audio_context}\n"
        f"{base_prompt}\n"
    )


def generate_scene_caption(scene_data, previous_caption=None, *, fps=1.0,
                           max_frames=32, max_new_tokens=128):
    """Generate a short caption for one scene clip with the loaded vision-language model.

    Relies on the notebook-level ``model`` and ``processor`` objects and on
    ``process_vision_info`` from ``qwen_vl_utils``.

    Args:
        scene_data: Mapping describing the scene. ``video_path`` and
            ``scene_number`` are required; ``duration`` (seconds), ``captions``
            (list of strings) and ``transcript_path`` are optional.
        previous_caption: Caption of the preceding scene. When truthy it is
            quoted in the prompt with an instruction not to repeat it.
        fps: Frame sampling rate requested for the video.
        max_frames: Upper limit on the number of frames requested.
        max_new_tokens: Generation length limit.

    Returns:
        The decoded text, cut at the first ``"addCriterion"`` marker and
        stripped of surrounding whitespace (``""`` if nothing was generated).

    Raises:
        KeyError: If ``scene_data`` lacks ``video_path`` or ``scene_number``.
    """
    # Clear CUDA cache
    torch.cuda.empty_cache()

    # `.get("duration")` instead of a 0 default so an explicit None is handled too.
    max_frames_value = _frame_budget(scene_data.get("duration"), fps, max_frames)
    enhanced_prompt = _build_scene_prompt(scene_data, previous_caption)

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": scene_data['video_path'],
                    "max_pixels": 320 * 240,
                    "fps": fps,
                    "max_frames": max_frames_value,
                },
                {"type": "text", "text": enhanced_prompt},
            ],
        }
    ]

    print(f"Processing Scene {scene_data['scene_number']} with max_frames={max_frames_value}")

    dtype = torch.bfloat16
    generation_config = GenerationConfig(
        max_new_tokens=max_new_tokens,
        do_sample=False,
        num_beams=1
    )

    try:
        with torch.amp.autocast('cuda', dtype=dtype):
            text = processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )

            inputs = inputs.to("cuda", dtype=dtype)
            input_length = inputs.input_ids.shape[1]

            with torch.no_grad():
                generated_ids = model.generate(
                    **inputs,
                    generation_config=generation_config
                )

            # Drop the echoed prompt tokens; keep only what the model produced.
            generated_ids_trimmed = generated_ids[:, input_length:]
            output_list = processor.batch_decode(
                generated_ids_trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False
            )
    finally:
        # Runs on failure too: a traceback kept alive by the notebook would
        # otherwise pin these tensors through this frame's locals.
        inputs = generated_ids = generated_ids_trimmed = None
        torch.cuda.empty_cache()

    # Keep only the text before the first "addCriterion" marker (presumably a
    # stray token the model sometimes emits - confirm).
    return ''.join(output_list).split("addCriterion")[0].strip()

In [22]:
import json
def get_scene_data(scene_number, scenes_json_path):
    """Look up one scene record in a scene-info JSON file.

    Args:
        scene_number: Value compared against each record's ``"scene_number"``.
        scenes_json_path: Path to a JSON file holding a list of scene records.

    Returns:
        The first record whose ``"scene_number"`` equals *scene_number*.

    Raises:
        ValueError: If no record matches.
    """
    with open(scenes_json_path, 'r') as handle:
        all_scenes = json.load(handle)

    match = next(
        (entry for entry in all_scenes if entry["scene_number"] == scene_number),
        None,
    )
    if match is None:
        raise ValueError(f"Scene {scene_number} not found in {scenes_json_path}")
    return match
# Load the metadata record for scene 2 (plain literal: the path has no placeholders).
scene_2_data = get_scene_data(2, "videos/n9nC8liwZ5Y/n9nC8liwZ5Y_scenes/scene_info.json")

In [5]:
# Display the loaded record to check its fields before captioning.
scene_2_data

{'scene_number': 2,
 'start_frame': 744,
 'end_frame': 1330,
 'start_time': 31.03094176152061,
 'end_time': 55.47197922422367,
 'duration': 24.441037462703058,
 'video_path': 'videos/n9nC8liwZ5Y/n9nC8liwZ5Y_scenes/scene_002.mp4',
 'transcript_path': 'videos/n9nC8liwZ5Y/n9nC8liwZ5Y_scenes/scene_002.txt'}

In [26]:
# Caption scene 2 on its own (no previous caption is supplied).
generate_scene_caption(scene_2_data)

Processing Scene 2 with max_frames=24


'The scene shows an older man, presumably Uncle Ben, in a distressed state, looking upwards with a worried expression. A younger man, Peter, is seen comforting him, gently holding his hand. The audio indicates that Peter is trying to reassure Uncle Ben, calling out his name and promising to stay with him. The setting appears to be a dimly lit, possibly outdoor or industrial environment, adding to the tension of the moment.'

In [4]:
import os
import json

def process_all_scenes(scene_folder):
    """Caption every scene listed in a folder's ``scene_info.json`` and save the result.

    Scenes are captioned in file order; each caption is passed to the next
    call as ``previous_caption``. The combined text is written to
    ``final_captions.txt`` inside *scene_folder*.

    Args:
        scene_folder: Directory containing ``scene_info.json``.

    Returns:
        The combined caption text, one ``"Scene N: ..."`` line per scene, or
        ``None`` (after printing an error) when ``scene_info.json`` is missing.
    """
    scenes_json_path = os.path.join(scene_folder, "scene_info.json")
    if not os.path.exists(scenes_json_path):
        print(f"Error: scene_info.json not found in {scene_folder}")
        return None

    # Explicit UTF-8, matching how transcripts are read, instead of the
    # platform-dependent default encoding.
    with open(scenes_json_path, "r", encoding="utf-8") as f:
        scene_list = json.load(f)

    print(f"Processing {len(scene_list)} scenes in {scene_folder}...")

    labelled_captions = []
    previous_caption = None

    for position, scene_data in enumerate(scene_list, start=1):
        scene_caption = generate_scene_caption(scene_data, previous_caption=previous_caption)
        # Label with the scene's own number so the text stays correct when the
        # list is filtered or does not start at 1; fall back to list position.
        scene_label = scene_data.get("scene_number", position)
        labelled_captions.append(f"Scene {scene_label}: {scene_caption}")
        previous_caption = scene_caption

    # Merge captions
    final_caption_text = "\n".join(labelled_captions)

    # Save captions to a text file (UTF-8: captions may contain non-ASCII text)
    caption_output_path = os.path.join(scene_folder, "final_captions.txt")
    with open(caption_output_path, "w", encoding="utf-8") as f:
        f.write(final_caption_text)

    print(f"\nCaptioning complete! Final captions saved to: {caption_output_path}")
    return final_caption_text

In [5]:
# Caption every scene listed in this folder's scene_info.json; the returned text is displayed below.
process_all_scenes("videos/n9nC8liwZ5Y/n9nC8liwZ5Y_scenes")

Processing 21 scenes in videos/n9nC8liwZ5Y/n9nC8liwZ5Y_scenes...
Processing Scene 1 with max_frames=31


qwen-vl-utils using decord to read video.


Processing Scene 2 with max_frames=24
Processing Scene 3 with max_frames=6
Processing Scene 4 with max_frames=9
Processing Scene 5 with max_frames=16
Processing Scene 6 with max_frames=13
Processing Scene 7 with max_frames=6
Processing Scene 8 with max_frames=6
Processing Scene 9 with max_frames=27
Processing Scene 10 with max_frames=12
Processing Scene 11 with max_frames=17
Processing Scene 12 with max_frames=5
Processing Scene 13 with max_frames=32
Processing Scene 14 with max_frames=5
Processing Scene 15 with max_frames=12
Processing Scene 16 with max_frames=5
Processing Scene 17 with max_frames=32
Processing Scene 18 with max_frames=5
Processing Scene 19 with max_frames=29
Processing Scene 20 with max_frames=13
Processing Scene 21 with max_frames=15

Captioning complete! Final captions saved to: videos/n9nC8liwZ5Y/n9nC8liwZ5Y_scenes/final_captions.txt


'Scene 1: The scene opens with a young man standing on a busy city street at night, looking around nervously. Suddenly, a crowd gathers, and a commotion ensues. The young man is surrounded by people, and a police officer appears, trying to control the situation. The audio reveals that a car jacker has been shot, and paramedics are on the way. The young man seems to be in the middle of the chaos, possibly trying to help or understand what\'s happening.\nScene 2: The scene shifts to a dimly lit room where an older man, presumably Uncle Ben, is sitting in a chair, appearing distressed. A younger man, Peter, is seen comforting him, gently holding his hand. The audio reveals Peter calling out to Uncle Ben, asking him to stay on their way and stay back, indicating a sense of urgency and concern. The interaction between the two suggests a moment of emotional support amidst a tense situation.\nScene 3: The scene shows an older man, presumably Uncle Ben, with a distressed expression, his face c