In [1]:
import torchvision
import transformers
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import hashlib
import requests

import numpy as np
import decord
from decord import VideoReader, cpu
from IPython.display import Markdown, display

In [44]:
def get_video_frames(video_path, cache_dir='.cache'):
    """Return the I-frames (keyframes) of a video as a NumPy array, cached on disk.

    The cache key is the MD5 of the *path string*, not of the file contents,
    so replacing the video at the same path will still return the old frames
    until the cache file is deleted.

    Args:
        video_path: Path to the video file (passed to ffprobe and decord).
        cache_dir: Directory holding ``<md5>_frames.npy`` cache files; created
            if it does not exist.

    Returns:
        The array produced by ``VideoReader.get_batch(...).asnumpy()`` for the
        I-frame indices, or the previously cached array.

    Raises:
        subprocess.CalledProcessError: if ffprobe exits with a non-zero status.
    """
    # Imported locally so the function does not rely on a later notebook cell
    # having imported subprocess into the kernel.
    import subprocess

    os.makedirs(cache_dir, exist_ok=True)

    video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
    frames_cache_file = os.path.join(cache_dir, f'{video_hash}_frames.npy')

    if os.path.exists(frames_cache_file):
        return np.load(frames_cache_file)

    # Ask ffprobe for the picture type of every video frame. The command is an
    # argument list (no shell) so paths containing spaces, quotes or shell
    # metacharacters are passed through verbatim.
    cmd = [
        'ffprobe', '-v', 'error',
        '-select_streams', 'v',
        '-show_frames',
        '-show_entries', 'frame=pict_type',
        '-of', 'csv',
        video_path,
    ]
    result = subprocess.check_output(cmd).decode()

    iframe_indices = []
    frame_idx = 0
    for line in result.splitlines():
        fields = line.strip().split(',')
        if fields[0] != 'frame':
            # Blank lines and non-frame rows must not advance the frame counter.
            continue
        # Exact field match instead of a substring test on the whole line.
        if 'I' in fields[1:]:
            iframe_indices.append(frame_idx)
        frame_idx += 1

    # Explicit integer dtype: an empty list would otherwise become float64.
    iframe_indices = np.array(iframe_indices, dtype=np.int64)
    print(f"Number of i-frames: {len(iframe_indices)}")

    vr = VideoReader(video_path, ctx=cpu(0))
    frames = vr.get_batch(iframe_indices).asnumpy()

    np.save(frames_cache_file, frames)

    return frames


In [3]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor, GenerationConfig
from qwen_vl_utils import process_vision_info

# Checkpoint identifier passed to both from_pretrained() calls below.
model_path = "Qwen/Qwen2.5-VL-7B-Instruct"

# Load the vision-language model. `model` is a notebook-level global that
# generate_scene_descriptions() reads directly.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    # NOTE(review): presumably requires the flash-attn package and a supported GPU — confirm.
    attn_implementation="flash_attention_2",
    # NOTE(review): "auto" lets the loader decide weight placement; the model
    # may not end up on a single device — verify before hard-coding a device elsewhere.
    device_map="auto",
)
# Matching processor, used for the chat template, input tensors and decoding.
# Also a notebook-level global read by generate_scene_descriptions().
processor = AutoProcessor.from_pretrained(model_path)

Loading checkpoint shards: 100%|██████████| 5/5 [00:07<00:00,  1.56s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [4]:
from gtts import gTTS
def get_tts_duration(text):
    """Synthesize ``text`` with gTTS and return the audio duration in seconds.

    Args:
        text: Text to speak. ``None``, empty and whitespace-only strings are
            treated as silence.

    Returns:
        float: Duration reported by ffprobe for the generated MP3, or ``0.0``
        when there is nothing to speak.

    Raises:
        subprocess.CalledProcessError: if ffprobe exits with a non-zero status.
        ValueError: if ffprobe's output cannot be parsed as a float.
        Any exception raised by gTTS while generating or saving the audio.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having imported these modules into the kernel.
    import subprocess
    import tempfile

    if not text or text.isspace():
        return 0.0

    # A temporary directory (rather than an open NamedTemporaryFile) lets gTTS
    # and ffprobe open the file by name on every platform; it is removed when
    # the block exits, even on error.
    with tempfile.TemporaryDirectory() as tmp_dir:
        mp3_path = os.path.join(tmp_dir, 'tts.mp3')
        gTTS(text=text, lang='en').save(mp3_path)
        # Argument list instead of a shell string: no quoting issues.
        cmd = [
            'ffprobe', '-v', 'error',
            '-select_streams', 'a:0',
            '-show_entries', 'format=duration',
            '-of', 'csv=p=0',
            mp3_path,
        ]
        raw_duration = subprocess.check_output(cmd).decode().strip()

    return float(raw_duration)

In [5]:
def _format_timed_entries(entries, label):
    """Render transcript/caption entries as one prompt line each."""
    lines = []
    for number, entry in enumerate(entries or [], start=1):
        if isinstance(entry, dict) and all(key in entry for key in ('start', 'end', 'text')):
            lines.append(f"{label} {number}: [{entry['start']:.2f}s - {entry['end']:.2f}s] "
                         f"{entry['text']}\n")
        else:
            # Entry without timing fields (e.g. a plain caption string): include verbatim.
            lines.append(f"{label} {number}: {entry}\n")
    return "".join(lines)


def _parse_description_response(response):
    """Extract the list of description dicts from the raw VLM response text."""
    # Imported locally so the helper does not depend on cell execution order.
    import json
    import re

    try:
        # Greedy match from the first '[' to the last ']' so surrounding prose is ignored.
        json_match = re.search(r'\[.*\]', response, re.DOTALL)
        if json_match:
            return json.loads(json_match.group(0))

        print("Could not find JSON format in the model's response. Falling back to text parsing.")

        # Simple line-based fallback: "<key>: <time>, ...: <description>"
        descriptions = []
        for line in response.split('\n'):
            if "start_time" in line and "description" in line:
                parts = line.split(':', 2)
                if len(parts) >= 3:
                    time_part = parts[1].split(',')[0].strip()
                    desc_part = parts[2].strip()
                    try:
                        start_time = float(time_part)
                    except ValueError:
                        continue
                    descriptions.append({
                        "start_time": start_time,
                        "description": desc_part,
                        "duration": get_tts_duration(desc_part)
                    })
        return descriptions
    except Exception as e:
        # Best effort: a malformed response yields no descriptions rather than a crash.
        print(f"Error parsing VLM response: {str(e)}")
        print(f"Raw response: {response}")
        return []


def _validate_descriptions(description_list, scene_duration):
    """Drop malformed entries, fix durations via TTS and clamp start times to the scene."""
    validated_descriptions = []
    if not isinstance(description_list, list):
        return validated_descriptions

    for desc in description_list:
        # The model may emit non-dict items (strings, numbers); skip them.
        if not isinstance(desc, dict):
            continue
        if not all(key in desc for key in ['start_time', 'description']):
            continue

        try:
            desc['start_time'] = float(desc['start_time'])
        except (TypeError, ValueError):
            print(f"Skipping description with non-numeric start_time: {desc['start_time']!r}")
            continue
        desc['description'] = str(desc['description'])

        # Measure the real spoken duration once; TTS needs network access and may fail.
        try:
            measured = get_tts_duration(desc['description'])
        except Exception as e:
            print(f"Error getting TTS duration: {str(e)}")
            measured = None

        claimed = desc.get('duration')
        claimed_ok = isinstance(claimed, (int, float)) and not isinstance(claimed, bool)
        if measured is not None:
            # Trust the model's estimate only when it is within 0.5s of the TTS result.
            if not claimed_ok or abs(claimed - measured) > 0.5:
                desc['duration'] = measured
        elif not claimed_ok:
            desc['duration'] = len(desc['description'].split()) * 0.3  # Rough estimate

        # Validate timing is within scene boundaries
        if desc['start_time'] < 0 or desc['start_time'] + desc['duration'] > scene_duration:
            print(f"Warning: Description with timing {desc['start_time']}s exceeds scene boundaries. Adjusting.")
            # NOTE: a description longer than the whole scene is moved to 0 but still overruns.
            desc['start_time'] = max(0, min(desc['start_time'], scene_duration - desc['duration']))

        validated_descriptions.append(desc)

    validated_descriptions.sort(key=lambda x: x['start_time'])
    return validated_descriptions


def generate_scene_descriptions(scene_data, previous_caption=None, max_new_tokens=2048,
                              total_pixels=20480 * 28 * 28, min_pixels=16 * 28 * 28):
    """
    Generate audio descriptions with placement recommendations from the VLM
    Returns a list of descriptions with timing info

    Relies on the notebook-level globals ``model`` and ``processor`` and on
    ``process_vision_info`` / ``get_tts_duration`` being defined.

    Args:
        scene_data: Scene dict. Reads ``scene_path``, ``duration`` (or
            ``start_time``/``end_time``), ``transcripts``/``transcript``,
            ``captions`` and ``scene_number``.
        previous_caption: Optional background text from earlier scenes.
        max_new_tokens: Generation budget passed to ``model.generate``.
        total_pixels, min_pixels: Video sizing hints forwarded in the video message entry.

    Returns:
        List of dicts with ``start_time``, ``description`` and ``duration``,
        sorted by ``start_time``. Empty when the response cannot be parsed.

    DCMP Guidelines implemented:
    1. Essential Information: Focus on what's critical for understanding
    2. Timing and Placement: Avoid talking over essential audio
    3. Language and Tense: Present tense, active voice, third-person
    4. Objectivity: Describe without interpretation
    5. Characters: Consistent identification methods
    6. Scene Changes: Clearly convey relevant transitions
    7. Text On Screen: Read essential text consistently
    8. Vocabulary: Clear, concise, age-appropriate language
    9. Sound Effects: Describe unrecognizable but relevant sounds
    10. Visual Details: Shape, size, texture, color when relevant
    """
    scene_path = scene_data.get('scene_path', "unknown_scene_path")
    scene_duration = scene_data.get("duration", 0)
    if scene_duration == 0:
        scene_end = scene_data.get("end_time", 0)
        scene_start = scene_data.get("start_time", 0)
        scene_duration = scene_end - scene_start

    # scene_info.json stores dialogue under "transcript"; accept both spellings.
    transcripts = scene_data.get("transcripts") or scene_data.get("transcript")
    transcripts_info = _format_timed_entries(transcripts, "Dialogue")

    # Previously an undefined name (NameError on a fresh kernel); build it from the scene.
    captions_info = _format_timed_entries(scene_data.get("captions"), "Caption")

    # Build the context from previous captions and scene information
    context_parts = []
    if previous_caption:
        context_parts.append(
            f"Background context (for reference only, do not repeat):\n{previous_caption}\n\n"
            "Focus on new observations, actions, and details from the current scene.\n"
        )

    if scene_data.get("captions"):
        captions_text = "\n".join([f"- {caption}" for caption in scene_data["captions"]])
        context_parts.append(f"Scene {scene_data.get('scene_number', '')} captions:\n{captions_text}\n")

    context = "\n".join(context_parts)

    # Create prompt with only DCMP audio description guidelines
    prompt = (
        f"{context}\n\n"
        f"SCENE INFORMATION:\n"
        f"- Scene duration: {scene_duration:.2f} seconds\n"
        f"- Dialogue transcript with timing:\n{transcripts_info}\n"
        f"- Captions with timing:\n{captions_info}\n"

        "\nIMPORTANT CLARIFICATION ABOUT TEXT:\n"
        "- Audio descriptors like '[Music]', '[Applause]', '[Laughter]', etc. are NOT on-screen text\n"

        "\nAUDIO DESCRIPTION GUIDELINES:\n"
        "WHAT TO DESCRIBE:\n"
        "1. Describe visual elements clearly and concisely, focusing on what's most essential, labeled as type 'visual'.\n"
        "2. Use brief, impactful descriptions that fit precisely in available silent gaps.\n"
        "3. Capture on-screen text exactly as it appears, labeled as type 'text'.\n"
        "4. Be factual, objective, and precise in your descriptions.\n"
        "5. Use proper terminology and names when they appear in the video.\n"
        "6. Match the tone and mood of the video in your descriptions.\n"
        "7. Use inline description when possible (descriptions that fit naturally within existing silent gaps between dialogue), "
            "and only use extended descriptions when absolutely necessary for critical visual information.\n"

        "\nWHAT NOT TO DESCRIBE:\n"
        "1. NEVER place descriptions over dialogue, narration, or important audio.\n"
        "2. Don't describe what viewers can already infer from existing audio cues.\n"
        "3. Avoid over-describing - focus on key visual elements only.\n"
        "4. Don't interpret, analyze, or editorialize about what you see.\n"
        "5. Don't spoil upcoming visual elements or surprises before they happen.\n"
        "6. Don't censor or soften descriptions of content.\n"
        "7. Skip obvious sound effects that are self-explanatory.\n"

        "\nIMPORTANT TIMING CONSTRAINTS:\n"
        f"- CRITICAL: Total scene duration is {scene_duration:.2f} seconds.\n"
        "- PRIORITIZE INLINE DESCRIPTIONS: Generate descriptions that fit within existing silent gaps between dialogue and important audio\n"
        "- Descriptions should NEVER overlap with dialogue or important audio\n"

        # Task instructions for the VLM
        "\nTASK:\n"
        "1. Analyze the scene and dialogue timing.\n"
        "2. Identify opportunities for audio description, focusing on silent gaps and moments between essential dialogue.\n"
        "3. For each opportunity, provide:\n"
        "   a. The exact timestamp where description should start (in seconds)\n"
        "   b. An appropriate audio description that fits in the available gap\n"
        "   c. The estimated spoken duration of your description (in seconds)\n"

        # Format instructions for the output structure
        "\nFormat your response as a JSON list with each description having 'start_time', 'description', and 'duration' fields\n"
        "Example: [{\"start_time\": 5.2, \"description\": \"John enters the room, looking nervous.\", \"duration\": 2.5}]\n"
    )

    messages = [
        {"role": "system", "content": "You are a professional audio describer following DCMP guidelines."},
        {"role": "user", "content": [
            {"type": "text", "text": prompt},
            {"video": scene_path, "total_pixels": total_pixels, "min_pixels": min_pixels},
        ]},
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True)
    fps_inputs = video_kwargs['fps']
    inputs = processor(text=[text], images=image_inputs, videos=video_inputs, fps=fps_inputs,
                       padding=True, return_tensors="pt")

    # Follow the model's own device instead of hard-coding 'cuda'
    # (the model is loaded with device_map="auto").
    inputs = inputs.to(model.device)
    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        top_p=0.9
    )
    # Strip the prompt tokens so only the newly generated continuation is decoded.
    generated_ids = [full_ids[len(prompt_ids):]
                     for prompt_ids, full_ids in zip(inputs.input_ids, output_ids)]
    response = processor.batch_decode(generated_ids, skip_special_tokens=True,
                                      clean_up_tokenization_spaces=True)[0]

    # Process the VLM's response to extract descriptions with timing
    description_list = _parse_description_response(response)

    # Validate, refine and sort descriptions by start time
    return _validate_descriptions(description_list, scene_duration)

In [6]:
import json
def get_scene_data(scene_number, scenes_json_path):
    """Look up one scene record by its number in a scene_info JSON file.

    Args:
        scene_number: Value compared against each record's "scene_number".
        scenes_json_path: Path to a JSON file containing a list of scene dicts.

    Returns:
        The first scene dict whose "scene_number" equals ``scene_number``.

    Raises:
        ValueError: if no record matches.
    """
    with open(scenes_json_path, 'r') as handle:
        all_scenes = json.load(handle)

    not_found = object()
    matches = (entry for entry in all_scenes if entry["scene_number"] == scene_number)
    hit = next(matches, not_found)
    if hit is not_found:
        raise ValueError(f"Scene {scene_number} not found in {scenes_json_path}")
    return hit


In [64]:
# Load the metadata record for scene 1 and display it.
# (Plain string literal: the path has no placeholders, so no f-prefix is needed.)
scene_1_data = get_scene_data(1, "videos/_1DDhUnyvwY/_1DDhUnyvwY_scenes/scene_info.json")
scene_1_data

{'scene_number': 1,
 'start_frame': 0,
 'end_frame': 324,
 'start_time': 0.0,
 'end_time': 12.96,
 'duration': 12.96,
 'scene_path': 'videos/_1DDhUnyvwY/_1DDhUnyvwY_scenes/scene_001.mp4',
 'transcript': []}

In [75]:
# Generate audio descriptions for scene 1 only; no previous_caption is passed,
# so the prompt carries no background context from earlier scenes.
scene_1_caption = generate_scene_descriptions(scene_1_data)

[buffer @ 0x55b2465b5b40] Unable to parse option value "-1" as pixel format
[buffer @ 0x55b2465b5b40] Unable to parse option value "-1" as pixel format
[buffer @ 0x55b2465b5b40] Error setting option pix_fmt to value -1.
[in @ 0x55b24c279840] Error applying options to the filter.
video_reader_backend decord error, use torchvision as default, msg: [16:13:32] /github/workspace/src/video/ffmpeg/filter_graph.cc:61: Check failed: avfilter_graph_create_filter(&buffersrc_ctx_, buffersrc, "in", args, __null, filter_graph_.get()) >= 0 (-22 vs. 0) Cannot create buffer source




In [76]:
# Display the descriptions returned for scene 1.
scene_1_caption

[{'start_time': 0.0,
  'description': 'The video begins with a view of Earth from space, showing the continents and oceans. The camera slowly zooms in on Africa, highlighting its geographical features.',
  'duration': 11.904},
 {'start_time': 0,
  'description': 'The camera pans across the African landscape, showcasing the diverse terrain, including mountains and valleys. The coastline is visible, with the ocean meeting the land in a dramatic fashion.',
  'duration': 13.656},
 {'start_time': 0.19200000000000017,
  'description': "The camera continues to zoom in on Africa, focusing on the continent's vast landscapes and terrain. It captures the greenery of the land and the blue expanse of the ocean surrounding it.",
  'duration': 12.768}]

In [61]:
# NOTE(review): generate_scene_caption, scene_2_frames and scene_2_data are not
# defined in any visible cell of this notebook, so this cell fails on a fresh
# kernel. It looks like a leftover from an earlier version — confirm, then
# remove it or port it to generate_scene_descriptions().
generate_scene_caption(scene_2_frames, scene_2_data, previous_caption=scene_1_caption)

'Peter Parker rushes to help an injured Uncle Ben, expressing concern and urgency.'

In [11]:
import os
import argparse
import subprocess
import hashlib
import json
import numpy as np
import torch
import tempfile
import re
import gc
import time

In [12]:
def process_all_scenes(video_folder):
    """Generate audio descriptions for every scene of a video and save them.

    Expects this layout, where ``<id>`` is the last component of ``video_folder``:
        <video_folder>/<id>.json                    - metadata with "title" / "description"
        <video_folder>/<id>_scenes/scene_info.json  - list of scene dicts

    Each scene dict gets an "audio_descriptions" list, and scene_info.json is
    rewritten in place once all scenes have been processed. Descriptions from
    earlier scenes are appended to the context passed to later scenes.

    Args:
        video_folder: Path to the video's folder; a trailing path separator is tolerated.

    Returns:
        The updated list of scene dicts, or ``None`` if the metadata file or
        scene_info.json is missing (an error message is printed).
    """
    # normpath strips a trailing separator; basename("videos/abc/") alone is ''
    # and would make every lookup below fail.
    video_id = os.path.basename(os.path.normpath(video_folder))
    video_metadata_path = os.path.join(video_folder, f"{video_id}.json")
    scenes_folder = os.path.join(video_folder, f"{video_id}_scenes")
    scenes_json_path = os.path.join(scenes_folder, "scene_info.json")

    if not os.path.exists(video_metadata_path):
        print(f"Error: {video_metadata_path} not found. Unable to retrieve title and description.")
        return

    with open(video_metadata_path, "r") as f:
        video_metadata = json.load(f)

    video_title = video_metadata.get("title", "Unknown Title")
    video_description = video_metadata.get("description", "")
    # Seed the running context with the video's title and description.
    previous_caption = f"Video Title: {video_title}\n{video_description}\n"
    print(previous_caption)

    if not os.path.exists(scenes_json_path):
        print(f"Error: scene_info.json not found in {scenes_folder}")
        return

    with open(scenes_json_path, "r") as f:
        scene_list = json.load(f)

    print(f"Processing {len(scene_list)} scenes in {scenes_folder}...")

    for idx, scene_data in enumerate(scene_list, start=1):
        print(f"\nProcessing Scene {idx}: {scene_data.get('scene_number', 'Unknown')}")

        # Generate descriptions with VLM-recommended timing
        descriptions = generate_scene_descriptions(
            scene_data,
            previous_caption
        )

        if descriptions:
            print(f"Generated {len(descriptions)} descriptions:")
            for i, desc in enumerate(descriptions):
                print(f"  {i+1}. [{desc['start_time']:.2f}s] ({desc['duration']:.2f}s): {desc['description']}")

            # Store descriptions in scene data
            scene_data["audio_descriptions"] = descriptions

            # Combine all descriptions for contextual continuity in next scene
            previous_caption += "\n".join([d['description'] for d in descriptions]) + "\n"
        else:
            print("No descriptions generated for this scene")
            scene_data["audio_descriptions"] = []

    with open(scenes_json_path, "w") as f:
        json.dump(scene_list, f, indent=4)

    print(f"\nScene descriptions updated in: {scenes_json_path}")
    return scene_list

In [13]:
# Run the full pipeline for this video. scene_info.json inside the folder is
# rewritten in place, and the returned scene list is displayed as the cell output.
process_all_scenes("videos/_1DDhUnyvwY")

[buffer @ 0x564f8278b9c0] Unable to parse option value "-1" as pixel format
[buffer @ 0x564f8278b9c0] Unable to parse option value "-1" as pixel format
[buffer @ 0x564f8278b9c0] Error setting option pix_fmt to value -1.
[in @ 0x564f93ec2800] Error applying options to the filter.
video_reader_backend decord error, use torchvision as default, msg: [16:53:52] /github/workspace/src/video/ffmpeg/filter_graph.cc:61: Check failed: avfilter_graph_create_filter(&buffersrc_ctx_, buffersrc, "in", args, __null, filter_graph_.get()) >= 0 (-22 vs. 0) Cannot create buffer source


Video Title: jane goodall


Processing 4 scenes in videos/_1DDhUnyvwY/_1DDhUnyvwY_scenes...

Processing Scene 1: 1
Generated 3 descriptions:
  1. [0.00s] (12.05s): The video begins with a rotating globe, showcasing the Earth from space. The camera zooms in on Africa, highlighting its vast landmass and surrounding oceans.
  2. [0.65s] (12.31s): The camera pans across the terrain, revealing lush greenery and rugged mountains. The coastline is visible, with the ocean's deep blue waters contrasting against the green land.
  3. [2.00s] (9.19s): The view transitions to a detailed satellite image of a region in Africa, focusing on the landscape and geographical features.

Processing Scene 2: 2
Generated 3 descriptions:
  1. [0.00s] (12.00s): The video opens with a rotating globe, showcasing the Earth from space. The camera zooms in on Africa, highlighting its vast landmass and surrounding oceans.
  2. [2.69s] (12.31s): The camera pans across the terrain, revealing lush greenery and rugged mou

[{'scene_number': 1,
  'start_frame': 0,
  'end_frame': 324,
  'start_time': 0.0,
  'end_time': 12.96,
  'duration': 12.96,
  'scene_path': 'videos/_1DDhUnyvwY/_1DDhUnyvwY_scenes/scene_001.mp4',
  'transcript': [],
  'audio_descriptions': [{'start_time': 0.0,
    'description': 'The video begins with a rotating globe, showcasing the Earth from space. The camera zooms in on Africa, highlighting its vast landmass and surrounding oceans.',
    'duration': 12.048},
   {'start_time': 0.6480000000000015,
    'description': "The camera pans across the terrain, revealing lush greenery and rugged mountains. The coastline is visible, with the ocean's deep blue waters contrasting against the green land.",
    'duration': 12.312},
   {'start_time': 2.0,
    'description': 'The view transitions to a detailed satellite image of a region in Africa, focusing on the landscape and geographical features.',
    'duration': 9.192}]},
 {'scene_number': 2,
  'start_frame': 324,
  'end_frame': 699,
  'start_t

: 