In [1]:
import torchvision
import transformers
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import hashlib
import requests

import numpy as np
import decord
from decord import VideoReader, cpu

In [3]:
import subprocess

In [4]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor, GenerationConfig
from qwen_vl_utils import process_vision_info

model_path = "Qwen/Qwen2.5-VL-7B-Instruct"

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_path)

Loading checkpoint shards: 100%|██████████| 5/5 [00:04<00:00,  1.19it/s]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [5]:
def get_video_frames(video_path, cache_dir='.cache'):
    MAX_FRAMES = 128
    os.makedirs(cache_dir, exist_ok=True)

    video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
    frames_cache_file = os.path.join(cache_dir, f'{video_hash}_frames.npy')

    '''if os.path.exists(frames_cache_file):
        frames = np.load(frames_cache_file)
        return frames  # Return frames directly'''

    # Get I-frame indices using ffprobe
    cmd = f'ffprobe -select_streams v -show_frames -show_entries frame=pict_type,pkt_pts_time -of csv {video_path}'
    result = subprocess.check_output(cmd, shell=True).decode()

    iframe_indices = []
    frame_idx = 0
    for line in result.split('\n'):
        if 'I' in line:
            iframe_indices.append(frame_idx)
        frame_idx += 1

    iframe_indices = np.array(iframe_indices)
    print(f"Number of i-frames: {len(iframe_indices)}")

    if len(iframe_indices) > MAX_FRAMES:
        selected_indices = np.linspace(0, len(iframe_indices) - 1, num=MAX_FRAMES, dtype=int)
        iframe_indices = iframe_indices[selected_indices]
    print(f"Number of i-frames: {len(iframe_indices)}")
    vr = VideoReader(video_path, ctx=cpu(0))
    frames = vr.get_batch(iframe_indices).asnumpy()

    np.save(frames_cache_file, frames)
    
    return frames  


In [6]:
def inference(frames, prompt, max_new_tokens=2048):
    if isinstance(frames, np.ndarray):
        frames = torch.tensor(frames, dtype=torch.float32).to('cuda')

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [
                {"type": "text", "text": prompt},
                {"video": frames},  # Provide frames directly
            ]
        },
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], videos=frames, padding=True, return_tensors="pt").to('cuda')

    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return output_text[0]


In [7]:
#video_path = "videos/rlOywSJnPOU/rlOywSJnPOU_scenes/scene_002.mp4"
video_path = "videos/n9nC8liwZ5Y/n9nC8liwZ5Y_scenes/scene_015.mp4"
prompt = "Describe the scene."

frames = get_video_frames(video_path)

ffprobe version 7.1 Copyright (c) 2007-2024 the FFmpeg developers
  built with gcc 13.3.0 (conda-forge gcc 13.3.0-1)
  configuration: --prefix=/home/do.ng/.conda/envs/CS7980_YouDescribe --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --enable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libharfbuzz --enable-libfontconfig --enable-libopenh264 --enable-libdav1d --disable-gnutls --enable-libmp3lame --enable-libvpx --enable-libass --enable-pthreads --enable-alsa --enable-libpulse --enable-vaapi --enable-libopenvino --enable-gpl --enable-libx264 --en

Number of i-frames: 12
Number of i-frames: 12


In [8]:
from IPython.display import Markdown, display
response = inference(frames, prompt)
display(Markdown(response))

The scene appears to be set in a dimly lit, possibly abandoned or old building. The lighting is low, creating a tense and eerie atmosphere. A young man with short dark hair, wearing a red shirt, is seen in close-up shots. His expression is serious and somewhat concerned as he looks at someone off-camera. The background includes a window with bars and some indistinct objects that suggest an older, possibly neglected environment. The overall mood of the scene is suspenseful and dramatic.

In [17]:
def generate_scene_caption(frames, scene_data=None, previous_caption=None,max_new_tokens=2048, total_pixels=20480 * 28 * 28, min_pixels=16 * 28 * 28):
    # Build context from scene_data if available
    context_parts = []
    
    
    if previous_caption:
        context_parts.append(
            f"Background context (for reference only, do not repeat):\n{previous_caption}\n\n"
            "Focus on new observations, actions, and details from the current scene.\n"
        )
        

    if scene_data and scene_data.get("captions"):
        captions_text = "\n".join([f"- {caption}" for caption in scene_data["captions"]])
        context_parts.append(f"Scene {scene_data['scene_number']} captions:\n{captions_text}\n")

    # Add audio context if available
    if scene_data and scene_data.get("transcripts"):
        audio_text = "\n".join([f"- {transcript_text}" for transcript_text in scene_data["transcripts"]])    
        context_parts.append(
            f"Audio transcript for Scene {scene_data['scene_number']}:\n{audio_text}\n"
        )
        '''
            "When describing the scene, combine what you observe visually with the context provided in the audio transcript. "
            "Use the audio information to enrich your understanding of the events and actions happening in the scene. "
            "Create a natural description that weaves together both visual and audio elements.\n"
        )
        '''
        

    # Build final prompt
    context = "\n".join(context_parts)
    prompt = (
        f"{context}\n\n"
        "Understand the scene by integrating visual and audio context.\n"
        "Describe the scene with rich context using the fewest syllables possible."
    )

    #print(prompt)

    if isinstance(frames, np.ndarray):
        frames = torch.tensor(frames, dtype=torch.float32).to('cuda')

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [
                {"type": "text", "text": prompt},
                {"video": frames},  # Provide frames directly
            ]
        },
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], videos=frames, padding=True, return_tensors="pt").to('cuda')

    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return output_text[0]

In [10]:
import json
def get_scene_data(scene_number, scenes_json_path):
    with open(scenes_json_path, 'r') as f:
        scenes = json.load(f)

    for scene in scenes:
        if scene["scene_number"] == scene_number:
            return scene

    raise ValueError(f"Scene {scene_number} not found in {scenes_json_path}")


In [18]:
scene_1_data = get_scene_data(1, f"videos/n9nC8liwZ5Y/n9nC8liwZ5Y_scenes/scene_info.json")
#scene_2_data = get_scene_data(2,f"videos/rlOywSJnPOU/rlOywSJnPOU_scenes/scene_info.json")

In [19]:
video_frames_path_1 = get_video_frames(scene_1_data['video_path'])

ffprobe version 7.1 Copyright (c) 2007-2024 the FFmpeg developers
  built with gcc 13.3.0 (conda-forge gcc 13.3.0-1)
  configuration: --prefix=/home/do.ng/.conda/envs/CS7980_YouDescribe --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --enable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libharfbuzz --enable-libfontconfig --enable-libopenh264 --enable-libdav1d --disable-gnutls --enable-libmp3lame --enable-libvpx --enable-libass --enable-pthreads --enable-alsa --enable-libpulse --enable-vaapi --enable-libopenvino --enable-gpl --enable-libx264 --en

Number of i-frames: 45
Number of i-frames: 45


In [20]:
scene_1_caption = generate_scene_caption(video_frames_path_1, scene_1_data)

In [22]:
scene_1_caption

'A carjacking occurs at night in a busy city street. A crowd gathers as police arrive. Peter Parker, a young man, rushes to help an injured man, Uncle Ben, who is lying on the ground. The scene captures the urgency and concern of the moment.'

In [23]:
scene_2_data = get_scene_data(2, f"videos/n9nC8liwZ5Y/n9nC8liwZ5Y_scenes/scene_info.json")
video_frames_path_2 = get_video_frames(scene_2_data['video_path'])

ffprobe version 7.1 Copyright (c) 2007-2024 the FFmpeg developers
  built with gcc 13.3.0 (conda-forge gcc 13.3.0-1)
  configuration: --prefix=/home/do.ng/.conda/envs/CS7980_YouDescribe --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --enable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libharfbuzz --enable-libfontconfig --enable-libopenh264 --enable-libdav1d --disable-gnutls --enable-libmp3lame --enable-libvpx --enable-libass --enable-pthreads --enable-alsa --enable-libpulse --enable-vaapi --enable-libopenvino --enable-gpl --enable-libx264 --en

Number of i-frames: 20
Number of i-frames: 20


In [24]:
generate_scene_caption(video_frames_path_2, scene_2_data, previous_caption=scene_1_caption)

'Peter Parker rushes to help an injured Uncle Ben, expressing concern and urgency.'

In [72]:
def process_all_scenes(scene_folder):
    scenes_json_path = os.path.join(scene_folder, "scene_info.json")
    if not os.path.exists(scenes_json_path):
        print(f"Error: scene_info.json not found in {scene_folder}")
        return

    with open(scenes_json_path, "r") as f:
        scene_list = json.load(f)

    print(f"Processing {len(scene_list)} scenes in {scene_folder}...")

    captions = []
    previous_caption = None

    for idx, scene_data in enumerate(scene_list, start=1):
        video_frames_path = get_video_frames(scene_data['video_path'])
        scene_caption = generate_scene_caption(video_frames_path,scene_data,previous_caption=previous_caption)
        print(scene_caption)
        captions.append(scene_caption)
        previous_caption = scene_caption

    # Merge captions
    final_caption_text = "\n".join(
        [f"Scene {i+1}: {desc}" for i, desc in enumerate(captions)]
    )

    # Save captions to a text file
    caption_output_path = os.path.join(scene_folder, "final_captions.txt")
    with open(caption_output_path, "w") as f:
        f.write(final_caption_text)

    print(f"\nCaptioning complete! Final captions saved to: {caption_output_path}")
    return final_caption_text

In [73]:
process_all_scenes("videos/n9nC8liwZ5Y/n9nC8liwZ5Y_scenes")

Processing 8 scenes in videos/n9nC8liwZ5Y/n9nC8liwZ5Y_scenes...
A carjacking occurs at night in a busy city street. A crowd gathers as police arrive. Peter Parker, a young man, rushes to help an injured man, Uncle Ben, who is lying on the ground. The scene captures the urgency and concern of the moment.
Peter Parker rushes to help an injured Uncle Ben, expressing concern and urgency.
Peter Parker cries as he rushes to help an injured Uncle Ben, with three cars pursuing him.
Spider-Man climbs up building, leaps across rooftops, descends into alleyway.


ffprobe version 7.1 Copyright (c) 2007-2024 the FFmpeg developers
  built with gcc 13.3.0 (conda-forge gcc 13.3.0-1)
  configuration: --prefix=/home/do.ng/.conda/envs/CS7980_YouDescribe --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --enable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libharfbuzz --enable-libfontconfig --enable-libopenh264 --enable-libdav1d --disable-gnutls --enable-libmp3lame --enable-libvpx --enable-libass --enable-pthreads --enable-alsa --enable-libpulse --enable-vaapi --enable-libopenvino --enable-gpl --enable-libx264 --en

Number of i-frames: 8
Spider-Man swings through city at night.


ffprobe version 7.1 Copyright (c) 2007-2024 the FFmpeg developers
  built with gcc 13.3.0 (conda-forge gcc 13.3.0-1)
  configuration: --prefix=/home/do.ng/.conda/envs/CS7980_YouDescribe --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --enable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libharfbuzz --enable-libfontconfig --enable-libopenh264 --enable-libdav1d --disable-gnutls --enable-libmp3lame --enable-libvpx --enable-libass --enable-pthreads --enable-alsa --enable-libpulse --enable-vaapi --enable-libopenvino --enable-gpl --enable-libx264 --en

Number of i-frames: 216
Spider-Man swings through the city at night, dodging traffic and navigating urban obstacles.


ffprobe version 7.1 Copyright (c) 2007-2024 the FFmpeg developers
  built with gcc 13.3.0 (conda-forge gcc 13.3.0-1)
  configuration: --prefix=/home/do.ng/.conda/envs/CS7980_YouDescribe --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --enable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libharfbuzz --enable-libfontconfig --enable-libopenh264 --enable-libdav1d --disable-gnutls --enable-libmp3lame --enable-libvpx --enable-libass --enable-pthreads --enable-alsa --enable-libpulse --enable-vaapi --enable-libopenvino --enable-gpl --enable-libx264 --en

Number of i-frames: 10
Police officers search building at night, flashlight beams illuminating dark interior.


ffprobe version 7.1 Copyright (c) 2007-2024 the FFmpeg developers
  built with gcc 13.3.0 (conda-forge gcc 13.3.0-1)
  configuration: --prefix=/home/do.ng/.conda/envs/CS7980_YouDescribe --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1739478357321/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --enable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libharfbuzz --enable-libfontconfig --enable-libopenh264 --enable-libdav1d --disable-gnutls --enable-libmp3lame --enable-libvpx --enable-libass --enable-pthreads --enable-alsa --enable-libpulse --enable-vaapi --enable-libopenvino --enable-gpl --enable-libx264 --en

Number of i-frames: 10
Spider-Man movie poster. Red background. Text advertises digital, Blu-ray, and DVD release. "Subscribe" and "The hottest trailers!" prompts.

Captioning complete! Final captions saved to: videos/n9nC8liwZ5Y/n9nC8liwZ5Y_scenes/final_captions.txt


'Scene 1: A carjacking occurs at night in a busy city street. A crowd gathers as police arrive. Peter Parker, a young man, rushes to help an injured man, Uncle Ben, who is lying on the ground. The scene captures the urgency and concern of the moment.\nScene 2: Peter Parker rushes to help an injured Uncle Ben, expressing concern and urgency.\nScene 3: Peter Parker cries as he rushes to help an injured Uncle Ben, with three cars pursuing him.\nScene 4: Spider-Man climbs up building, leaps across rooftops, descends into alleyway.\nScene 5: Spider-Man swings through city at night.\nScene 6: Spider-Man swings through the city at night, dodging traffic and navigating urban obstacles.\nScene 7: Police officers search building at night, flashlight beams illuminating dark interior.\nScene 8: Spider-Man movie poster. Red background. Text advertises digital, Blu-ray, and DVD release. "Subscribe" and "The hottest trailers!" prompts.'