In [1]:
import torchvision
import transformers
import torch

In [2]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor, GenerationConfig
from qwen_vl_utils import process_vision_info

# Single checkpoint identifier so the weights and the processor cannot drift apart.
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

# dtype and device placement are both left to from_pretrained ("auto").
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [3]:
# Clip handed to ffmpeg as its input (`-i {video_path}`) in the extraction cells.
video_path = "videos/rlOywSJnPOU/rlOywSJnPOU_scenes/scene_002.mp4"
# Directory the extraction cells write into (`{output_path}/keyframes.mp4`, `{output_path}/%05d.jpg`).
output_path = "videos/rlOywSJnPOU/frames"

In [3]:
# Keep only the I-frames of the clip and write them out as a variable-frame-rate video.
#   mkdir -p      : ffmpeg does not create missing output directories.
#   -hide_banner  : drop the build/configuration banner from the cell output.
#   -y            : overwrite a previous result so the cell can be re-run non-interactively.
#   -fps_mode vfr : replacement for the deprecated `-vsync vfr`.
# Paths are quoted so that spaces in either variable do not split the arguments.
!mkdir -p "{output_path}"
!ffmpeg -hide_banner -y -i "{video_path}" -vf "select='eq(pict_type,PICT_TYPE_I)'" -fps_mode vfr "{output_path}/keyframes.mp4"


ffmpeg version 7.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with gcc 13.3.0 (conda-forge gcc 13.3.0-1)
  configuration: --prefix=/home/do.ng/.conda/envs/CS7980_YouDescribe --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1728332263724/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1728332263724/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1728332263724/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1728332263724/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --enable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libharfbuzz --enable-libfontconfig --enable-libopenh264 --enable-libdav1d --disable-gnutls --enable-libmp3lame --enable-libvpx --enable-libass --enable-pthreads --enable-vaapi --enable-libopenvino --enable-gpl --enable-libx264 --enable-libx265 --enable-libaom --en

In [3]:
# Dump every I-frame of the clip as a numbered JPEG (00001.jpg, 00002.jpg, ...).
#   mkdir -p      : ffmpeg does not create missing output directories.
#   -hide_banner  : drop the build/configuration banner from the cell output.
#   -y            : overwrite existing frames so the cell can be re-run non-interactively.
#   -fps_mode vfr : replacement for the deprecated `-vsync vfr`.
#   -q:v 2        : JPEG quality setting, unchanged from the original command.
# Paths are quoted so that spaces in either variable do not split the arguments.
!mkdir -p "{output_path}"
!ffmpeg -hide_banner -y -i "{video_path}" -vf "select='eq(pict_type,PICT_TYPE_I)'" -fps_mode vfr -q:v 2 "{output_path}"/'%05d.jpg'


ffmpeg version 7.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with gcc 13.3.0 (conda-forge gcc 13.3.0-1)
  configuration: --prefix=/home/do.ng/.conda/envs/CS7980_YouDescribe --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1728332263724/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1728332263724/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1728332263724/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1728332263724/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --enable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libharfbuzz --enable-libfontconfig --enable-libopenh264 --enable-libdav1d --disable-gnutls --enable-libmp3lame --enable-libvpx --enable-libass --enable-pthreads --enable-vaapi --enable-libopenvino --enable-gpl --enable-libx264 --enable-libx265 --enable-libaom --en

In [6]:
import os

def get_frame_list(frames_path):
    """Collect the ``.jpg`` entries of a directory as sorted paths.

    Args:
        frames_path: Directory to scan (not searched recursively).

    Returns:
        A list with ``frames_path`` joined to every entry name ending in
        ``.jpg``, in ascending string order.
    """
    jpg_paths = []
    for entry in os.listdir(frames_path):
        if entry.endswith('.jpg'):
            jpg_paths.append(os.path.join(frames_path, entry))
    # Plain string ordering of the joined paths.
    jpg_paths.sort()
    return jpg_paths


In [13]:
import torch.backends.cuda
import numpy as np
import time

def _caption_video_entry(video_entry, prompt, dtype, max_new_tokens=512):
    """Run one greedy generation pass for a single video entry plus a text prompt.

    Uses the notebook-level ``processor`` and ``model``.

    Args:
        video_entry: The ``{"type": "video", ...}`` dict placed in the user message.
        prompt: Instruction text placed after the video in the same message.
        dtype: Dtype passed to ``.to("cuda", dtype=...)`` and to autocast.
        max_new_tokens: Upper bound on generated tokens.

    Returns:
        The list of strings produced by ``processor.batch_decode`` for the
        newly generated tokens only.
    """
    messages = [
        {
            "role": "user",
            "content": [video_entry, {"type": "text", "text": prompt}],
        }
    ]

    with torch.no_grad(), torch.amp.autocast('cuda', dtype=dtype):
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to("cuda", dtype=dtype)

        input_length = inputs.input_ids.shape[1]
        generation_config = GenerationConfig(
            max_new_tokens=max_new_tokens,
            do_sample=False,
            num_beams=1,
        )
        generated_ids = model.generate(**inputs, generation_config=generation_config)

        # generate() returns prompt + continuation; decode the continuation only.
        return processor.batch_decode(
            generated_ids[:, input_length:],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )


def generate_scene_caption(video_path="videos/rlOywSJnPOU/keyframes.mp4",
                           frames_path="videos/rlOywSJnPOU/frames",
                           use_frames=False,
                           batch_size=32,
                           max_frames=32,
                           prompt=None):
    """Caption a scene either from extracted frame images or from a video file.

    Args:
        video_path: Video file used when ``use_frames`` is False.
            NOTE(review): the extraction cell writes ``keyframes.mp4`` under
            ``output_path`` (".../frames"), which differs from this default --
            confirm which location is intended.
        frames_path: Directory of ``.jpg`` frames used when ``use_frames`` is True.
        use_frames: Select the frame-list path (True) or the video-file path (False).
        batch_size: Maximum number of frames sent to the model per generation
            call (frame-list path only).
        max_frames: Frame-list path: number of frames kept by uniform sampling
            when the directory holds more. Video path: forwarded to
            qwen_vl_utils as the ``max_frames`` key of the video entry.
        prompt: Instruction text. ``None`` falls back to "Describe the video.".

    Returns:
        The per-call captions joined with single spaces and stripped; an empty
        string when nothing was generated (e.g. no frames were found).

    Raises:
        ValueError: If ``max_frames`` is < 1, or ``batch_size`` is < 1 on the
            frame-list path.
    """
    if max_frames < 1:
        raise ValueError(f"max_frames must be >= 1, got {max_frames}")
    if prompt is None:
        # A None text entry would otherwise be passed into the chat template.
        prompt = "Describe the video."

    start_time = time.time()
    torch.cuda.empty_cache()
    torch.backends.cuda.enable_mem_efficient_sdp(True)

    dtype = torch.bfloat16
    max_pixels = 320 * 240  # forwarded as the "max_pixels" key of the video entry
    output_list = []

    if use_frames:
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        all_frames = get_frame_list(frames_path)
        total_frames = len(all_frames)
        if total_frames == 0:
            # Without this guard the batch-count computation divides by zero.
            print(f"No frames found in {frames_path}")
            return ""

        # Uniform sampling
        if max_frames < total_frames:
            indices = np.linspace(0, total_frames - 1, max_frames, dtype=int)
            selected_frames = [all_frames[i] for i in indices]
        else:
            selected_frames = all_frames

        print(f"Processing {len(selected_frames)} frames from {total_frames} total frames")

        effective_batch_size = min(batch_size, len(selected_frames))
        num_batches = (len(selected_frames) + effective_batch_size - 1) // effective_batch_size
        print(f"Processing in {num_batches} batches of max size {effective_batch_size}")

        batch_starts = range(0, len(selected_frames), effective_batch_size)
        for batch_number, start_idx in enumerate(batch_starts, start=1):
            batch_frames = selected_frames[start_idx:start_idx + effective_batch_size]
            # Each batch is presented to the model as its own short "video".
            output_list.extend(
                _caption_video_entry(
                    {
                        "type": "video",
                        "video": batch_frames,
                        "max_pixels": max_pixels,
                    },
                    prompt,
                    dtype,
                )
            )
            print(f"Completed batch {batch_number}/{num_batches}")
    else:
        output_list.extend(
            _caption_video_entry(
                {
                    "type": "video",
                    "video": video_path,
                    "max_pixels": max_pixels,
                    # Key must be "max_frames"; the former "max_frame" was silently ignored.
                    "max_frames": max_frames,
                },
                prompt,
                dtype,
            )
        )

    torch.cuda.empty_cache()

    processing_time = time.time() - start_time
    print(f"Total processing time: {processing_time:.2f} seconds")

    return " ".join(output_list).strip()

In [7]:
# Frame-list path: 32 sampled frames, default batch size.
generate_scene_caption(
    use_frames=True,
    max_frames=32,
    prompt="Describe the video.",
)

Processing 32 frames from 99 total frames
Processing in 1 batches of max size 32
Completed batch 1/1
Total processing time: 17.55 seconds


'The video shows a person preparing ingredients for a dish. The sequence begins with a close-up of a hand holding a tomato, which is then sliced into thin rounds on a wooden cutting board. The text "Tomato | 토마토" appears on the screen, indicating the ingredient being used. Next, the person slices an onion into small pieces, with the text "Onion | 양파" displayed. Following this, the person chops celery into small, uniform pieces, with the text "Celery | 셀러리(오이가능)" appearing. Finally, the person places a small glass bowl containing a green pickle on the cutting board. The video focuses on the detailed process of chopping and preparing the ingredients, showcasing the precision and care taken in the preparation.'

In [9]:
# Frame-list path: 56 sampled frames, batch size raised to 56.
generate_scene_caption(
    use_frames=True,
    batch_size=56,
    max_frames=56,
    prompt="Describe the video.",
)

Processing 56 frames from 99 total frames
Processing in 1 batches of max size 56
Completed batch 1/1
Total processing time: 23.23 seconds


'The video showcases a step-by-step process of preparing ingredients for a dish. It begins with a close-up of a hand placing a tomato on a wooden cutting board. The tomato is then sliced into thick rounds, with the text "Tomato | 토마토" appearing on the screen, indicating the ingredient and its Korean name. The hand continues to slice the tomato into smaller, even pieces, with the text "토마토를 1cm 두께로 잘라준다" (Cut the tomato into 1cm thick slices) providing instructions.\n\nNext, the scene transitions to the preparation of an onion. The hand is shown peeling and then finely chopping the onion, with the text "Onion | 양파" and "양파를 다져준다" (Chop the onion) appearing on the screen.\n\nFollowing this, the video shows the chopping of celery. The hand slices the celery into small, uniform pieces, with the text "Celery | 셀러리(오이가능)" and "잘게 다져준다" (Chop finely) displayed.\n\nFinally, the video shows a hand holding a small glass jar filled with cucumber pickles, with the text "Cucumber pickle | 오이피클" app

In [11]:
# Frame-list path: up to 99 frames in batches of 32; per-batch captions are joined with spaces.
generate_scene_caption(
    use_frames=True,
    batch_size=32,
    max_frames=99,
    prompt="Describe the video.",
)

Processing 99 frames from 99 total frames
Processing in 4 batches of max size 32
Completed batch 1/4
Completed batch 2/4
Completed batch 3/4
Completed batch 4/4
Total processing time: 34.21 seconds


'The video begins with a close-up of a wooden cutting board. A hand places a fresh, red tomato on the board. The hand then picks up a knife and begins to slice the tomato into thin, even rounds. The knife glides smoothly through the tomato, creating a pile of neatly sliced pieces. The video captures the process of slicing the tomato, emphasizing the precision and care taken in the task. The background is a simple, checkered cloth, which provides a clean and uncluttered setting for the activity. The video is likely part of a cooking tutorial or a food preparation demonstration. The video shows a person preparing ingredients for a dish. The first frame shows a hand slicing a tomato on a wooden cutting board. The text on the screen indicates that the tomato is being cut into slices for a sandwich. The second frame shows the same hand chopping an onion on the same cutting board. The text on the screen identifies the ingredient as "Onion" and mentions that it is being chopped. The third fra

In [12]:
# Frame-list path: up to 99 frames in batches of 56; per-batch captions are joined with spaces.
generate_scene_caption(
    use_frames=True,
    batch_size=56,
    max_frames=99,
    prompt="Describe the video.",
)

Processing 99 frames from 99 total frames
Processing in 2 batches of max size 56
Completed batch 1/2
Completed batch 2/2
Total processing time: 24.53 seconds


'The video begins with a close-up of a wooden cutting board. A hand places a fresh tomato on the board and proceeds to slice it into thick rounds. The text "Tomato | 토마토" appears on the screen, indicating the ingredient being used. The hand carefully cuts the tomato into even slices, revealing its juicy interior with visible seeds and a vibrant red color. The text "토마토를 1cm 두께로 잘라준다" (Cut the tomato into 1cm thick slices) appears, providing instructions for the viewer.\n\nNext, the hand picks up an onion and begins to chop it into smaller pieces. The text "Onion | 양파" appears, identifying the new ingredient. The hand skillfully slices the onion, creating a pile of finely chopped pieces. The video demonstrates the process of preparing ingredients, likely for a recipe, with clear and concise instructions. The video shows a person preparing ingredients for a dish. The first frame shows a hand using a knife to finely chop a white onion on a wooden cutting board. The next frame transitions 

In [8]:
# Video-file path (default video_path), passing max_frames=32.
generate_scene_caption(
    use_frames=False,
    max_frames=32,
    prompt="Describe the video.",
)

qwen-vl-utils using decord to read video.


Total processing time: 34.58 seconds


'The video showcases a step-by-step process of preparing ingredients for a dish. It begins with a close-up of a hand placing a whole tomato on a wooden cutting board. The tomato is then sliced into thick rounds, with the text "Tomato | 토마토" appearing on the screen. The person proceeds to cut the tomato slices into smaller, even pieces, as indicated by the text "토마토를 1cm 두께로 잘라준다" (Cut the tomato into 1cm thick slices). Next, an onion is shown, and the person finely chops it, with the text "Onion | 양파" and "양파를 다져준다" (Chop the onion) appearing on the screen. Following this, the person slices and finely chops celery, with the text "Celery | 셀러리(오이가능)" and "잘게 다져준다" (Chop finely) displayed. Finally, a jar of cucumber pickle is shown, with the text "Cucumber pickle | 오이피클" appearing on the screen. The video focuses on the detailed preparation of these ingredients, emphasizing the precision and care taken in each step.'

In [10]:
# Video-file path (default video_path), passing max_frames=56.
generate_scene_caption(
    use_frames=False,
    max_frames=56,
    prompt="Describe the video.",
)

Total processing time: 35.48 seconds


'The video showcases a step-by-step process of preparing ingredients for a dish. It begins with a close-up of a hand placing a whole tomato on a wooden cutting board. The tomato is then sliced into thick rounds, with the text "Tomato | 토마토" appearing on the screen. The person proceeds to cut the tomato slices into smaller, even pieces, as indicated by the text "토마토를 1cm 두께로 잘라준다" (Cut the tomato into 1cm thick slices). Next, an onion is shown, and the person finely chops it, with the text "Onion | 양파" and "양파를 다져준다" (Chop the onion) appearing on the screen. Following this, the person slices and finely chops celery, with the text "Celery | 셀러리(오이가능)" and "잘게 다져준다" (Chop finely) displayed. Finally, a jar of cucumber pickle is shown, with the text "Cucumber pickle | 오이피클" appearing on the screen. The video focuses on the detailed preparation of these ingredients, emphasizing the precision and care taken in each step.'

In [14]:
# Terse-prompt variant. Frame-list path: 32 sampled frames, default batch size.
generate_scene_caption(
    use_frames=True,
    max_frames=32,
    prompt="Describe the video with rich context using the fewest syllables possible.",
)

Processing 32 frames from 99 total frames
Processing in 1 batches of max size 32
Completed batch 1/1
Total processing time: 21.42 seconds


'Tomato sliced. Onion chopped. Celery diced. Cucumber pickle added.'

In [15]:
# Terse-prompt variant. Frame-list path: 56 sampled frames, batch size 56.
generate_scene_caption(
    use_frames=True,
    batch_size=56,
    max_frames=56,
    prompt="Describe the video with rich context using the fewest syllables possible.",
)

Processing 56 frames from 99 total frames
Processing in 1 batches of max size 56
Completed batch 1/1
Total processing time: 30.84 seconds


'Cut tomato, onion, celery, cucumber pickle.'

In [16]:
# Terse-prompt variant. Frame-list path: up to 99 frames in batches of 56.
generate_scene_caption(
    use_frames=True,
    batch_size=56,
    max_frames=99,
    prompt="Describe the video with rich context using the fewest syllables possible.",
)

Processing 99 frames from 99 total frames
Processing in 2 batches of max size 56
Completed batch 1/2
Completed batch 2/2
Total processing time: 48.94 seconds


'Tomato sliced. Onion chopped. Cut onion. Chop celery. Open cucumber pickle jar.'

In [17]:
# Terse-prompt variant. Frame-list path: up to 99 frames in batches of 32.
generate_scene_caption(
    use_frames=True,
    batch_size=32,
    max_frames=99,
    prompt="Describe the video with rich context using the fewest syllables possible.",
)

Processing 99 frames from 99 total frames
Processing in 4 batches of max size 32
Completed batch 1/4
Completed batch 2/4
Completed batch 3/4
Completed batch 4/4
Total processing time: 32.62 seconds


'Tomato sliced. Tomatoes sliced. Onion chopped. Celery placed. Cutting celery into small pieces. Hand pours pickled cucumbers onto cutting board.'

In [18]:
# Terse-prompt variant. Video-file path (default video_path), passing max_frames=32.
generate_scene_caption(
    use_frames=False,
    max_frames=32,
    prompt="Describe the video with rich context using the fewest syllables possible.",
)

Total processing time: 34.35 seconds


'Cut tomato, onion, celery, cucumber pickle.'

In [19]:
# Terse-prompt variant. Video-file path (default video_path), passing max_frames=56.
generate_scene_caption(
    use_frames=False,
    max_frames=56,
    prompt="Describe the video with rich context using the fewest syllables possible.",
)

Total processing time: 27.74 seconds


'Cut tomato, onion, celery, cucumber pickle.'