In [1]:
pip install git+https://github.com/huggingface/transformers accelerate

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-hdfetzgu
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-hdfetzgu
  Resolved https://github.com/huggingface/transformers to commit 62db3e6ed67a74cc1ed1436acd9973915c0a4475
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting tokenizers<0.22,>=0.21 (from transformers==4.49.0.dev0)
  Using cached tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25ldone
[?25h  Created wheel for transformers: filename=transformers-4.49.

In [2]:
pip install qwen-vl-utils[decord]==0.0.8

Collecting decord (from qwen-vl-utils[decord]==0.0.8)
  Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl.metadata (422 bytes)
Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl (13.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m68.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: decord
Successfully installed decord-0.6.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Sanity check: list the GPUs visible to this kernel and their current memory use.
!nvidia-smi

Fri Jan 31 19:58:27 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          On  | 00000000:01:00.0 Off |                    0 |
| N/A   32C    P0              51W / 400W |      4MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM4-40GB          On  | 00000000:81:00.0 Off |  

In [2]:
import torchvision
import transformers
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor, GenerationConfig
from qwen_vl_utils import process_vision_info

# Load the vision-language model and its processor from the same checkpoint.
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

Loading checkpoint shards: 100%|██████████| 5/5 [00:17<00:00,  3.41s/it]


In [16]:
def _scene_frame_budget(duration, fps, max_frames=32, min_frames=2):
    """Return the `max_frames` value to request for a clip of `duration` seconds.

    The budget is one frame per sampled second (`duration * fps`), clamped to
    `[min_frames, max_frames]`. A missing duration falls back to `max_frames`.
    """
    if duration is None:
        # No duration recorded for this scene: request the cap instead of
        # crashing on `None * fps`.
        return max_frames
    # Very short clips would otherwise yield a budget of 0 or 1 frames.
    # NOTE(review): qwen_vl_utils appears to round frame counts down to a
    # multiple of 2 and reject anything below 2 -- confirm against the
    # installed version before changing `min_frames`.
    return max(min_frames, min(max_frames, int(duration * fps)))


def _build_scene_prompt(scene_data, previous_caption):
    """Assemble the text prompt: previous caption, scene captions, then the instruction."""
    previous_context = ""
    if previous_caption:
        previous_context = (
            f"Previously described:\n{previous_caption}\n\n"
            "Do not repeat the same information.\n"
        )

    scene_context = ""
    if scene_data.get("captions"):
        captions_text = "\n".join(f"- {caption}" for caption in scene_data["captions"])
        scene_context = f"Scene {scene_data['scene_number']} captions:\n{captions_text}\n"

    base_prompt = "Describe the video. Keep it concise and do not repeat information from previous scene."

    return f"{previous_context}{scene_context}\n{base_prompt}\n"


def generate_scene_caption(scene_data, previous_caption=None, fps=1.0,
                           max_frames=32, max_new_tokens=128):
    """Generate a short description of one video scene with the loaded model.

    Args:
        scene_data: Mapping describing the scene. Reads ``"video_path"`` and
            ``"scene_number"`` (required), ``"duration"`` in the same time unit
            that ``fps`` is expressed against (optional), and ``"captions"``,
            an iterable of caption strings added to the prompt (optional).
        previous_caption: Caption of the preceding scene; when truthy it is
            quoted in the prompt with an instruction not to repeat it.
        fps: Frame sampling rate passed to the video loader.
        max_frames: Upper bound on the number of frames requested.
        max_new_tokens: Upper bound on generated tokens.

    Returns:
        The decoded caption as a string, truncated at the first occurrence of
        ``"addCriterion"`` and stripped of surrounding whitespace. Empty if the
        model produced no text.
    """
    torch.cuda.empty_cache()

    max_frames_value = _scene_frame_budget(scene_data.get("duration"), fps, max_frames)
    enhanced_prompt = _build_scene_prompt(scene_data, previous_caption)

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": scene_data['video_path'],
                    "max_pixels": 320 * 240,
                    "fps": fps,
                    "max_frames": max_frames_value,
                },
                {"type": "text", "text": enhanced_prompt},
            ],
        }
    ]

    print(f"Processing Scene {scene_data['scene_number']} with max_frames={max_frames_value}")

    dtype = torch.bfloat16

    try:
        with torch.amp.autocast('cuda', dtype=dtype):
            text = processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )

            inputs = inputs.to("cuda", dtype=dtype)
            input_length = inputs.input_ids.shape[1]

            with torch.no_grad():
                generation_config = GenerationConfig(
                    max_new_tokens=max_new_tokens,
                    do_sample=False,
                    num_beams=1
                )

                generated_ids = model.generate(
                    **inputs,
                    generation_config=generation_config
                )

            # Keep only the newly generated tokens (drop the echoed prompt).
            generated_ids_trimmed = generated_ids[:, input_length:]
            output_list = processor.batch_decode(
                generated_ids_trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False
            )
    finally:
        # Drop tensor references even when generation fails, so that
        # empty_cache() can hand their blocks back before the next scene.
        inputs = generated_ids = generated_ids_trimmed = None
        torch.cuda.empty_cache()

    # Cut the text at the literal "addCriterion" marker.
    # NOTE(review): presumably a spurious token the model sometimes emits -- confirm.
    return ''.join(output_list).split("addCriterion")[0].strip()


In [17]:
import os
import json

def process_all_scenes(scene_folder):
    """Caption every scene listed in ``<scene_folder>/scene_info.json``.

    Scenes are captioned in file order; each call to `generate_scene_caption`
    receives the caption of the scene before it as context. The merged
    captions are written to ``<scene_folder>/final_captions.txt``.

    Args:
        scene_folder: Directory containing ``scene_info.json``, a JSON list
            with one entry per scene.

    Returns:
        The merged caption text (one ``"Scene N: ..."`` line per scene, with N
        counting from 1 in list order), or ``None`` if ``scene_info.json`` is
        missing.
    """
    scenes_json_path = os.path.join(scene_folder, "scene_info.json")
    if not os.path.exists(scenes_json_path):
        print(f"Error: scene_info.json not found in {scene_folder}")
        return None

    # Explicit UTF-8: the platform default encoding is locale-dependent and
    # may not be able to represent every character in the captions.
    with open(scenes_json_path, "r", encoding="utf-8") as f:
        scene_list = json.load(f)

    print(f"Processing {len(scene_list)} scenes in {scene_folder}...")

    captions = []
    previous_caption = None

    for scene_data in scene_list:
        previous_caption = generate_scene_caption(scene_data, previous_caption=previous_caption)
        captions.append(previous_caption)

    # Merge captions
    final_caption_text = "\n".join(
        f"Scene {i}: {desc}" for i, desc in enumerate(captions, start=1)
    )

    # Save captions to a text file
    caption_output_path = os.path.join(scene_folder, "final_captions.txt")
    with open(caption_output_path, "w", encoding="utf-8") as f:
        f.write(final_caption_text)

    print(f"\nCaptioning complete! Final captions saved to: {caption_output_path}")
    return final_caption_text


In [18]:
# Caption all scenes of one video; the merged caption text returned by the call
# is shown as this cell's output.
process_all_scenes("videos/rlOywSJnPOU/rlOywSJnPOU_scenes")

Processing 14 scenes in videos/rlOywSJnPOU/rlOywSJnPOU_scenes...
Processing Scene 1 with max_frames=15
Processing Scene 2 with max_frames=32
Processing Scene 3 with max_frames=6
Processing Scene 4 with max_frames=32
Processing Scene 5 with max_frames=6
Processing Scene 6 with max_frames=9
Processing Scene 7 with max_frames=6
Processing Scene 8 with max_frames=13
Processing Scene 9 with max_frames=10
Processing Scene 10 with max_frames=14
Processing Scene 11 with max_frames=5
Processing Scene 12 with max_frames=6
Processing Scene 13 with max_frames=22
Processing Scene 14 with max_frames=32

Captioning complete! Final captions saved to: videos/rlOywSJnPOU/rlOywSJnPOU_scenes/final_captions.txt


"Scene 1: The video showcases a Tuna Melt Sandwich, a classic grilled sandwich with melted cheese, tuna, and vegetables. The sandwich is cut in half and stacked on a wooden cutting board. A hand picks up one half, revealing the gooey, melted cheese and the layers of tuna and vegetables inside. The sandwich is then placed back on the board, emphasizing its appetizing appearance.\nScene 2: The video demonstrates the preparation of ingredients for a sandwich. A hand is shown slicing a tomato into 1cm thick slices on a wooden cutting board. Next, the onion is finely chopped. Celery is also chopped finely, with an option to use cucumber instead. Finally, a jar of cucumber pickles is opened, showcasing the ingredients that will be used in the sandwich.\nScene 3: The video continues the preparation of sandwich ingredients. A hand is shown chopping a green vegetable, likely celery, into small pieces on a wooden cutting board. The caption suggests the person is continuing to chop despite feelin