# 1. Installing Qwen2-VL

In [1]:
!pip install qwen-vl-utils



# 2. Video Description - Testing Scene 11

In [5]:
# Path to the scene-11 clip. The cells shown below do not read this variable:
# query_video takes the clip path from scene_data['video_path'] instead.
video_path = "videos/rlOywSJnPOU/rlOywSJnPOU_scenes/scene_011.mp4"

In [1]:
import torchvision
import transformers
import torch

# Show the installed transformers version for reference.
print(transformers.__version__)

  from .autonotebook import tqdm as notebook_tqdm


4.45.0.dev0


In [2]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from transformers.generation import GenerationConfig

# Checkpoint identifier shared by the model, the processor and the
# generation config, kept in one place so the three stay in sync.
MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"

# Load the model on the available device(s); dtype and device placement are
# both left to "auto".
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
)

# Default processor for the same checkpoint.
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Generation config loaded from the same checkpoint and attached to the model.
model.generation_config = GenerationConfig.from_pretrained(MODEL_ID)

Loading checkpoint shards: 100%|██████████| 5/5 [00:05<00:00,  1.13s/it]


In [4]:
import json
def get_scene_data(scene_number, scenes_json_path):
    """Return the first scene record whose "scene_number" equals ``scene_number``.

    Args:
        scene_number: Value compared against each record's "scene_number" key.
        scenes_json_path: Path to a JSON file holding a list of scene records.

    Returns:
        The matching scene record as loaded from the JSON file.

    Raises:
        ValueError: If no record in the file has the requested scene number.
    """
    with open(scenes_json_path, 'r') as handle:
        all_scenes = json.load(handle)

    # Lazily scan the records in file order and stop at the first match.
    match = next(
        (entry for entry in all_scenes if entry["scene_number"] == scene_number),
        None,
    )
    if match is None:
        raise ValueError(f"Scene {scene_number} not found in {scenes_json_path}")
    return match

In [11]:
# Load the scene-11 record from the scene index file. The path is a plain
# string literal: it has nothing to interpolate, so no f-prefix is needed.
scene_11_data = get_scene_data(11, "videos/rlOywSJnPOU/rlOywSJnPOU_scenes/scene_info.json")

In [42]:
def query_video(prompt, scene_data):
    """Ask the loaded model a free-form question about one scene clip.

    The scene's captions are prepended to ``prompt`` and sent together with
    the clip at ``scene_data['video_path']``. Greedy decoding is used, with at
    most 128 new tokens.

    Args:
        prompt: Question or instruction text appended after the captions.
        scene_data: Scene record. Must provide 'scene_number' and
            'video_path'; 'duration' and 'captions' are optional.

    Returns:
        The list produced by ``processor.batch_decode`` for the newly
        generated tokens (one string per batch item; the batch holds a
        single prompt here).
    """
    torch.cuda.empty_cache()

    # A missing or null duration falls back to 5.0, the same default that
    # generate_scene_caption uses, instead of failing on `None * fps`.
    video_length = scene_data.get("duration")
    if video_length is None:
        video_length = 5.0

    fps = 1.0
    # Clamp to [2, 32] like generate_scene_caption does. Without the lower
    # bound, clips shorter than two seconds asked for 0 or 1 frames.
    # NOTE(review): qwen_vl_utils appears to need at least 2 frames - confirm.
    max_frames_value = max(2, min(32, int(video_length * fps)))

    # Captions are optional; an absent key yields an empty bullet list.
    captions_text = "\n".join(
        [f"- {caption}" for caption in scene_data.get("captions", [])]
    )
    scene_context = f"Scene {scene_data['scene_number']} captions:\n{captions_text}"
    enhanced_prompt = f"{scene_context}\n{prompt}"

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": scene_data['video_path'],
                    "max_pixels": 320 * 240,
                    # Same rate that max_frames_value was derived from.
                    "fps": fps,
                    "max_frames": max_frames_value,
                },
                {"type": "text", "text": enhanced_prompt},
            ],
        }
    ]

    print(f"Processing Scene {scene_data['scene_number']}")

    dtype = torch.bfloat16

    with torch.amp.autocast('cuda', dtype=dtype):
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )

        inputs = inputs.to("cuda", dtype=dtype)
        # Prompt length, used below to cut the prompt off the generated ids.
        input_length = inputs.input_ids.shape[1]

        with torch.no_grad():
            generation_config = GenerationConfig(
                max_new_tokens=128,
                do_sample=False,
                num_beams=1
            )

            generated_ids = model.generate(
                **inputs,
                generation_config=generation_config
            )

        # Keep only the tokens produced after the prompt.
        generated_ids_trimmed = generated_ids[:, input_length:]

        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )

    # Drop the tensor references before emptying the CUDA cache.
    del inputs, generated_ids, generated_ids_trimmed
    torch.cuda.empty_cache()

    return output_text

In [31]:
# Ask for a free-form description of scene 11.
query_video(
    prompt="describe the video in detail",
    scene_data=scene_11_data,
)

Processing Scene 11


['The video demonstrates the process of preparing a tuna salad. It starts with opening a can of tuna and draining the oil. The tuna is then combined with finely chopped onion, pickles, celery, mayonnaise, crushed red pepper, and lemon juice. The ingredients are mixed together to create a creamy, flavorful salad. The video provides a step-by-step guide to making a delicious and healthy tuna salad, suitable for a quick lunch or snack.']

In [32]:
# Probe a specific detail of scene 11: which tool opens the can.
query_video(
    prompt="Was a tool used to open the can of tuna? If yes, what was the tool?",
    scene_data=scene_11_data,
)

Processing Scene 11


['Yes, a tool was used to open the can of tuna. The tool used was a can opener.']

In [33]:
# Probe a specific detail of scene 11: which tool moves the tuna to the bowl.
query_video(
    prompt="Was a tool used to transfer the tuna to the bowl? If yes, what was the tool?",
    scene_data=scene_11_data,
)

Processing Scene 11


['Yes, a tool was used to transfer the tuna to the bowl. The tool used was a strainer.']

In [34]:
# Pass the scene-11 record explicitly. The bare name `scene_data` is not
# defined by any cell shown in this notebook, so the original call only worked
# with leftover kernel state and fails with NameError on a fresh kernel.
query_video(
    "List all the items that are added to the bowl and their quantities",
    scene_data=scene_11_data,
)

Processing Scene 11


['The items added to the bowl and their quantities are:\n\n- 150g Tuna\n- 35g Chopped Onion\n- 35g Chopped Pickle\n- 35g Chopped Celery\n- 60g Mayonnaise\n- 2g Crushed Red Pepper\n- 1 tbsp Lemon Juice\n- 0.5 tsp Black Pepper']

# 3. Video Understanding - All Scenes

In [28]:
def generate_scene_caption(scene_data, previous_caption=None):
    """Generate a short caption for one scene clip with the loaded model.

    The prompt is assembled from, in order: the previous scene's caption (when
    given) with an instruction not to repeat it, the scene's own captions as a
    bullet list, and a fixed "describe the video" instruction.

    Args:
        scene_data: Scene record. Must provide 'scene_number' and
            'video_path'; 'duration' (default 5.0) and 'captions'
            (default empty) are optional.
        previous_caption: Caption of the preceding scene, or None/empty to
            omit the "previously described" preamble.

    Returns:
        The first decoded string, or "" when decoding yields an empty list.
    """
    torch.cuda.empty_cache()

    # Frame budget: one frame per second of clip, clamped to the range [2, 32].
    sample_fps = 1.0
    duration_s = scene_data.get("duration", 5.0)
    frame_cap = int(duration_s * sample_fps)
    if frame_cap > 32:
        frame_cap = 32
    if frame_cap < 2:
        frame_cap = 2

    # Optional preamble carrying the previous scene's caption.
    history = ""
    if previous_caption:
        history = (
            f"Previously described:\n{previous_caption}\n\n"
            "Do not repeat the same information.\n"
        )

    bullet_lines = "\n".join(
        f"- {caption}" for caption in scene_data.get("captions", [])
    )
    caption_block = f"Scene {scene_data['scene_number']} captions:\n{bullet_lines}"

    instruction = "Describe the video. Keep it concise and do not repeat information from previous scene."

    full_prompt = "".join([history, caption_block, "\n", instruction, "\n"])

    video_entry = {
        "type": "video",
        "video": scene_data['video_path'],
        "max_pixels": 320 * 240,
        "fps": sample_fps,
        "max_frames": frame_cap,
    }
    text_entry = {"type": "text", "text": full_prompt}
    messages = [{"role": "user", "content": [video_entry, text_entry]}]

    print(f"Processing Scene {scene_data['scene_number']} with max_frames={frame_cap}")

    compute_dtype = torch.bfloat16

    with torch.amp.autocast('cuda', dtype=compute_dtype):
        chat_text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        model_inputs = processor(
            text=[chat_text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )

        model_inputs = model_inputs.to("cuda", dtype=compute_dtype)
        # Prompt length, used below to cut the prompt off the generated ids.
        prompt_len = model_inputs.input_ids.shape[1]

        with torch.no_grad():
            # Greedy decoding, at most 128 new tokens.
            greedy_config = GenerationConfig(
                max_new_tokens=128,
                do_sample=False,
                num_beams=1,
            )
            generated = model.generate(
                **model_inputs,
                generation_config=greedy_config,
            )

        # Keep only the tokens produced after the prompt.
        new_tokens = generated[:, prompt_len:]
        decoded = processor.batch_decode(
            new_tokens,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

    # Drop the tensor references before emptying the CUDA cache.
    del model_inputs, generated, new_tokens
    torch.cuda.empty_cache()

    if not decoded:
        return ""
    return decoded[0]


In [32]:
import os
import json

def process_all_scenes(scene_folder):
    """Caption every scene listed in ``scene_folder/scene_info.json``.

    Scenes are processed in file order. Each call to generate_scene_caption
    receives the caption of the scene before it as ``previous_caption``. The
    merged captions are written to ``scene_folder/final_captions.txt``.

    Args:
        scene_folder: Directory containing scene_info.json; the output file
            is written to the same directory.

    Returns:
        The merged caption text (one "Scene N: ..." line per scene), or None
        when scene_info.json does not exist (an error line is printed).
    """
    scenes_json_path = os.path.join(scene_folder, "scene_info.json")
    if not os.path.exists(scenes_json_path):
        print(f"Error: scene_info.json not found in {scene_folder}")
        return None

    # Read as UTF-8 explicitly instead of relying on the locale default.
    with open(scenes_json_path, "r", encoding="utf-8") as f:
        scene_list = json.load(f)

    print(f"Processing {len(scene_list)} scenes in {scene_folder}...")

    captions = []
    previous_caption = None

    for scene_data in scene_list:
        scene_caption = generate_scene_caption(scene_data, previous_caption=previous_caption)
        captions.append(scene_caption)
        previous_caption = scene_caption

    # Merge captions. Labels are positional (1-based index in scene_list),
    # not taken from each record's 'scene_number'.
    final_caption_text = "\n".join(
        f"Scene {position}: {desc}" for position, desc in enumerate(captions, start=1)
    )

    # Save captions as UTF-8 so non-ASCII characters in the generated text
    # cannot fail on a platform whose default encoding is narrower.
    caption_output_path = os.path.join(scene_folder, "final_captions.txt")
    with open(caption_output_path, "w", encoding="utf-8") as f:
        f.write(final_caption_text)

    print(f"\nCaptioning complete! Final captions saved to: {caption_output_path}")
    return final_caption_text


In [33]:
# Caption every scene in the folder; the merged caption text is the cell output.
process_all_scenes(scene_folder="videos/rlOywSJnPOU/rlOywSJnPOU_scenes")

Processing 32 scenes in videos/rlOywSJnPOU/rlOywSJnPOU_scenes...
Processing Scene 1 with max_frames=6
Processing Scene 2 with max_frames=9
Processing Scene 3 with max_frames=2
Processing Scene 4 with max_frames=5
Processing Scene 5 with max_frames=12
Processing Scene 6 with max_frames=2
Processing Scene 7 with max_frames=9
Processing Scene 8 with max_frames=2
Processing Scene 9 with max_frames=3
Processing Scene 10 with max_frames=6
Processing Scene 11 with max_frames=32
Processing Scene 12 with max_frames=3
Processing Scene 13 with max_frames=2
Processing Scene 14 with max_frames=2
Processing Scene 15 with max_frames=7
Processing Scene 16 with max_frames=6
Processing Scene 17 with max_frames=3
Processing Scene 18 with max_frames=5
Processing Scene 19 with max_frames=4
Processing Scene 20 with max_frames=10
Processing Scene 21 with max_frames=4
Processing Scene 22 with max_frames=7
Processing Scene 23 with max_frames=3
Processing Scene 24 with max_frames=5
Processing Scene 25 with max_

"Scene 1: The video showcases a delicious Tuna Melt sandwich. The sandwich is made with two slices of bread, a layer of melted cheese, and a filling of tuna mixed with tomatoes and other ingredients. The sandwich is placed on a wooden cutting board, and a hand is seen holding one half of the sandwich, revealing the inside layers. The video emphasizes the appealing presentation and the mouth-watering appearance of the sandwich, making it look very appetizing.\nScene 2: The video features a Tuna Melt sandwich with melted cheese, tomatoes, and a tuna mixture. It is placed on a wooden cutting board, and a hand holds one half, showcasing the sandwich's layers. The video highlights the sandwich's appealing appearance.\nScene 3: The video showcases a tomato placed on a wooden cutting board.\nScene 4: In the video, a person is cutting a tomato on a wooden cutting board. The tomato is being sliced into 1cm thick pieces.\nScene 5: In the video, a person is cutting a tomato and an onion on a wood