### Video Understanding with Qwen2.5-VL

In this notebook, we delve into the capabilities of the **Qwen2.5-VL** model for video understanding tasks. Our objective is to showcase how this advanced model can be applied to various video analysis scenarios, ranging from basic OCR to complex event detection and summarization.


#### \[Setup\]

We start by loading the pre-trained `Qwen2_5_VLForConditionalGeneration` model. This model has been fine-tuned on a diverse set of video understanding tasks, enabling it to generate detailed and accurate descriptions based on visual inputs.


In [None]:
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

model_path = "Qwen/Qwen2.5-VL-7B-Instruct"

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_path)

In [None]:
!pip install eva-decord

Load video frames and timestamps

In [1]:
# import os
# import math
# import hashlib
# import requests

# from IPython.display import Markdown, display
# import numpy as np
# from PIL import Image
# import decord
# from decord import VideoReader, cpu


# def download_video(url, dest_path):
#     response = requests.get(url, stream=True)
#     with open(dest_path, 'wb') as f:
#         for chunk in response.iter_content(chunk_size=8096):
#             f.write(chunk)
#     print(f"Video downloaded to {dest_path}")


# def get_video_frames(video_path, num_frames=128, cache_dir='.cache'):
#     os.makedirs(cache_dir, exist_ok=True)

#     video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
#     if video_path.startswith('http://') or video_path.startswith('https://'):
#         video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
#         if not os.path.exists(video_file_path):
#             download_video(video_path, video_file_path)
#     else:
#         video_file_path = video_path

#     frames_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_frames.npy')
#     timestamps_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_timestamps.npy')

#     if os.path.exists(frames_cache_file) and os.path.exists(timestamps_cache_file):
#         frames = np.load(frames_cache_file)
#         timestamps = np.load(timestamps_cache_file)
#         return video_file_path, frames, timestamps

#     vr = VideoReader(video_file_path, ctx=cpu(0))
#     total_frames = len(vr)

#     indices = np.linspace(0, total_frames - 1, num=num_frames, dtype=int)
#     frames = vr.get_batch(indices).asnumpy()
#     timestamps = np.array([vr.get_frame_timestamp(idx) for idx in indices])

#     np.save(frames_cache_file, frames)
#     np.save(timestamps_cache_file, timestamps)
    
#     return video_file_path, frames, timestamps


# def create_image_grid(images, num_columns=8):
#     pil_images = [Image.fromarray(image) for image in images]
#     num_rows = math.ceil(len(images) / num_columns)

#     img_width, img_height = pil_images[0].size
#     grid_width = num_columns * img_width
#     grid_height = num_rows * img_height
#     grid_image = Image.new('RGB', (grid_width, grid_height))

#     for idx, image in enumerate(pil_images):
#         row_idx = idx // num_columns
#         col_idx = idx % num_columns
#         position = (col_idx * img_width, row_idx * img_height)
#         grid_image.paste(image, position)

#     return grid_image

import os
import glob
import hashlib
import re
import numpy as np
from PIL import Image

def get_video_frames_nuscenes(scene_prefix, frames_dir, camera='CAM_FRONT', num_frames=128, cache_dir='.cache', fps=12):
    """
    Load video frames from nuScenes sequential images.
    
    Args:
        scene_prefix (str): Scene identifier prefix (e.g., 'n015-2018-11-21-19-38-26+0800')
        frames_dir (str): Directory containing the image frames
        camera (str): Camera name (default: 'CAM_FRONT')
        num_frames (int): Number of frames to sample (default: 128)
        cache_dir (str): Directory for caching processed frames (default: '.cache')
        fps (float): Frame rate for timestamp calculation (default: 12)
    
    Returns:
        tuple: (scene_path, frames, timestamps)
            - scene_path: Path to the frames directory
            - frames: numpy array of shape (num_frames, height, width, channels)
            - timestamps: numpy array of timestamps for each frame
    """
    os.makedirs(cache_dir, exist_ok=True)
    
    # Create a unique hash for this scene and camera combination
    scene_hash = hashlib.md5(f"{scene_prefix}_{camera}_{frames_dir}".encode('utf-8')).hexdigest()
    
    frames_cache_file = os.path.join(cache_dir, f'{scene_hash}_{num_frames}_frames.npy')
    timestamps_cache_file = os.path.join(cache_dir, f'{scene_hash}_{num_frames}_timestamps.npy')
    
    # Check if cached files exist
    if os.path.exists(frames_cache_file) and os.path.exists(timestamps_cache_file):
        frames = np.load(frames_cache_file)
        timestamps = np.load(timestamps_cache_file)
        return frames_dir, frames, timestamps
    
    # Find all image files matching the scene prefix and camera
    pattern = os.path.join(frames_dir, f"{scene_prefix}__{camera}__*.jpg")
    image_files = glob.glob(pattern)
    
    if not image_files:
        raise ValueError(f"No images found for scene '{scene_prefix}' and camera '{camera}' in directory '{frames_dir}'")
    
    # Extract timestamps from filenames and sort
    file_timestamp_pairs = []
    for img_path in image_files:
        filename = os.path.basename(img_path)
        # Extract timestamp from filename pattern: prefix__camera__timestamp.jpg
        match = re.match(rf"{re.escape(scene_prefix)}__{re.escape(camera)}__(\d+)\.jpg", filename)
        if match:
            timestamp = int(match.group(1))
            file_timestamp_pairs.append((img_path, timestamp))
    
    if not file_timestamp_pairs:
        raise ValueError(f"No valid timestamps found in filenames for scene '{scene_prefix}'")
    
    # Sort by timestamp
    file_timestamp_pairs.sort(key=lambda x: x[1])
    
    total_frames = len(file_timestamp_pairs)
    
    # Sample frames at evenly spaced intervals
    if num_frames >= total_frames:
        # If we want more frames than available, use all frames
        indices = list(range(total_frames))
    else:
        # Sample evenly across the sequence
        indices = np.linspace(0, total_frames - 1, num=num_frames, dtype=int)
    
    # Load the selected frames
    frames = []
    timestamps = []
    
    for idx in indices:
        img_path, timestamp_microseconds = file_timestamp_pairs[idx]
        
        # Load and convert image
        img = Image.open(img_path)
        img_array = np.array(img)
        frames.append(img_array)
        
        # Convert timestamp from microseconds to seconds (as tuples like the original function)
        timestamp_seconds = timestamp_microseconds / 1_000_000.0
        timestamps.append((timestamp_seconds, timestamp_seconds))  # (start, end) format like original
    
    frames = np.array(frames)
    timestamps = np.array(timestamps)
    
    # Cache the results
    np.save(frames_cache_file, frames)
    np.save(timestamps_cache_file, timestamps)
    
    return frames_dir, frames, timestamps

Inference function

In [6]:
def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 * 28, min_pixels=16 * 28 * 28):
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [
                {"type": "text", "text": prompt},
                {"video": video_path, "total_pixels": total_pixels, "min_pixels": min_pixels},
            ]
        },
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True)
    fps_inputs = video_kwargs['fps']
    print("video input:", video_inputs[0].shape)
    num_frames, _, resized_height, resized_width = video_inputs[0].shape
    print("num of video tokens:", int(num_frames / 2 * resized_height / 28 * resized_width / 28))
    inputs = processor(text=[text], images=image_inputs, videos=video_inputs, fps=fps_inputs, padding=True, return_tensors="pt")
    inputs = inputs.to('cuda')

    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return output_text[0]

Inference function with API using OpenAI SDK.

**Important Notice:**
- Please be aware that the current API supports video processing up to a maximum length of 10 minutes.
- Currently, the model inference interface does not support configuring the resolution of video frames. Therefore, it is recommended to resize videos with higher resolutions and longer durations to a smaller resolution to ensure that the input sequence is not excessively long. We advise keeping the number of video tokens under 24k to achieve better video grounding results.

In [30]:
import os
from openai import OpenAI
from IPython.display import Markdown, display


def inference_with_api(
    video_path,
    prompt,
    sys_prompt = "You are a helpful assistant.",
    model_id = "qwen/qwen-vl-max",
    # model_id="qwen/qwen2.5-vl-72b-instruct:free",
):
    client = OpenAI(
        api_key=os.getenv('OPENROUTER_API_KEY'),
        base_url="https://openrouter.ai/api/v1",
    )    
    messages = [
        {
            "role": "system",
            "content": [{"type":"text","text": sys_prompt}]
        },
        {
            "role": "user",
            "content": [
                {"type": "video_url", "video_url": {"url": video_path}},
                {"type": "text", "text": prompt},
            ]
        }
    ]
    completion = client.chat.completions.create(
        model = model_id,
        messages = messages,
    )
    print("video url:", video_path)
    print(completion)
    return completion.choices[0].message.content

#### 1. Reading Text in Videos

In this section, we demonstrate how the model can be used to recognize and summarize text within a video. Specifically, we'll use a video containing various products and ask the model to summarize their characteristics in a structured format.


In [8]:
import cv2
import os
import glob

def convert_nuscenes_to_mp4(scene_prefix, frames_dir, output_path, camera='CAM_FRONT', fps=12):
    """
    Convert nuScenes frame sequence to MP4 video
    """
    # Get all frames for the scene
    pattern = os.path.join(frames_dir, f"{scene_prefix}__{camera}__*.jpg")
    image_files = sorted(glob.glob(pattern))
    
    if not image_files:
        raise ValueError(f"No images found for scene '{scene_prefix}'")
    
    # Read first image to get dimensions
    first_img = cv2.imread(image_files[0])
    height, width, layers = first_img.shape
    
    # Define codec and create VideoWriter
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video_writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
    
    # Write frames to video
    for img_path in image_files:
        frame = cv2.imread(img_path)
        video_writer.write(frame)
    
    video_writer.release()
    return output_path

# Usage
scene_prefix = "n008-2018-08-01-15-16-36-0400"
frames_dir = "/Users/lilyzhang/Desktop/Qwen2.5-VL/v1.0-mini/sweeps/CAM_FRONT"
output_video = f"/Users/lilyzhang/Desktop/Qwen2.5-VL/v1.0-mini/videos/{scene_prefix}.mp4"

# Convert to MP4
convert_nuscenes_to_mp4(scene_prefix, frames_dir, output_video, fps=12)


'/Users/lilyzhang/Desktop/Qwen2.5-VL/v1.0-mini/videos/n008-2018-08-01-15-16-36-0400.mp4'

In [32]:
def upload_to_github_and_get_url():
    """
    Helper function to generate GitHub raw URL
    You'll need to manually upload the file to GitHub first
    """
    # Format: https://raw.githubusercontent.com/user/repo/branch/path
    # Example implementation - you'd need to replace with your actual details
    
    username = "lilyzhng"
    repo_name = "Qwen2.5-VL" 
    branch = "benchmark-hard-examples"
    file_path = "v1.0-mini/videos/n008-2018-08-01-15-16-36-0400.mp4"  # path within repo
    
    raw_url = f"https://raw.githubusercontent.com/{username}/{repo_name}/{branch}/{file_path}"
    return raw_url

# Usage
raw_url = upload_to_github_and_get_url()

# video_url = "https://ofasys-multimodal-wlcb-3.oss-cn-wulanchabu.aliyuncs.com/sibo.ssb/datasets/cookbook/ead2e3f0e7f836c9ec51236befdaf2d843ac13a6.mp4"

# video_url = "https://github.com/lilyzhng/Qwen2.5-VL/blob/ba8a0f136c56310cdd2ed2fad9bef7c37a48fcba/v1.0-mini/videos/n008-2018-08-01-15-16-36-0400.mp4"

video_url = "https://raw.githubusercontent.com/lilyzhng/Qwen2.5-VL/ba8a0f136c56310cdd2ed2fad9bef7c37a48fcba/v1.0-mini/videos/n008-2018-08-01-15-16-36-0400.mp4"
# github_url = "https://raw.githubusercontent.com/yourusername/nuscenes-videos/main/n008-2018-08-01-15-16-36-0400.mp4"
# prompt = "Localize a series of activity events in the video, output the start and end timestamp for each event, and describe each event with sentences. Provide the result in json format with ‘mm:ss.ff’ format for time depiction."

# prompt = "Localize different video driving behaviors in the 16 seconds video, output the start and end timestamp for each event, and describe each event with sentences. Provide the result in json format with ‘mm:ss.ff’ format for time depiction."
prompt = "Could you go into detail about the content this video?"
response = inference_with_api(video_path = video_url, prompt=prompt)



video url: https://raw.githubusercontent.com/lilyzhng/Qwen2.5-VL/ba8a0f136c56310cdd2ed2fad9bef7c37a48fcba/v1.0-mini/videos/n008-2018-08-01-15-16-36-0400.mp4
ChatCompletion(id='gen-1751263895-gKm2pbR6ANVo1LCNAdLH', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="I'm sorry, but I'm not able to provide information about a specific video without more context. Could you please provide me with the title of the video, the name of the creator or channel, or any other relevant details? This will help me better understand which video you are referring to and provide you with more accurate information.", refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None, reasoning=None), native_finish_reason='stop')], created=1751263895, model='qwen/qwen-vl-max', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=66, prompt_tokens=28, total_tokens=94, comp

In [35]:
import os
from openai import OpenAI

def debug_video_api_call(video_url, prompt):
    """Enhanced debugging for video API calls"""
    
    client = OpenAI(
        api_key=os.getenv('OPENROUTER_API_KEY'),
        base_url="https://openrouter.ai/api/v1",
    )
    
    # Try different message formats
    formats_to_try = [
        # Format 1: video_url type
        {
            "role": "user",
            "content": [
                {"type": "video_url", "video_url": {"url": video_url}},
                {"type": "text", "text": prompt},
            ]
        },
        # Format 2: video type (might work better)
        {
            "role": "user", 
            "content": [
                {"type": "video", "video": video_url},
                {"type": "text", "text": prompt},
            ]
        }
    ]
    
    for i, user_message in enumerate(formats_to_try):
        print(f"\n--- Trying Format {i+1} ---")
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            user_message
        ]
        
        try:
            completion = client.chat.completions.create(
                model="qwen/qwen-vl-max",
                messages=messages,
            )
            
            response = completion.choices[0].message.content
            print(f"Response: {response[:200]}...")
            
            # Check if response seems to actually describe video content
            if any(word in response.lower() for word in ['video', 'scene', 'frame', 'motion', 'visual', 'driving', 'car', 'road']):
                print("✅ This format seems to work!")
                return response
            else:
                print("❌ Generic response - format might not work")
                
        except Exception as e:
            print(f"❌ Error with format {i+1}: {e}")
    
    return None

# Test with your working URL
video_url = "https://raw.githubusercontent.com/lilyzhng/Qwen2.5-VL/ba8a0f136c56310cdd2ed2fad9bef7c37a48fcba/v1.0-mini/videos/n008-2018-08-01-15-16-36-0400.mp4"
prompt = "Analyze this nuScenes driving video. Describe the road scene, vehicles, pedestrians, traffic signs, and any notable events you observe. Be specific about what you see."

result = debug_video_api_call(video_url, prompt)


--- Trying Format 1 ---
Response: To provide a detailed analysis of a nuScenes driving video, I would typically need to view the specific video in question. However, since the video itself isn't provided, I can offer a general analysi...
✅ This format seems to work!


In [37]:
client = OpenAI(
    api_key=os.getenv('OPENROUTER_API_KEY'),
    base_url="https://openrouter.ai/api/v1",
)

# Try with a known working video URL first
test_video_urls = [
    "http://ofasys-multimodal-wlcb-3.oss-cn-wulanchabu.aliyuncs.com/sibo.ssb/cookbook/video_ocr.mp4",  # Small test video
     "https://ofasys-multimodal-wlcb-3.oss-cn-wulanchabu.aliyuncs.com/sibo.ssb/datasets/cookbook/ead2e3f0e7f836c9ec51236befdaf2d843ac13a6.mp4", #test
    video_url  # Your GitHub URL
]

for i, test_url in enumerate(test_video_urls):
    print(f"\n--- Testing Video {i+1}: {test_url[:50]}... ---")
    
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                {"type": "video_url", "video_url": {"url": test_url}},
                {"type": "text", "text": "Describe what you see in this video."},
            ]
        }
    ]
    
    try:
        completion = client.chat.completions.create(
            model="qwen/qwen-vl-max",
            messages=messages,
        )
        response = completion.choices[0].message.content
        
        if "video itself isn't provided" in response or "I would typically need" in response:
            print("❌ API can't access this video")
        else:
            print("✅ API successfully processed video!")
            print(f"Response: {response[:200]}...")
            
    except Exception as e:
        print(f"❌ Error: {e}")


--- Testing Video 1: http://ofasys-multimodal-wlcb-3.oss-cn-wulanchabu.... ---
✅ API successfully processed video!
Response: I'm sorry, but as an AI language model, I don't have the ability to see or access videos. Can you please provide me with more information or a description of the video you are referring to? I'll do my...

--- Testing Video 2: https://ofasys-multimodal-wlcb-3.oss-cn-wulanchabu... ---
✅ API successfully processed video!
Response: I'm sorry, but as an AI language model, I don't have the ability to see or access videos. Can you please provide me with more information or a description of the video you are referring to? I'll do my...

--- Testing Video 3: https://raw.githubusercontent.com/lilyzhng/Qwen2.5... ---
✅ API successfully processed video!
Response: I'm sorry, but as an AI language model, I don't have the ability to see or access videos. Can you please provide me with more information or a description of the video you are referring to? I'll do my...


In [None]:
video_url = "https://duguang-labelling.oss-cn-shanghai.aliyuncs.com/qiansun/video_ocr/videos/50221078283.mp4"
prompt = "请用表格总结一下视频中的商品特点"

## Use a local HuggingFace model to inference.
video_path, frames, timestamps = get_video_frames(video_url, num_frames=64)
# image_grid = create_image_grid(frames, num_columns=8)
# display(image_grid.resize((640, 640)))

response = inference(video_path, prompt)
display(Markdown(response))

In [38]:
## Use API for inference. Apply API key here: https://bailian.console.alibabacloud.com/?apiKey=1
# os.environ['DASHSCOPE_API_KEY'] = 'your_api_key_here' 

video_url = "http://ofasys-multimodal-wlcb-3.oss-cn-wulanchabu.aliyuncs.com/sibo.ssb/cookbook/video_ocr.mp4"
prompt = "Watch the video and list the paper titles in a table, add one extra column for translating the paper titles to Chinese."
os.environ['OPENROUTER_API_KEY'] ='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'
response = inference_with_api(video_url, prompt)
display(Markdown(response))

video url: http://ofasys-multimodal-wlcb-3.oss-cn-wulanchabu.aliyuncs.com/sibo.ssb/cookbook/video_ocr.mp4
ChatCompletion(id='gen-1751264312-3ZDVXOXS3vNCcWjGT5By', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Since you haven\'t provided a specific video for me to watch, I\'ll create a hypothetical example based on common research paper titles in the field of artificial intelligence. Here\'s a table listing the paper titles along with their Chinese translations:\n\n| Original Paper Title                                      | Chinese Translation                                |\n|-----------------------------------------------------------|----------------------------------------------------|\n| "Attention is All You Need"                               | "注意力是你所需要的全部"                           |\n| "Deep Residual Learning for Image Recognition"            | "用于图像识别的深度残差学习"                       |\n| "Generative Adversarial Networks" 

Since you haven't provided a specific video for me to watch, I'll create a hypothetical example based on common research paper titles in the field of artificial intelligence. Here's a table listing the paper titles along with their Chinese translations:

| Original Paper Title                                      | Chinese Translation                                |
|-----------------------------------------------------------|----------------------------------------------------|
| "Attention is All You Need"                               | "注意力是你所需要的全部"                           |
| "Deep Residual Learning for Image Recognition"            | "用于图像识别的深度残差学习"                       |
| "Generative Adversarial Networks"                         | "生成对抗网络"                                     |
| "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding" | "BERT：用于语言理解的深度双向变压器预训练"         |
| "Adam: A Method for Stochastic Optimization"              | "Adam：一种随机优化方法"                           |
| "ImageNet Classification with Deep Convolutional Neural Networks" | "使用深度卷积神经网络进行ImageNet分类"             |
| "Long Short-Term Memory"                                  | "长短期记忆网络"                                   |
| "Playing Atari with Deep Reinforcement Learning"          | "使用深度强化学习玩Atari游戏"                      |
| "Auto-Encoding Variational Bayes"                         | "自动编码变分贝叶斯"                               |
| "Very Deep Convolutional Networks for Large-Scale Image Recognition" | "用于大规模图像识别的非常深的卷积网络"             |

### Explanation:
- **Original Paper Title**: This column lists the titles of the research papers in English.
- **Chinese Translation**: This column provides the Chinese translation of the corresponding paper titles.

If you have a specific video in mind, please share it, and I can create a table based on the actual content of the video.

#### 2. Long Video Understanding

Next, we explore the model's capability to comprehend extremely long videos, such as those lasting up to one hour. This demonstrates how the model can effectively process and analyze extended video content, extracting meaningful insights over longer durations.

To reduce the number of visual tokens generated from a long video, you can specify the `resized_height` and `resized_width` parameters. These settings allow the video frames to be resized to a smaller dimension, effectively decreasing the computational load while maintaining the essential visual information needed for analysis.


In [None]:
video_url = "https://ofasys-multimodal-wlcb-3.oss-cn-wulanchabu.aliyuncs.com/sibo.ssb/evaluations/data/LVBench/videos/GcRKREorGSc.mp4"
prompt = "Could you go into detail about the content of this long video?"

video_path, frames, timestamps = get_video_frames(video_url, num_frames=64)
# image_grid = create_image_grid(frames, num_columns=8)
# display(image_grid.resize((640, 640)))

response = inference(video_path, prompt)
display(Markdown(response))


#### 3. Video Grounding

This part focuses on answering specific questions about a video segment. We specify a textual query and ask the model what is the period that the described content occur in the video, showcasing the model's ability to understand timestamps and search the detailed queries.

In [None]:
video_url = "https://ofasys-multimodal-wlcb-3.oss-cn-wulanchabu.aliyuncs.com/sibo.ssb/datasets/cookbook/ead2e3f0e7f836c9ec51236befdaf2d843ac13a6.mp4"
prompt = "Give the query: 'seasoning the steak', when does the described content occur in the video?"

video_path, frames, timestamps = get_video_frames(video_url, num_frames=64)
image_grid = create_image_grid(frames, num_columns=8)
display(image_grid.resize((640, 640)))

# inference
response = inference(video_path, prompt)
display(Markdown(response))

In [None]:
## Use API for inference. Apply API key here: https://bailian.console.alibabacloud.com/?apiKey=1
# os.environ['DASHSCOPE_API_KEY'] = 'your_api_key_here' 

video_url = "http://ofasys-multimodal-wlcb-3.oss-cn-wulanchabu.aliyuncs.com/sibo.ssb/cookbook/video_structured_caption_480p.mov"
prompt = "Give the query: 'The seasoned meat is placed on a grill', when does the described content occur in the video? Use ‘mm:ss.ff’ as time format."

response = inference_with_api(video_url, prompt)
display(Markdown(response))

#### 4. Structured Video Captioning

Finally, we present a scenario where the model identifies significant events within the video, providing start and end timestamps for each event along with descriptive sentences. The output is formatted in JSON for easy parsing and further processing.


In [20]:
video_url = "https://ofasys-multimodal-wlcb-3.oss-cn-wulanchabu.aliyuncs.com/sibo.ssb/datasets/cookbook/ead2e3f0e7f836c9ec51236befdaf2d843ac13a6.mp4"
prompt = "Localize a series of activity events in the video, output the start and end timestamp for each event, and describe each event with sentences. Provide the result in json format with 'mm:ss.ff' format for time depiction."

video_path, frames, timestamps = get_video_frames(video_url, num_frames=64)
# image_grid = create_image_grid(frames, num_columns=8)
# display(image_grid.resize((640, 640)))

In [None]:
# inference
response = inference(video_path, prompt)
display(Markdown(response))

- By post-processing the json results, we can intuitively present video clips and descriptions in an interleaved manner.

In [22]:
import json
import markdown
from bs4 import BeautifulSoup
from datetime import datetime


def parse_json(response):
    html = markdown.markdown(response, extensions=['fenced_code'])
    soup = BeautifulSoup(html, 'html.parser')
    json_text = soup.find('code').text

    data = json.loads(json_text)
    return data


def time_to_seconds(time_str):
    time_obj = datetime.strptime(time_str, '%M:%S.%f')
    total_seconds = time_obj.minute * 60 + time_obj.second + time_obj.microsecond / 1_000_000
    return total_seconds


In [None]:
data = parse_json(response)

for item in data:
    start_time = item["start_time"]
    end_time = item["end_time"]
    description = item["description"]

    display(Markdown(f"**{start_time} - {end_time}:**\t\t" + description))

    start_time = time_to_seconds(start_time)
    end_time = time_to_seconds(end_time)
    current_frames = []
    for frame, timestamp in zip(frames, timestamps):
        if timestamp[0] > start_time and timestamp[1] < end_time:
            current_frames.append(frame)
    
    current_frames = np.array(current_frames)
    current_image_grid = create_image_grid(current_frames, num_columns=8)

    display(current_image_grid.resize((480, (int(len(current_frames) / 8) + 1) * 60)))


In [None]:
## Use API for inference. Apply API key here: https://bailian.console.alibabacloud.com/?apiKey=1
# os.environ['DASHSCOPE_API_KEY'] = 'your_api_key_here' 

video_url = "http://ofasys-multimodal-wlcb-3.oss-cn-wulanchabu.aliyuncs.com/sibo.ssb/cookbook/video_structured_caption_480p.mov"
prompt = "Localize a series of activity events in the video, output the start and end timestamp for each event, and describe each event with sentences. Provide the result in json format with ‘mm:ss.ff’ format for time depiction."

response = inference_with_api(video_url, prompt)
display(Markdown(response))