## Cookbook for Video Understanding with Seed1.5-VL

Seed1.5-VL provides strong video understanding abilities. The short examples below show how to use Seed1.5-VL for video understanding.

In [None]:
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: Apache-2.0
from typing import Optional
from enum import Enum
import os
import base64
import shutil

import cv2
import numpy as np
from openai import OpenAI

### 0. Set up the environment

In [None]:
# Model identifier sent as the `model` argument of every chat completion request.
seed_vl_version = "doubao-1-5-thinking-vision-pro-250428"
# The API key is read from the OPENAI_API_KEY environment variable,
# so export it before running this cell.
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url="https://ark.cn-beijing.volces.com/api/v3",
)

In [7]:
# Enumeration of the supported frame-sampling strategies.
class Strategy(Enum):
    """Frame-sampling strategies understood by the video preprocessing code."""

    # Sample at a fixed time interval, e.g. one frame every second.
    CONSTANT_INTERVAL = "constant_interval"
    # Sample evenly across the video, with the step derived from the
    # configured maximum number of frames.
    EVEN_INTERVAL = "even_interval"

The video processing code below comes from [the Volcengine documentation](https://www.volcengine.com/docs/82379/1362931#%E7%A4%BA%E4%BE%8B%E4%BB%A3%E7%A0%81-2).

In [18]:
def preprocess_video(
        video_file_path: str,
        output_dir: str,
        extraction_strategy: Optional[Strategy] = Strategy.EVEN_INTERVAL,
        interval_in_seconds: Optional[float] = 1,
        max_frames: Optional[int] = 10,
        use_timestamp: bool = True,
        keyframe_naming_template: str = "frame_{:04d}.jpg",
) -> tuple[list[str], Optional[list[float]]]:
    """Sample frames from a video and save them as image files.

    Args:
        video_file_path: Path of the video file to read.
        output_dir: Directory the sampled frames are written to; created
            if it does not exist.
        extraction_strategy: How frames are selected.
            ``Strategy.CONSTANT_INTERVAL`` takes one frame every
            ``interval_in_seconds`` seconds. ``Strategy.EVEN_INTERVAL``
            (the default) uses a step of ``frame_count // max_frames``
            frames.
        interval_in_seconds: Seconds between sampled frames for
            ``CONSTANT_INTERVAL``. Defaults to 1.
        max_frames: Maximum number of frames to extract. Defaults to 10.
            ``None`` disables the cap for ``CONSTANT_INTERVAL``; it is
            required for ``EVEN_INTERVAL``.
        use_timestamp: Whether to return per-frame timestamps.
        keyframe_naming_template: ``str.format`` template for the frame
            file names; it receives the zero-based index of the frame.

    Returns:
        A ``(keyframes, timestamps)`` tuple. ``keyframes`` is the list of
        saved image paths. ``timestamps`` holds the position of each
        sampled frame in seconds (rounded to one decimal), or is ``None``
        when ``use_timestamp`` is false.

    Raises:
        ValueError: If the strategy is invalid, a parameter required by
            the chosen strategy is missing or non-positive, or timestamps
            are requested but the video reports a non-positive frame rate.
    """
    os.makedirs(output_dir, exist_ok=True)
    cap = cv2.VideoCapture(video_file_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Translate the chosen strategy into a step measured in frames.
        if extraction_strategy == Strategy.CONSTANT_INTERVAL:
            if interval_in_seconds is None:
                raise ValueError(
                    "interval_in_seconds is required for CONSTANT_INTERVAL"
                )
            frame_interval = int(fps * interval_in_seconds)
        elif extraction_strategy == Strategy.EVEN_INTERVAL:
            if max_frames is None or max_frames <= 0:
                raise ValueError(
                    "max_frames must be a positive integer for EVEN_INTERVAL"
                )
            frame_interval = length // max_frames
        else:
            raise ValueError("Invalid extraction strategy")
        # A step of 0 (video shorter than max_frames, or an interval below
        # one frame) would make the modulo below divide by zero; fall back
        # to sampling every frame.
        frame_interval = max(1, frame_interval)

        frame_count = 0
        keyframes = []
        timestamps = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % frame_interval == 0:
                image_path = os.path.join(
                    output_dir, keyframe_naming_template.format(len(keyframes))
                )
                cv2.imwrite(image_path, frame)
                keyframes.append(image_path)
                if use_timestamp:
                    if fps <= 0:
                        raise ValueError(
                            "Video reports a non-positive frame rate; "
                            "cannot compute timestamps"
                        )
                    timestamps.append(round(frame_count / fps, 1))
            frame_count += 1
            # Stop as soon as the cap on extracted frames is reached.
            if max_frames is not None and len(keyframes) >= max_frames:
                break
    finally:
        # Always free the decoder, including on the error paths above.
        cap.release()

    print("抽取帧数:", len(keyframes))
    if use_timestamp:
        return keyframes, timestamps
    return keyframes, None

def resize(image):
    """Shrink an image so that it fits the target box, keeping its aspect ratio.

    Images that are wider than tall target 640x480 (width x height); all
    other images target 480x640. An image already within its target box is
    returned unchanged.

    Args:
        image (numpy.ndarray): Input image as a numpy array.

    Returns:
        numpy.ndarray: The original image, or a downscaled copy of it.
    """
    src_h, src_w = image.shape[:2]
    # Pick the box orientation that matches the image orientation.
    box_h, box_w = (480, 640) if src_h < src_w else (640, 480)
    if src_h > box_h or src_w > box_w:
        if src_h / box_h < src_w / box_w:
            # Width overshoots the box by the larger factor: pin the width.
            out_w = box_w
            out_h = int(src_h * (out_w / src_w))
        else:
            # Height overshoots by the larger (or equal) factor: pin the height.
            out_h = box_h
            out_w = int(src_w * (out_h / src_h))
        return cv2.resize(image, (out_w, out_h))
    # Already small enough; hand back the very same array.
    return image

# Resize the image at the given path to a suitable size and Base64-encode it.
def encode_image(image_path: str) -> str:
    """Load an image, downscale it with ``resize`` and return it as Base64 JPEG.

    Args:
        image_path (str): Path of the image file.

    Returns:
        str: Base64 text of the JPEG-encoded, resized image.

    Raises:
        ValueError: If the image cannot be read or cannot be encoded as JPEG.
    """
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising.
    if image is None:
        raise ValueError(f"Failed to read image: {image_path}")
    image_resized = resize(image)
    success, encoded_image = cv2.imencode(".jpg", image_resized)
    if not success:
        raise ValueError(f"Failed to encode image as JPEG: {image_path}")
    return base64.b64encode(encoded_image).decode("utf-8")

def construct_messages(image_paths: list[str], timestamps: list[float], prompt: str) -> list[dict]:
    """Build a single-turn user message that interleaves frames and text.

    Args:
        image_paths (list[str]): Paths of the frame images, in order.
        timestamps (list[float]): Timestamp of each frame, indexed like
            ``image_paths``. When ``None``, no timestamp text is emitted.
        prompt (str): Text prompt appended after all frames.

    Returns:
        list[dict]: A one-element list holding the user message.
    """
    parts = []
    with_timestamps = timestamps is not None
    for position, frame_path in enumerate(image_paths):
        if with_timestamps:
            # A short text part in front of each frame carries its timestamp.
            parts.append({"type": "text", "text": f'[{timestamps[position]} second]'})
        # Each frame is embedded as a Base64 data URL and requested at low detail.
        data_url = f"data:image/jpeg;base64,{encode_image(frame_path)}"
        parts.append(
            {"type": "image_url", "image_url": {"url": data_url, "detail": "low"}}
        )
    # The prompt comes last, after every frame.
    parts.append({"type": "text", "text": prompt})
    return [{"role": "user", "content": parts}]

def api_complete(client, messages, model=None):
    """Send a chat completion request and return the first choice.

    Args:
        client: Object exposing ``chat.completions.create``.
        messages: Chat messages passed through as the ``messages`` argument.
        model: Model identifier for the request. When ``None`` (the default),
            the module-level ``seed_vl_version`` is used.

    Returns:
        The first element of the response's ``choices``.
    """
    response = client.chat.completions.create(
        model=seed_vl_version if model is None else model,
        messages=messages,
    )
    return response.choices[0]

### 1. General Video Question-Answering

In [12]:
video_path = "samples/OcZeMOnLpTQ.mp4"
text_prompts = "Describe this video in details."
# Remove any existing frame directory before extracting.
if os.path.exists("video_frames"):
    shutil.rmtree("video_frames")
# Sample `sampling_fps` frames per second, capped at `max_frames` frames.
sampling_fps = 1
max_frames = 30
selected_images, timestamps = preprocess_video(
    video_file_path=video_path,
    output_dir="video_frames",
    extraction_strategy=Strategy.CONSTANT_INTERVAL,
    # preprocess_video takes the seconds between frames, i.e. the
    # reciprocal of the sampling rate.
    interval_in_seconds=1 / sampling_fps,
    use_timestamp=True,
    max_frames=max_frames,
)
message = construct_messages(image_paths=selected_images, timestamps=timestamps, prompt=text_prompts)
result = api_complete(client, message)
print("Seed1.5-VL:", result.message.content)

抽取帧数: 24
Seed1.5-VL: The video showcases a female athlete competing in the **high jump** at an outdoor track - and - field event. Here is a detailed breakdown:  


### Scene Setting  
The event takes place on a blue - colored running track. In the background, there are brick buildings, grassy areas, spectators, and officials. Some spectators are standing behind barriers, while others are sitting or walking around. There are also sports equipment and supplies, such as Gatorade coolers and umbrellas for shade.  


### Athlete’s First Attempt  
1. **Preparation**: The athlete, dressed in a yellow and white track suit and bright green running shoes, gets ready on the track. She seems to be mentally preparing herself and visualizing the jump.  
2. **Run - up**: She starts her run - up, gradually picking up speed. Her strides become longer, and her arms swing in coordination with her body to build momentum.  
3. **Takeoff**: With one foot pushing off the ground powerfully, she leaps into the

### 2. Video Temporal Grounding
Seed1.5-VL has strong temporal localization capabilities for video. Given a user prompt, it can locate the matching segments in the video and output their start and end times in seconds.

In [15]:
video_path = "samples/OcZeMOnLpTQ.mp4"
text_prompts = "请输出视频中女人每一次跳高的精确起止时间，如果有多次，请分别输出每一次的起止时间"
# Remove any existing frame directory before extracting.
if os.path.exists("video_frames"):
    shutil.rmtree("video_frames")
# Sample `sampling_fps` frames per second, capped at `max_frames` frames.
sampling_fps = 1
max_frames = 30
selected_images, timestamps = preprocess_video(
    video_file_path=video_path,
    output_dir="video_frames",
    extraction_strategy=Strategy.CONSTANT_INTERVAL,
    # preprocess_video takes the seconds between frames, i.e. the
    # reciprocal of the sampling rate.
    interval_in_seconds=1 / sampling_fps,
    use_timestamp=True,
    max_frames=max_frames,
)
message = construct_messages(image_paths=selected_images, timestamps=timestamps, prompt=text_prompts)
result = api_complete(client, message)
print("Seed1.5-VL:", result.message.content)

抽取帧数: 24
Seed1.5-VL: 第一次跳高：3 - 10秒；第二次跳高：12 - 21秒


### 3. Dense Video Captioning
Building on its temporal grounding capability, the model can also list the key events in a video together with their start and end timestamps.

In [17]:
video_path = "samples/OcZeMOnLpTQ.mp4"
text_prompts = "Please watch this video carefully and find out all key events in this video, and output the events along with the start/end timestamps."
# Remove any existing frame directory before extracting.
if os.path.exists("video_frames"):
    shutil.rmtree("video_frames")
# Sample `sampling_fps` frames per second, capped at `max_frames` frames.
sampling_fps = 1
max_frames = 30
selected_images, timestamps = preprocess_video(
    video_file_path=video_path,
    output_dir="video_frames",
    extraction_strategy=Strategy.CONSTANT_INTERVAL,
    # preprocess_video takes the seconds between frames, i.e. the
    # reciprocal of the sampling rate.
    interval_in_seconds=1 / sampling_fps,
    use_timestamp=True,
    max_frames=max_frames,
)
message = construct_messages(image_paths=selected_images, timestamps=timestamps, prompt=text_prompts)
result = api_complete(client, message)
print("Seed1.5-VL:", result.message.content)

抽取帧数: 24
Seed1.5-VL: 1. 第一次助跑加速：0 - 5秒，女子在跑道上准备后开始助跑，快速冲向跳高架。  
2. 第一次起跳过杆：5 - 10秒，女子起跳腾空，身体越过横杆后落在垫上。  
3. 第一次落地后起身：10 - 13秒，女子在跳高垫上起身并走下垫子。  
4. 第二次助跑加速：13 - 18秒，女子回到跑道再次助跑，向跳高架加速冲刺。  
5. 第二次起跳过杆：18 - 22秒，女子起跳腾空越过横杆，落地后在垫上起身。
