### Video Understanding with Qwen2.5-VL

In this notebook, we delve into the capabilities of the **Qwen2.5-VL** model for video understanding tasks. Our objective is to showcase how this advanced model can be applied to various video analysis scenarios, ranging from basic OCR to complex event detection and summarization.


#### \[Setup\]

We start by loading the pre-trained `Qwen2_5_VLForConditionalGeneration` model. This model has been fine-tuned on a diverse set of video understanding tasks, enabling it to generate detailed and accurate descriptions based on visual inputs.


In [1]:
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

model_path = "Qwen/Qwen2.5-VL-7B-Instruct"

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_path)

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


In [2]:
# check model is already on cuda
current_device = next(model.parameters()).device
target_device = torch.device('cuda')

if current_device != target_device:
    print("Model is not on cuda, moving to cuda")
    model = model.to(target_device)

Model is not on cuda, moving to cuda


Load video frames and timestamps

In [3]:
# !pip install decord

In [26]:
import hashlib
import torch
import os
import numpy as np
from qwen_vl_utils import process_vision_info

In [57]:
import os
import math
import hashlib
import requests

from IPython.display import Markdown, display
import numpy as np
from PIL import Image
import decord
from decord import VideoReader, cpu

def convert_gdrive_url_to_download_url(gdrive_url):
    # Extract the file ID from the Gdrive URL
    file_id = gdrive_url.split('/')[-2]
    # Create the download URL
    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
    return download_url

def download_video(url, dest_path):
    response = requests.get(url, stream=True)
    with open(dest_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8096):
            f.write(chunk)
    print(f"Video downloaded to {dest_path}")


def get_video_frames(video_path, num_frames=128, cache_dir='.cache'):
    os.makedirs(cache_dir, exist_ok=True)

    video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
    if video_path.startswith('http://') or video_path.startswith('https://'):
        video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
        if not os.path.exists(video_file_path):
            download_video(video_path, video_file_path)
    else:
        video_file_path = video_path

    frames_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_frames.npy')
    timestamps_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_timestamps.npy')

    if os.path.exists(frames_cache_file) and os.path.exists(timestamps_cache_file):
        frames = np.load(frames_cache_file)
        timestamps = np.load(timestamps_cache_file)
        return video_file_path, frames, timestamps

    vr = VideoReader(video_file_path, ctx=cpu(0))
    total_frames = len(vr)

    indices = np.linspace(0, total_frames - 1, num=num_frames, dtype=int)
    frames = vr.get_batch(indices).asnumpy()
    timestamps = np.array([vr.get_frame_timestamp(idx) for idx in indices])

    np.save(frames_cache_file, frames)
    np.save(timestamps_cache_file, timestamps)
    
    return video_file_path, frames, timestamps


def create_image_grid(images, num_columns=8):
    pil_images = [Image.fromarray(image) for image in images]
    num_rows = math.ceil(len(images) / num_columns)

    img_width, img_height = pil_images[0].size
    grid_width = num_columns * img_width
    grid_height = num_rows * img_height
    grid_image = Image.new('RGB', (grid_width, grid_height))

    for idx, image in enumerate(pil_images):
        row_idx = idx // num_columns
        col_idx = idx % num_columns
        position = (col_idx * img_width, row_idx * img_height)
        grid_image.paste(image, position)

    return grid_image



def get_video_embeddings(video_path, model, processor, num_frames=128, cache_dir='.cache',
                        total_pixels=20480 * 28 * 28, min_pixels=16 * 28 * 28, 
                        original_fps=None, max_frames=30, include_mapping=True):
    """
    Extended version that caches both frames and embeddings with FPS and timestamp mapping
    
    Args:
        video_path: Path to video file
        model: Qwen model
        processor: Qwen processor
        num_frames: Number of frames to extract (will be limited by max_frames if include_mapping=True)
        cache_dir: Directory for caching
        total_pixels: Total pixels for processing
        min_pixels: Minimum pixels for processing
        original_fps: Original video FPS (auto-detected if None)
        max_frames: Maximum frames for model input (used when include_mapping=True)
        include_mapping: Whether to include timestamp mapping functionality
    
    Returns:
        frames: Video frames
        timestamps: Frame timestamps
        embeddings_data: Embeddings with additional metadata
        mapping_info: Timestamp mapping information (if include_mapping=True)
    """
    
    os.makedirs(cache_dir, exist_ok=True)
    
    # Auto-detect original FPS if not provided
    if original_fps is None:
        cap = cv2.VideoCapture(video_path)
        original_fps = cap.get(cv2.CAP_PROP_FPS) or 12.0  # Default fallback
        cap.release()
        print(f"Auto-detected FPS: {original_fps}")
    
    # Adjust num_frames based on mapping requirements
    effective_num_frames = min(num_frames, max_frames) if include_mapping else num_frames
    
    # Create hash including all parameters including FPS
    cache_params = f"{video_path}_{effective_num_frames}_{total_pixels}_{min_pixels}_{original_fps}_{include_mapping}"
    video_hash = hashlib.md5(cache_params.encode('utf-8')).hexdigest()
    
    # Cache files
    frames_cache_file = os.path.join(cache_dir, f'{video_hash}_frames.npy')
    timestamps_cache_file = os.path.join(cache_dir, f'{video_hash}_timestamps.npy')
    embeddings_cache_file = os.path.join(cache_dir, f'{video_hash}_embeddings.pt')
    mapping_cache_file = os.path.join(cache_dir, f'{video_hash}_mapping.json')
    
    # Check if all cached files exist
    cache_exists = (os.path.exists(frames_cache_file) and 
                   os.path.exists(timestamps_cache_file) and 
                   os.path.exists(embeddings_cache_file))
    
    if include_mapping:
        cache_exists = cache_exists and os.path.exists(mapping_cache_file)
    
    if cache_exists:
        print(f"Loading cached video embeddings from {embeddings_cache_file}")
        frames = np.load(frames_cache_file)
        timestamps = np.load(timestamps_cache_file)
        embeddings_data = torch.load(embeddings_cache_file)
        
        if include_mapping:
            import json
            with open(mapping_cache_file, 'r') as f:
                mapping_info = json.load(f)
            return frames, timestamps, embeddings_data, mapping_info
        else:
            return frames, timestamps, embeddings_data
    
    # Get video properties for mapping
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    detected_fps = cap.get(cv2.CAP_PROP_FPS) or original_fps
    duration = total_frames / detected_fps
    cap.release()
    
    print(f"Video info: {total_frames} frames, {detected_fps:.2f} FPS, {duration:.2f}s")
    
    # Create mapping info if requested
    mapping_info = None
    if include_mapping:
        # Calculate frame sampling for timestamp mapping
        if total_frames <= effective_num_frames:
            frame_indices = list(range(total_frames))
        else:
            frame_indices = np.linspace(0, total_frames - 1, effective_num_frames, dtype=int)
        
        mapping_info = {
            'original_fps': detected_fps,
            'original_total_frames': total_frames,
            'original_duration': duration,
            'sampled_frames': len(frame_indices),
            'effective_num_frames': effective_num_frames,
            'time_scale_factor': duration / (len(frame_indices) / original_fps) if len(frame_indices) > 1 else 1.0,
            'frame_mapping': {}
        }
        
        # Build frame mapping
        for model_idx, original_frame_idx in enumerate(frame_indices):
            original_timestamp = original_frame_idx / detected_fps
            mapping_info['frame_mapping'][model_idx] = {
                'original_frame_idx': int(original_frame_idx),
                'original_timestamp': original_timestamp,
                'timestamp_formatted': f"{original_timestamp:.2f}s"
            }
        
        print(f"Time scale factor: {mapping_info['time_scale_factor']:.2f}")
    
    # Get video frames with updated frame count
    video_file_path, frames, timestamps = get_video_frames(video_path, effective_num_frames, cache_dir)
    
    # Create messages with FPS information
    messages = [{
        "role": "user",
        "content": [{
            "type": "video", 
            "video": video_path,
            "total_pixels": total_pixels,
            "min_pixels": min_pixels,
            "fps": original_fps  # Pass original FPS to model
        }]
    }]
    
    # Process video for model input
    image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
    
    # Update video_kwargs with our FPS information
    if 'fps' not in video_kwargs or not video_kwargs['fps']:
        video_kwargs['fps'] = [original_fps]
    
    inputs = processor(
        text=[""],  # Dummy text
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt"
    )
    inputs = inputs.to(model.device)
    
    # Extract embeddings using the visual encoder
    with torch.no_grad():
        # Method 1: Use the correct parameter names for Qwen2.5-VL
        if hasattr(inputs, 'pixel_values_videos') and inputs.pixel_values_videos is not None:
            # For video inputs
            visual_outputs = model.visual(
                inputs.pixel_values_videos,
                grid_thw=inputs.get('video_grid_thw')
            )
        elif hasattr(inputs, 'pixel_values') and inputs.pixel_values is not None:
            # For image inputs
            visual_outputs = model.visual(inputs.pixel_values)
        else:
            raise ValueError("No valid pixel values found in inputs")
        
        # Extract the embeddings
        if hasattr(visual_outputs, 'last_hidden_state'):
            vision_embeddings = visual_outputs.last_hidden_state
        else:
            # If direct access doesn't work, visual_outputs might be the embeddings directly
            vision_embeddings = visual_outputs
    
    # Enhanced embeddings data with FPS and mapping info
    embeddings_data = {
        'vision_embeddings': vision_embeddings.cpu(),
        'video_grid_thw': inputs.get('video_grid_thw'),
        'fps': video_kwargs.get('fps', [original_fps]),
        'original_fps': original_fps,
        'detected_fps': detected_fps,
        'num_frames': video_inputs[0].shape[0] if video_inputs else 0,
        'effective_num_frames': effective_num_frames,
        'total_frames': total_frames,
        'duration': duration,
        'include_mapping': include_mapping
    }
    
    # Add timestamp conversion utilities to embeddings_data
    if include_mapping and mapping_info:
        embeddings_data.update({
            'time_scale_factor': mapping_info['time_scale_factor'],
            'timestamp_converter': {
                'model_to_original_time': lambda t: t * mapping_info['time_scale_factor'],
                'model_to_original_range': lambda start, end: (
                    start * mapping_info['time_scale_factor'],
                    end * mapping_info['time_scale_factor']
                ),
                'scale_factor': mapping_info['time_scale_factor']
            }
        })
    
    # Cache the embeddings and mapping
    torch.save(embeddings_data, embeddings_cache_file)
    np.save(frames_cache_file, frames)
    np.save(timestamps_cache_file, timestamps)
    
    if include_mapping and mapping_info:
        import json
        with open(mapping_cache_file, 'w') as f:
            json.dump(mapping_info, f, indent=2)
        print(f"Cached mapping info to {mapping_cache_file}")
    
    print(f"Cached video embeddings to {embeddings_cache_file}")
    
    if include_mapping:
        return frames, timestamps, embeddings_data, mapping_info
    else:
        return frames, timestamps, embeddings_data


def perform_video_grounding_with_cached_embeddings(
    prompt: str, 
    model, 
    processor, 
    embeddings_data: Dict,
    device: str = None
) -> str:
    """
    Perform video grounding using pre-computed cached embeddings directly.
    This bypasses video re-processing and uses the cached visual features.
    
    Args:
        prompt: Text query for video grounding
        model: The Qwen2.5-VL model
        processor: The processor/tokenizer
        embeddings_data: Dict containing cached embeddings and metadata
        device: Device to run inference on
        
    Returns:
        Generated response text
    """
    
    print(f"=== Video Grounding with Cached Embeddings ===")
    print(f"Query: {prompt}")
    print("-" * 60)
    
    if device is None:
        device = model.device
    
    # Extract cached embeddings
    vision_embeddings = embeddings_data['vision_embeddings'].to(device)
    video_grid_thw = embeddings_data.get('video_grid_thw')
    if video_grid_thw is not None:
        video_grid_thw = video_grid_thw.to(device)
    
    print(f"✅ Using cached embeddings:")
    print(f"   Shape: {vision_embeddings.shape}")
    print(f"   Frames: {embeddings_data.get('num_frames', 'N/A')}")
    print(f"   Device: {vision_embeddings.device}")
    
    # Tokenize the text prompt
    messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
    # Tokenize text only (no video processing)
    text_inputs = processor.tokenizer(
        text, 
        return_tensors="pt", 
        padding=True,
        truncation=True
    ).to(device)
    
    print(f"✅ Text tokenized:")
    print(f"   Input IDs shape: {text_inputs.input_ids.shape}")
    print(f"   Text length: {len(text)}")
    
    # Prepare inputs for the model
    # We need to manually construct the input embeddings
    with torch.no_grad():
        # Get text embeddings
        text_embeddings = model.get_input_embeddings()(text_inputs.input_ids)
        
        # Combine text and vision embeddings
        # Vision embeddings typically go first, then text
        if vision_embeddings.dim() == 2:
            vision_embeddings = vision_embeddings.unsqueeze(0)  # Add batch dimension
        
        if text_embeddings.dim() == 3 and vision_embeddings.dim() == 3:
            # Concatenate along sequence dimension
            combined_embeddings = torch.cat([vision_embeddings, text_embeddings], dim=1)
        else:
            print(f"⚠️  Dimension mismatch - Vision: {vision_embeddings.shape}, Text: {text_embeddings.shape}")
            combined_embeddings = text_embeddings
        
        print(f"✅ Combined embeddings shape: {combined_embeddings.shape}")
        
        # Create attention mask for combined embeddings
        vision_attention = torch.ones(vision_embeddings.shape[:2], device=device, dtype=torch.long)
        text_attention = text_inputs.attention_mask
        combined_attention_mask = torch.cat([vision_attention, text_attention], dim=1)
        
        # Prepare generation inputs
        generation_inputs = {
            'inputs_embeds': combined_embeddings,
            'attention_mask': combined_attention_mask,
        }
        
        # Add video-specific inputs if available
        if video_grid_thw is not None:
            generation_inputs['video_grid_thw'] = video_grid_thw
        
        print("🤖 Generating response with cached embeddings...")
        
        # Generate response
        generated_ids = model.generate(
            **generation_inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=processor.tokenizer.eos_token_id,
            eos_token_id=processor.tokenizer.eos_token_id
        )
        
        # Decode only the generated part (skip the input embeddings part)
        # Since we used inputs_embeds, we need to decode all generated tokens
        response = processor.tokenizer.batch_decode(
            generated_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]
        
        # Clean up the response (remove the input prompt if it appears)
        if text in response:
            response = response.replace(text, "").strip()
        
        print(f"✅ Response generated:")
        print(f"📝 Model Response: {response}")
        
        return response




In [54]:
# Test FPS alone - no timestamp mapping
video_path = "/home/ubuntu/lilynogh/Qwen2.5-VL/cookbooks/downloads/downloaded_video.mp4"

frames, timestamps, embeddings_data = get_video_embeddings(
    video_path=video_path,
    model=model,
    processor=processor,
    original_fps=12.0,        # Pass your FPS
    include_mapping=False,    # KEY: Disable mapping
    num_frames=190           # Use original frame count
)

print(f"FPS passed to model: {embeddings_data['original_fps']}")

Video info: 189 frames, 12.00 FPS, 15.75s


Unused or unrecognized kwargs: return_tensors, fps.


Cached video embeddings to .cache/388b5a2524b105b3905d21b332243859_embeddings.pt
FPS passed to model: 12.0


In [None]:
embeddings_data = torch.load('.cache/388b5a2524b105b3905d21b332243859_embeddings.pt')

print(f"Loaded embeddings:")
print(f"  Shape: {embeddings_data['vision_embeddings'].shape}")
print(f"  Frames: {embeddings_data['num_frames']}")
print(f"  FPS: {embeddings_data['fps']}")


Loaded embeddings:
  Shape: torch.Size([19646, 3584])
  Frames: 188
  FPS: [11.936507936507937]


In [59]:
# Test lane change detection with 188 frames
video_path = "/home/ubuntu/lilynogh/Qwen2.5-VL/cookbooks/downloads/downloaded_video.mp4"
prompt = "Give the query: 'other vehicle did lane changes into the lane in front me and I have to deaccelerate', when does the described content occur in the video?"

# Use the standard perform_video_grounding function (the one that re-processes video)
# but it will use the same parameters that created your 188-frame embeddings
messages = [{
    "role": "user",
    "content": [
        {
            "type": "video", 
            "video": video_path,
            "total_pixels": 20480 * 28 * 28,  # Your parameters
            "min_pixels": 16 * 28 * 28
        },
        {
            "type": "text", 
            "text": prompt
        }
    ]
}]

# Process and generate
from qwen_vl_utils import process_vision_info

text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)

inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt"
)
inputs = inputs.to(model.device)

print(f"Video processing verification:")
print(f"  Processed frames: {video_inputs[0].shape[0] if video_inputs else 'None'}")

with torch.no_grad():
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )
    
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, 
        skip_special_tokens=True, 
        clean_up_tokenization_spaces=False
    )[0]

print(f"188-frame response: {response}")

Unused or unrecognized kwargs: return_tensors, fps.


Video processing verification:
  Processed frames: 30
188-frame response: The event you described, where "other vehicle did lane changes into the lane in front me and I have to deaccelerate," occurs around the middle of the video, specifically between 3.0 seconds and 5.0 seconds. At this point, a black SUV can be seen making a lane change, which causes the vehicle from whose perspective the video is shot to slow down or decelerate.


In [58]:
embeddings_data = torch.load('.cache/388b5a2524b105b3905d21b332243859_embeddings.pt')
prompt = "Give the query: 'other vehicle did lane changes into the lane in front me and I have to deaccelerate', when does the described content occur in the video?"

# Use cached embeddings directly
response = perform_video_grounding_with_cached_embeddings(
    prompt=prompt,
    model=model,
    processor=processor,
    embeddings_data=embeddings_data
)
print(f"Result: {response}")

=== Video Grounding with Cached Embeddings ===
Query: Give the query: 'other vehicle did lane changes into the lane in front me and I have to deaccelerate', when does the described content occur in the video?
------------------------------------------------------------
✅ Using cached embeddings:
   Shape: torch.Size([19646, 3584])
   Frames: 188
   Device: cuda:0
✅ Text tokenized:
   Input IDs shape: torch.Size([1, 53])
   Text length: 262
✅ Combined embeddings shape: torch.Size([1, 19699, 3584])
🤖 Generating response with cached embeddings...
✅ Response generated:
📝 Model Response: The event where the described content occurs is when the other vehicle makes a left turn in front of you.
Result: The event where the described content occurs is when the other vehicle makes a left turn in front of you.


In [None]:
video_path = "/home/ubuntu/lilynogh/Qwen2.5-VL/cookbooks/downloads/downloaded_video.mp4"
your_prompt = "Give the query: 'other vehicle did lane changes into the lane in front me and I have to deaccelerate', when does the described content occur in the video?"

embeddings_data = torch.load('.cache/388b5a2524b105b3905d21b332243859_embeddings.pt')
response = perform_video_grounding(video_path, your_prompt, model, processor, embeddings_data)
display(Markdown(response))

In [33]:
video_path = "/home/ubuntu/lilynogh/Qwen2.5-VL/cookbooks/downloads/downloaded_video.mp4"
your_prompt = "Give the query: 'other vehicle did lane changes into the lane in front me and I have to deaccelerate', when does the described content occur in the video?"

embeddings_data = torch.load('.cache/d9e492fa1ea59dc77cb44cf272647efa_embeddings.pt')
response = perform_video_grounding(video_path, your_prompt, model, processor, embeddings_data)
display(Markdown(response))

=== Video Grounding Validation ===
Query: Give the query: 'other vehicle did lane changes into the lane in front me and I have to deaccelerate', when does the described content occur in the video?
--------------------------------------------------


Unused or unrecognized kwargs: return_tensors, fps.


Model Response: The event you're describing, where another vehicle makes a lane change into your lane and you need to decelerate, occurs around 0.0 - 5.0 seconds in the video. Specifically, a black SUV can be seen making a lane change into the lane in front of the camera, causing the camera's perspective to move forward as it follows the traffic flow.


The event you're describing, where another vehicle makes a lane change into your lane and you need to decelerate, occurs around 0.0 - 5.0 seconds in the video. Specifically, a black SUV can be seen making a lane change into the lane in front of the camera, causing the camera's perspective to move forward as it follows the traffic flow.

In [30]:
video_path = "/home/ubuntu/lilynogh/Qwen2.5-VL/cookbooks/downloads/downloaded_video.mp4"

frames, timestamps, embeddings_data = get_video_embeddings(
    video_path=video_path,
    model=model,
    processor=processor,
    num_frames=160,
    cache_dir='.cache'
)

print(f"Video embeddings shape: {embeddings_data['vision_embeddings'].shape}")
print(f"Number of frames: {embeddings_data['num_frames']}")
print(f"FPS: {embeddings_data['fps']}")

Unused or unrecognized kwargs: return_tensors, fps.


Cached video embeddings to .cache/d9e492fa1ea59dc77cb44cf272647efa_embeddings.pt
Video embeddings shape: torch.Size([10800, 3584])
Number of frames: 30
FPS: [1.9047619047619047]


In [None]:
# long video understanding
# prompt = "Could you go into detail about the content of this driving video?"

# video summarization
# prompt = "Could you use a table to summarize the interesting vehicle driving behaviors in this driving video?"

# video grounding
prompt = "Give the query: 'other vehicle did lane changes into the lane in front me and I have to deaccelerate', when does the described content occur in the video?"


response = inference("/home/ubuntu/lilynogh/Qwen2.5-VL/cookbooks/downloads/downloaded_video.mp4", prompt)
display(Markdown(response))

Unused or unrecognized kwargs: return_tensors, fps.


video input: torch.Size([30, 3, 560, 1008])
num of video tokens: 10800


The described content, "other vehicle did lane changes into the lane in front me and I have to deaccelerate," occurs around 0:04-0:12 in the video. At this point, a black car is seen making a lane change directly in front of the camera's perspective, causing the vehicle from which the footage is being recorded to slow down or deaccelerate.

Inference function

In [4]:
def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 * 28, min_pixels=16 * 28 * 28):
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [
                {"type": "text", "text": prompt},
                {"video": video_path, "total_pixels": total_pixels, "min_pixels": min_pixels},
            ]
        },
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True)
    fps_inputs = video_kwargs['fps']
    print("video input:", video_inputs[0].shape)
    num_frames, _, resized_height, resized_width = video_inputs[0].shape
    print("num of video tokens:", int(num_frames / 2 * resized_height / 28 * resized_width / 28))
    inputs = processor(text=[text], images=image_inputs, videos=video_inputs, fps=fps_inputs, padding=True, return_tensors="pt")
    inputs = inputs.to('cuda')

    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return output_text[0]

Inference function with API using OpenAI SDK.

**Important Notice:**
- Please be aware that the current API supports video processing up to a maximum length of 10 minutes.
- Currently, the model inference interface does not support configuring the resolution of video frames. Therefore, it is recommended to resize videos with higher resolutions and longer durations to a smaller resolution to ensure that the input sequence is not excessively long. We advise keeping the number of video tokens under 24k to achieve better video grounding results.

In [7]:
# import os
# from openai import OpenAI
# from IPython.display import Markdown, display


# def inference_with_api(
#     video_path,
#     prompt,
#     sys_prompt = "You are a helpful assistant.",
#     model_id = "qwen-vl-max-latest",
# ):
#     client = OpenAI(
#         api_key = os.getenv('DASHSCOPE_API_KEY'),
#         base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1",
#     )    
#     messages = [
#         {
#             "role": "system",
#             "content": [{"type":"text","text": sys_prompt}]
#         },
#         {
#             "role": "user",
#             "content": [
#                 {"type": "video_url", "video_url": {"url": video_path}},
#                 {"type": "text", "text": prompt},
#             ]
#         }
#     ]
#     completion = client.chat.completions.create(
#         model = model_id,
#         messages = messages,
#     )
#     print(completion)
#     return completion.choices[0].message.content

#### 1. Reading Text in Videos

In this section, we demonstrate how the model can be used to recognize and summarize text within a video. Specifically, we'll use a video containing various products and ask the model to summarize their characteristics in a structured format.


In [17]:
# !pip uninstall transformers
# !pip install transformers==4.52
!pip install gdown


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl (187 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m187.3/187.3 KB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting soupsieve>1.2
  Downloading soupsieve-2.7-py3-none-any.whl (36 kB)
Collecting PySocks!=1.5.7,>=1.5.6
  Downloading PySocks-1.7.1-py3-none-any.whl (16 kB)
Installing collected packages: soupsieve, PySocks, beautifulsoup4, gdown
Successfully installed PySocks-1.7.1 beautifulsoup4-4.13.4 gdown-5.2.0 soupsieve-2.7


In [18]:
import gdown
# Your Google Drive URL
url = "https://drive.google.com/file/d/1xpBUYOO9vjr0eQiDAYlPhG6UC6CLyukK/view?usp=sharing"

# Extract the file ID from the URL
file_id = "1xpBUYOO9vjr0eQiDAYlPhG6UC6CLyukK"

# Create local directory if it doesn't exist
local_dir = "./downloads"
os.makedirs(local_dir, exist_ok=True)

# Download the file
output_path = os.path.join(local_dir, "downloaded_video.mp4")
gdown.download(f"https://drive.google.com/uc?id={file_id}", output_path, quiet=False)

print(f"File downloaded to: {output_path}")

Downloading...
From: https://drive.google.com/uc?id=1xpBUYOO9vjr0eQiDAYlPhG6UC6CLyukK
To: /lambda/nfs/lilynogh/Qwen2.5-VL/cookbooks/downloads/downloaded_video.mp4
100%|██████████| 10.7M/10.7M [00:00<00:00, 33.2MB/s]

File downloaded to: ./downloads/downloaded_video.mp4





In [15]:
# url = "https://drive.google.com/uc?export=download&id=1xpBUYOO9vjr0eQiDAYlPhG6UC6CLyukK"
# download_dir = quick_download(url)
# print(f"Downloaded to directory: {download_dir}")

Downloaded to directory: .cache


In [None]:
# long video understanding
# prompt = "Could you go into detail about the content of this driving video?"

# video summarization
# prompt = "Could you use a table to summarize the interesting vehicle driving behaviors in this driving video?"

# video grounding
prompt = "Give the query: 'other vehicle did lane changes into the lane in front me and I have to deaccelerate', when does the described content occur in the video?"


response = inference("/home/ubuntu/lilynogh/Qwen2.5-VL/cookbooks/downloads/downloaded_video.mp4", prompt)
display(Markdown(response))

Unused or unrecognized kwargs: return_tensors, fps.


video input: torch.Size([30, 3, 560, 1008])
num of video tokens: 10800


The described content, "other vehicle did lane changes into the lane in front me and I have to deaccelerate," occurs around 0:04-0:12 in the video. At this point, a black car is seen making a lane change directly in front of the camera's perspective, causing the vehicle from which the footage is being recorded to slow down or deaccelerate.

In [22]:
video_path, frames, timestamps = get_video_frames("/home/ubuntu/lilynogh/Qwen2.5-VL/cookbooks/downloads/downloaded_video.mp4", num_frames=192)

In [5]:
video_url = "https://duguang-labelling.oss-cn-shanghai.aliyuncs.com/qiansun/video_ocr/videos/50221078283.mp4"
prompt = "请用表格总结一下视频中的商品特点"

## Use a local HuggingFace model to inference.
video_path, frames, timestamps = get_video_frames(video_url, num_frames=64)
# image_grid = create_image_grid(frames, num_columns=8)
# display(image_grid.resize((640, 640)))

response = inference(video_path, prompt)
display(Markdown(response))

qwen-vl-utils using decord to read video.
Unused or unrecognized kwargs: return_tensors, fps.


video input: torch.Size([58, 3, 532, 980])
num of video tokens: 19285


以下是根据视频内容总结的商品特点表格：

| 特点 | 描述 |
|------|------|
| 适用范围广 | 可用于龙眼、切片西瓜、圣女果、樱桃等多种水果包装。 |
| 捏扣设计 | 人性化设计，易扣不繁琐。 |
| 捏扣紧锁 | 上下盖紧锁，摇晃不脱落。 |
| 专业铝膜 | 采用PET材料制作，做工精细。 |
| 防压抗摔 | 耐压耐磨，耐低温，可冷藏。 |
| 美观实用 | 纹理清晰质感佳，形状好，光泽度好。 |
| 高透加厚 | 盒内产品一目了然，无色无味。 |
| 全面展示 | 全面展示产品细节。 |

希望这个表格能帮助你更好地理解视频中商品的特点！

In [None]:
## Use API for inference. Apply API key here: https://bailian.console.alibabacloud.com/?apiKey=1
# os.environ['DASHSCOPE_API_KEY'] = 'your_api_key_here' 

video_url = "http://ofasys-multimodal-wlcb-3.oss-cn-wulanchabu.aliyuncs.com/sibo.ssb/cookbook/video_ocr.mp4"
prompt = "Watch the video and list the paper titles in a table, add one extra column for translating the paper titles to Chinese."

response = inference_with_api(video_url, prompt)
display(Markdown(response))

#### 2. Long Video Understanding

Next, we explore the model's capability to comprehend extremely long videos, such as those lasting up to one hour. This demonstrates how the model can effectively process and analyze extended video content, extracting meaningful insights over longer durations.

To reduce the number of visual tokens generated from a long video, you can specify the `resized_height` and `resized_width` parameters. These settings allow the video frames to be resized to a smaller dimension, effectively decreasing the computational load while maintaining the essential visual information needed for analysis.


In [None]:
video_url = "https://ofasys-multimodal-wlcb-3.oss-cn-wulanchabu.aliyuncs.com/sibo.ssb/evaluations/data/LVBench/videos/GcRKREorGSc.mp4"
prompt = "Could you go into detail about the content of this long video?"

video_path, frames, timestamps = get_video_frames(video_url, num_frames=64)
# image_grid = create_image_grid(frames, num_columns=8)
# display(image_grid.resize((640, 640)))

response = inference(video_path, prompt)
display(Markdown(response))


#### 3. Video Grounding

This part focuses on answering specific questions about a video segment. We specify a textual query and ask the model what is the period that the described content occur in the video, showcasing the model's ability to understand timestamps and search the detailed queries.

In [None]:
video_url = "https://ofasys-multimodal-wlcb-3.oss-cn-wulanchabu.aliyuncs.com/sibo.ssb/datasets/cookbook/ead2e3f0e7f836c9ec51236befdaf2d843ac13a6.mp4"
prompt = "Give the query: 'seasoning the steak', when does the described content occur in the video?"

video_path, frames, timestamps = get_video_frames(video_url, num_frames=64)
image_grid = create_image_grid(frames, num_columns=8)
display(image_grid.resize((640, 640)))

# inference
response = inference(video_path, prompt)
display(Markdown(response))

In [None]:
## Use API for inference. Apply API key here: https://bailian.console.alibabacloud.com/?apiKey=1
# os.environ['DASHSCOPE_API_KEY'] = 'your_api_key_here' 

video_url = "http://ofasys-multimodal-wlcb-3.oss-cn-wulanchabu.aliyuncs.com/sibo.ssb/cookbook/video_structured_caption_480p.mov"
prompt = "Give the query: 'The seasoned meat is placed on a grill', when does the described content occur in the video? Use ‘mm:ss.ff’ as time format."

response = inference_with_api(video_url, prompt)
display(Markdown(response))

#### 4. Structured Video Captioning

Finally, we present a scenario where the model identifies significant events within the video, providing start and end timestamps for each event along with descriptive sentences. The output is formatted in JSON for easy parsing and further processing.


In [20]:
video_url = "https://ofasys-multimodal-wlcb-3.oss-cn-wulanchabu.aliyuncs.com/sibo.ssb/datasets/cookbook/ead2e3f0e7f836c9ec51236befdaf2d843ac13a6.mp4"
prompt = "Localize a series of activity events in the video, output the start and end timestamp for each event, and describe each event with sentences. Provide the result in json format with 'mm:ss.ff' format for time depiction."

video_path, frames, timestamps = get_video_frames(video_url, num_frames=64)
# image_grid = create_image_grid(frames, num_columns=8)
# display(image_grid.resize((640, 640)))

In [None]:
# inference
response = inference(video_path, prompt)
display(Markdown(response))

- By post-processing the json results, we can intuitively present video clips and descriptions in an interleaved manner.

In [22]:
import json
import markdown
from bs4 import BeautifulSoup
from datetime import datetime


def parse_json(response):
    html = markdown.markdown(response, extensions=['fenced_code'])
    soup = BeautifulSoup(html, 'html.parser')
    json_text = soup.find('code').text

    data = json.loads(json_text)
    return data


def time_to_seconds(time_str):
    time_obj = datetime.strptime(time_str, '%M:%S.%f')
    total_seconds = time_obj.minute * 60 + time_obj.second + time_obj.microsecond / 1_000_000
    return total_seconds


In [None]:
data = parse_json(response)

for item in data:
    start_time = item["start_time"]
    end_time = item["end_time"]
    description = item["description"]

    display(Markdown(f"**{start_time} - {end_time}:**\t\t" + description))

    start_time = time_to_seconds(start_time)
    end_time = time_to_seconds(end_time)
    current_frames = []
    for frame, timestamp in zip(frames, timestamps):
        if timestamp[0] > start_time and timestamp[1] < end_time:
            current_frames.append(frame)
    
    current_frames = np.array(current_frames)
    current_image_grid = create_image_grid(current_frames, num_columns=8)

    display(current_image_grid.resize((480, (int(len(current_frames) / 8) + 1) * 60)))


In [None]:
## Use API for inference. Apply API key here: https://bailian.console.alibabacloud.com/?apiKey=1
# os.environ['DASHSCOPE_API_KEY'] = 'your_api_key_here' 

video_url = "http://ofasys-multimodal-wlcb-3.oss-cn-wulanchabu.aliyuncs.com/sibo.ssb/cookbook/video_structured_caption_480p.mov"
prompt = "Localize a series of activity events in the video, output the start and end timestamp for each event, and describe each event with sentences. Provide the result in json format with ‘mm:ss.ff’ format for time depiction."

response = inference_with_api(video_url, prompt)
display(Markdown(response))