# Generating Video Highlights Using the SmolVLM2 Model

<img src="https://pyimagesearch.com/wp-content/uploads/2025/06/generating-video-highlights-using-the-smolvlm2-model-featured-v2.png" alt="Generating Video Highlights Using the SmolVLM2 Model" width="100%" height="100%" />

----

### Conda env : [cv_playgrounds](../README.md#setup-a-conda-environment)

----

### Reference:

- ***Blogs***
    - [Generating Video Highlights Using the SmolVLM2 Model](https://pyimagesearch.com/2025/06/30/generating-video-highlights-using-the-smolvlm2-model/)
    - [SmolVLM2: Bringing Video Understanding to Every Device](https://huggingface.co/blog/smolvlm2)
    - [SmolVLM-256M-Instruct model card (Hugging Face)](https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct)




## Configuring Your Development Environment

In [1]:
import torch

# Select a torch device string: Apple MPS if available, otherwise CUDA,
# otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    g_device = "mps"
elif torch.cuda.is_available():
    g_device = "cuda"
    # IPython shell escape (notebook-only syntax): print the GPU status table.
    !nvidia-smi
else:
    g_device = "cpu"

print(f"Available device : {g_device}")

Thu Sep 18 12:05:51 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 575.57.08              Driver Version: 575.57.08      CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti     On  |   00000000:01:00.0  On |                  N/A |
| 24%   45C    P5             38W /  250W |    1485MiB /  11264MiB |     33%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

### Setup and Imports

In [2]:
import os
import json
import torch
import tempfile
import gradio as gr
import logging
import subprocess
from pathlib import Path
from transformers import AutoProcessor, AutoModelForImageTextToText

### Setup Logger

In [3]:
# Configure the root logger to emit INFO and above, then create the logger
# used by the functions in this notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

### Get Video Duration in Seconds

In [4]:
def get_video_duration_seconds(video_path: str) -> float:
    """Return the duration of a media file in seconds, as reported by ffprobe.

    Args:
        video_path: Path of the video file to inspect.

    Returns:
        The container-level duration (``format.duration`` in ffprobe's JSON
        output) converted to ``float`` seconds.

    Raises:
        RuntimeError: If the ffprobe executable cannot be found, ffprobe exits
            with a non-zero status (e.g. missing or unreadable file), or its
            output does not contain a parseable duration.
    """
    cmd = [
        "ffprobe",
        # "error" instead of "quiet" so a failure leaves a message on stderr
        # that can be included in the exception below.
        "-v", "error",
        "-print_format", "json",
        "-show_format",
        video_path,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except FileNotFoundError as err:
        raise RuntimeError(
            "ffprobe executable not found; is FFmpeg installed and on PATH?"
        ) from err

    if result.returncode != 0:
        raise RuntimeError(
            f"ffprobe failed for {video_path!r} "
            f"(exit code {result.returncode}): {result.stderr.strip()}"
        )

    try:
        # json.JSONDecodeError is a ValueError subclass, so malformed output
        # and a non-numeric duration are both covered here.
        return float(json.loads(result.stdout)["format"]["duration"])
    except (KeyError, TypeError, ValueError) as err:
        raise RuntimeError(
            f"ffprobe did not report a duration for {video_path!r}"
        ) from err

### Load Model and Processor

In [5]:
def load_model_and_processor(model_path: str, device: str = "cuda", dtype=torch.bfloat16):
    """Load the processor and the image-text-to-text model for ``model_path``.

    Args:
        model_path: Model identifier or path handed to ``from_pretrained``.
        device: Device the model is moved to after loading.
        dtype: Value forwarded as ``torch_dtype`` when loading the model.

    Returns:
        A ``(processor, model)`` tuple.
    """
    chat_processor = AutoProcessor.from_pretrained(model_path)
    # NOTE: _attn_implementation="flash_attention_2" is left disabled here.
    vlm = AutoModelForImageTextToText.from_pretrained(model_path, torch_dtype=dtype)
    vlm = vlm.to(device)
    return chat_processor, vlm

### Analyze Video Content

In [6]:
def analyze_video_content(processor, model, video_path: str, device: str = "cuda") -> str:
    """Ask the model for a free-text description of a video.

    Args:
        processor: Processor providing ``apply_chat_template`` and ``decode``.
        model: Generative model providing ``generate``.
        video_path: Path of the video placed in the user message.
        device: Device the tokenized inputs are moved to.

    Returns:
        The model's reply, lower-cased and stripped of surrounding whitespace.
    """
    system_message = "You are a helpful assistant that can understand videos. Describe what type of video this is and what's happening in it."
    user_question = "What type of video is this and what's happening in it? Be specific about the content type and general activities you observe."
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_message}],
        },
        {
            "role": "user",
            "content": [
                {"type": "video", "path": video_path},
                {"type": "text", "text": user_question},
            ],
        },
    ]
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(device, dtype=torch.bfloat16)
    outputs = model.generate(**inputs, max_new_tokens=512, do_sample=True, temperature=0.7)
    # The generated sequence starts with the prompt tokens. Decode only what
    # follows them instead of splitting the full text on an "assistant: "
    # marker, which raised IndexError when the marker was missing and cut the
    # reply short when the reply itself contained the marker.
    prompt_length = inputs["input_ids"].shape[1]
    reply = processor.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return reply.strip().lower()

### Determine Highlights

In [7]:
def determine_highlights(processor, model, video_description: str, prompt_num: int = 1, device: str = "cuda") -> str:
    """Ask the model which kinds of moments would make good highlights.

    Args:
        processor: Processor providing ``apply_chat_template`` and ``decode``.
        model: Generative model providing ``generate``.
        video_description: Text description of the video, embedded in the
            user message.
        prompt_num: Which system/user prompt pair to use (1 or 2).
        device: Device the tokenized inputs are moved to.

    Returns:
        The model's reply, stripped of surrounding whitespace.

    Raises:
        KeyError: If ``prompt_num`` is not one of the defined prompt numbers.
    """
    system_prompts = {
        1: "You are a highlight editor. List archetypal dramatic moments that would make compelling highlights if they appear in the video. Each moment should be specific enough to be recognizable but generic enough to potentially exist in other videos of this type.",
        2: "You are a helpful visual-language assistant that can understand videos and edit. You are tasked with helping the user to create highlight reels for videos. Highlights should be rare and important events in the video in question.",
    }
    user_prompts = {
        1: "List potential highlight moments to look for in this video:",
        2: "List dramatic moments that would make compelling highlights if they appear in the video. Each moment should be specific enough to be recognizable but generic enough to potentially exist in any video of this type:",
    }
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_prompts[prompt_num]}],
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": f"""Here is a description of a video:\n\n{video_description}\n\n{user_prompts[prompt_num]}"""}],
        },
    ]
    print(f"Using prompt {prompt_num} for highlight detection")
    print(messages)
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(device, dtype=torch.bfloat16)
    outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7)
    # Decode only the tokens generated after the prompt. Splitting the full
    # text on "Assistant: " raised IndexError when the marker was absent and
    # dropped everything after a second occurrence of the marker.
    prompt_length = inputs["input_ids"].shape[1]
    reply = processor.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return reply.strip()

### Process Video Segment

In [8]:
def process_segment(processor, model, video_path: str, highlight_types: str, device: str = "cuda") -> bool:
    """Decide whether a video segment matches any of the described highlights.

    The model is asked to answer 'yes' or 'no' (with a justification). The
    first standalone "yes" or "no" word in its reply decides the result.

    Args:
        processor: Processor providing ``apply_chat_template`` and ``decode``.
        model: Generative model providing ``generate``.
        video_path: Path of the segment placed in the user message.
        highlight_types: Text listing the highlight examples to compare with.
        device: Device the tokenized inputs are moved to.

    Returns:
        True if the first yes/no word in the reply is "yes"; False if it is
        "no" or if the reply contains neither word.
    """
    import re  # local import: the surrounding file does not import re

    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are a video highlight analyzer. Your role is to identify moments that have high dramatic value, focusing on displays of skill, emotion, personality, or tension. Compare video segments against provided example highlights to find moments with similar emotional impact and visual interest, even if the specific actions differ."}],
        },
        {
            "role": "user",
            "content": [
                {"type": "video", "path": video_path},
                {"type": "text", "text": f"""Given these highlight examples:\n{highlight_types}\n\nDoes this video contain a moment that matches the core action of one of the highlights? Answer with:\n'yes' or 'no'\nIf yes, justify it"""},
            ],
        },
    ]
    print(messages)
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(device, dtype=torch.bfloat16)
    outputs = model.generate(**inputs, max_new_tokens=64, do_sample=False)
    # Decode only the tokens generated after the prompt; the prompt itself
    # contains the words 'yes' and 'no' and must not influence the verdict.
    prompt_length = inputs["input_ids"].shape[1]
    response = processor.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip().lower()
    print(f"Segment response {response}")
    # A plain substring test ("yes" in response) also fires on words such as
    # "eyes", so match whole words and let the first yes/no answer decide.
    verdict = re.search(r"\b(yes|no)\b", response)
    return verdict is not None and verdict.group(1) == "yes"

### Concatenating Video Scenes into a Final Highlight Reel

In [None]:
def concatenate_scenes(video_path: str, scene_times: list, output_path: str):
    """Cut the given time ranges out of a video and join them into one file.

    Args:
        video_path: Path of the source video.
        scene_times: Sequence of ``(start_sec, end_sec)`` pairs, in the order
            they should appear in the output.
        output_path: Path of the video file to write (overwritten if present).

    Returns:
        None. When ``scene_times`` is empty a warning is logged and nothing is
        written.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    if not scene_times:
        logger.warning("No scenes to concatenate, skipping.")
        return

    # List the audio stream indexes of the input. A video without an audio
    # track has no [0:a] pad, and referencing it makes ffmpeg fail.
    probe = subprocess.run(
        [
            "ffprobe",
            "-v", "error",
            "-select_streams", "a",
            "-show_entries", "stream=index",
            "-of", "csv=p=0",
            video_path,
        ],
        capture_output=True,
        text=True,
    )
    # If the probe itself fails, keep the audio branch so ffmpeg reports the
    # underlying problem with the input file.
    has_audio = probe.returncode != 0 or bool(probe.stdout.strip())

    filter_complex_parts = []
    concat_inputs = []
    for i, (start_sec, end_sec) in enumerate(scene_times):
        filter_complex_parts.append(
            f"[0:v]trim=start={start_sec}:end={end_sec},"
            f"setpts=PTS-STARTPTS[v{i}];"
        )
        concat_inputs.append(f"[v{i}]")
        if has_audio:
            filter_complex_parts.append(
                f"[0:a]atrim=start={start_sec}:end={end_sec},"
                f"asetpts=PTS-STARTPTS[a{i}];"
            )
            concat_inputs.append(f"[a{i}]")

    output_labels = "[outv][outa]" if has_audio else "[outv]"
    concat_filter = (
        f"{''.join(concat_inputs)}"
        f"concat=n={len(scene_times)}:v=1:a={1 if has_audio else 0}{output_labels}"
    )
    filter_complex = "".join(filter_complex_parts) + concat_filter

    cmd = [
        "ffmpeg",
        "-y",
        "-i", video_path,
        "-filter_complex", filter_complex,
        "-map", "[outv]",
    ]
    if has_audio:
        cmd += ["-map", "[outa]"]
    cmd += ["-c:v", "libx264"]
    if has_audio:
        cmd += ["-c:a", "aac"]
    cmd.append(output_path)

    logger.info("Running ffmpeg command: %s", " ".join(cmd))
    subprocess.run(cmd, check=True)


### Interface Logic

In [10]:
def create_ui(model_path: str, device=None):
    """Build the Gradio app that turns an uploaded video into a highlight reel.

    Pressing the button runs a generator that streams status updates while it
    describes the video, derives two sets of highlight criteria, checks the
    video in 10-second segments against both sets, and concatenates the
    segments of the chosen set into the output video.

    Args:
        model_path: Model identifier or path passed to
            ``load_model_and_processor``.
        device: Device string used for the model and its inputs. When None
            (the default), "cuda" is used if available, then "mps", then "cpu".

    Returns:
        The ``gr.Blocks`` app; the caller is responsible for launching it.
    """
    if device is None:
        if torch.cuda.is_available():
            device = "cuda"
        elif torch.backends.mps.is_available():
            device = "mps"
        else:
            device = "cpu"

    max_duration = 1800  # 30 minutes
    segment_length = 10.0  # seconds per analysed segment
    preview_chars = 500  # longest text shown in the details panel

    def preview(text):
        """Shorten long model output for display."""
        return text[:preview_chars] + "..." if len(text) > preview_chars else text

    def ui_state(message, description="", highlights="", video_update=None, show_details=False):
        """Build one update for (status, description, highlights, video, accordion)."""
        if video_update is None:
            video_update = gr.update(visible=False)
        return [
            message,
            description,
            highlights,
            video_update,
            gr.update(visible=show_details),
        ]

    with gr.Blocks() as app:
        gr.Markdown("## Generate Video Highlights Using SmolVLM2 Model")
        with gr.Row():
            with gr.Column(scale=1):
                input_video = gr.Video(
                    label="Upload your video (max 30 minutes)",
                    interactive=True
                )
                process_btn = gr.Button("Process Video", variant="primary")
            with gr.Column(scale=1):
                output_video = gr.Video(
                    label="Highlight Video",
                    visible=False,
                    interactive=False,
                )
                status = gr.Markdown()
                analysis_accordion = gr.Accordion(
                    "Chain of thought details",
                    open=True,
                    visible=False
                )
                with analysis_accordion:
                    video_description = gr.Markdown("", elem_id="video_desc")
                    highlight_types = gr.Markdown("", elem_id="highlight_types")

        def on_process(video):
            # Clear all components when starting new processing
            yield ui_state("", video_update=gr.update(value=None, visible=False))
            if not video:
                yield ui_state("Please upload a video")
                return
            try:
                duration = get_video_duration_seconds(video)
                if duration > max_duration:
                    yield ui_state("Video must be shorter than 30 minutes")
                    return

                yield ui_state("Initializing video highlight detector...")
                processor, model = load_model_and_processor(model_path, device=device)

                yield ui_state("Analyzing video content...", show_details=True)
                video_desc = analyze_video_content(processor, model, video, device=device)
                formatted_desc = f"### Summary:\n {preview(video_desc)}"

                yield ui_state(
                    "Determining highlight types (2 variations)...",
                    formatted_desc,
                    show_details=True,
                )
                # Get two different sets of highlights
                highlights1 = determine_highlights(processor, model, video_desc, prompt_num=1, device=device)
                highlights2 = determine_highlights(processor, model, video_desc, prompt_num=2, device=device)
                formatted_highlights = f"### Highlights to search for:\nSet 1:\n{preview(highlights1)}\n\nSet 2:\n{preview(highlights2)}"

                kept_segments1 = []
                kept_segments2 = []
                segment_starts = range(0, int(duration), int(segment_length))
                # Count the segments actually iterated: int(duration / length)
                # is 0 for clips shorter than one segment (division by zero in
                # the progress computation) and too small when the duration is
                # not a multiple of the segment length.
                total_segments = len(segment_starts)

                # A per-run temporary directory is removed even when a step
                # fails and cannot collide with another run.
                with tempfile.TemporaryDirectory(prefix="temp_segments_") as temp_dir:
                    for segments_processed, start_time in enumerate(segment_starts):
                        progress = int((segments_processed / total_segments) * 100)
                        yield ui_state(
                            f"Processing segments... {progress}% complete",
                            formatted_desc,
                            formatted_highlights,
                            show_details=True,
                        )
                        # Create segment
                        segment_path = os.path.join(temp_dir, f"segment_{start_time}.mp4")
                        end_time = min(start_time + segment_length, duration)
                        cmd = [
                            "ffmpeg",
                            "-y",
                            "-i", video,
                            "-ss", str(start_time),
                            "-t", str(segment_length),
                            "-c:v", "libx264",
                            "-preset", "ultrafast",  # Use ultrafast preset for speed
                            "-pix_fmt", "yuv420p",   # Ensure compatible pixel format
                            segment_path
                        ]
                        subprocess.run(cmd, check=True)
                        # Process segment with both highlight sets
                        if process_segment(processor, model, segment_path, highlights1, device=device):
                            print("KEEPING SEGMENT FOR SET 1")
                            kept_segments1.append((start_time, end_time))
                        if process_segment(processor, model, segment_path, highlights2, device=device):
                            print("KEEPING SEGMENT FOR SET 2")
                            kept_segments2.append((start_time, end_time))
                        # Remove the segment right away to keep disk usage low
                        os.remove(segment_path)

                # Calculate percentages of video kept for each highlight set
                duration1 = sum(end - start for start, end in kept_segments1)
                duration2 = sum(end - start for start, end in kept_segments2)
                percent1 = (duration1 / duration) * 100 if duration > 0 else 0.0
                percent2 = (duration2 / duration) * 100 if duration > 0 else 0.0
                print(f"Highlight set 1: {percent1:.1f}% of video")
                print(f"Highlight set 2: {percent2:.1f}% of video")

                # Choose the set with lower percentage unless it's zero
                use_set2 = 0 < percent2 <= percent1 or percent1 == 0
                final_segments = kept_segments2 if use_set2 else kept_segments1

                # Create final video
                if final_segments:
                    # Reserve a unique output path and close the handle before
                    # ffmpeg writes to that path.
                    with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp_file:
                        temp_output = tmp_file.name
                    concatenate_scenes(video, final_segments, temp_output)
                    selected_set = "2" if use_set2 else "1"
                    percent_used = percent2 if use_set2 else percent1
                    completion_message = f"Processing complete! Used highlight set {selected_set} ({percent_used:.1f}% of video)"
                    yield ui_state(
                        completion_message,
                        formatted_desc,
                        formatted_highlights,
                        video_update=gr.update(value=temp_output, visible=True),
                        show_details=True,
                    )
                else:
                    yield ui_state(
                        "No highlights detected in the video with either set of criteria.",
                        formatted_desc,
                        formatted_highlights,
                        show_details=True,
                    )
            except Exception as e:
                logger.exception("Error processing video")
                yield ui_state(f"Error processing video: {str(e)}")
            finally:
                # Clean up
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

        process_btn.click(
            on_process,
            inputs=[input_video],
            outputs=[
                status,
                video_description,
                highlight_types,
                output_video,
                analysis_accordion
            ],
            queue=True,
        )
    return app







## Launch the Gradio Application

In [None]:
# NOTE(review): `device` is computed here but is not passed to create_ui()
# on the next line — confirm whether it is still needed.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Build the Gradio app for the SmolVLM2 2.2B instruct checkpoint.
app = create_ui("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
# Enable request queuing, then launch the app.
app.queue().launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




IMPORTANT: You are using gradio version 3.50.0, however version 4.44.1 is available, please upgrade.
--------


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using prompt 1 for highlight detection
[{'role': 'system', 'content': [{'type': 'text', 'text': 'You are a highlight editor. List archetypal dramatic moments that would make compelling highlights if they appear in the video. Each moment should be specific enough to be recognizable but generic enough to potentially exist in other videos of this type.'}]}, {'role': 'user', 'content': [{'type': 'text', 'text': 'Here is a description of a video:\n\nthe video appears to be a demonstration of a medical device or equipment being handled by medical professionals. the individuals in the video are wearing blue lab coats and are handling various medical instruments, possibly including a stethoscope and a clipboard. the setting is a clinical or medical environment with white walls and a clean, clinical appearance.\n\nList potential highlight moments to look for in this video:'}]}]
Using prompt 2 for highlight detection
[{'role': 'system', 'content': [{'type': 'text', 'text': 'You are a helpful vis

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

[{'role': 'system', 'content': [{'type': 'text', 'text': 'You are a video highlight analyzer. Your role is to identify moments that have high dramatic value, focusing on displays of skill, emotion, personality, or tension. Compare video segments against provided example highlights to find moments with similar emotional impact and visual interest, even if the specific actions differ.'}]}, {'role': 'user', 'content': [{'type': 'video', 'path': 'temp_segments/segment_0.mp4'}, {'type': 'text', 'text': "Given these highlight examples:\nHighlights:\n\n1. A medical professional with a blue lab coat handling a stethoscope on a patient's chest.\n2. A medical professional taking a blood pressure reading on a patient's arm.\n3. A medical professional using a clipboard to record patient information, such as name, age, and medical history.\n4. A medical professional performing a procedure or examination on a patient, such as an electrocardiogram (ECG) or a physical exam.\n5. A medical professional 

INFO:__main__:Running ffmpeg command: ffmpeg -y -i /tmp/gradio/1c729b355deb782f0fd0f22ddcc54b6b9ebf53f8/test01.mp4 -filter_complex [0:v]trim=start=0:end=10.0,setpts=PTS-STARTPTS[v0];[0:a]atrim=start=0:end=10.0,asetpts=PTS-STARTPTS[a0];[v0][a0]concat=n=1:v=1:a=1[outv][outa] -map [outv] -map [outa] -c:v libx264 -c:a aac /tmp/tmp0txel9p9.mp4


Segment response no
Highlight set 1: 100.0% of video
Highlight set 2: 0.0% of video


ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab