<a href="https://colab.research.google.com/github/harpreetsahota204/qwen3vl_video/blob/main/qwen3vl_fiftyone_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Qwen3-VL Video Model for FiftyOne

This notebook demonstrates how to use the Qwen3-VL video understanding model with FiftyOne for:
- Video description and analysis
- Temporal event detection
- OCR and object tracking
- Video embeddings and similarity search


## Installation


In [None]:
!pip install -q fiftyone decord qwen-vl-utils transformers torch torchvision


## Setup


In [None]:
import fiftyone as fo
import fiftyone.zoo as foz

# Make the Qwen3-VL video repository available as a remote zoo model source.
# overwrite=True is passed through as in the upstream example
# (presumably refreshes an already-registered copy — confirm in FiftyOne docs).
foz.register_zoo_model_source(
    "https://github.com/harpreetsahota204/qwen3vl_video", overwrite=True
)


## Load Dataset


In [None]:
# Pull at most five clips from the zoo's "quickstart-video" dataset
dataset = foz.load_zoo_dataset(
    "quickstart-video",
    max_samples=5,
)

# Populate sample metadata; the temporal operations below require it
dataset.compute_metadata()

## Load Model


In [None]:
# Instantiate the Qwen3-VL 8B Instruct model through the FiftyOne model zoo.
# Heads-up: the very first load downloads the model (~16GB).
model = foz.load_zoo_model("Qwen/Qwen3-VL-8B-Instruct")


## Video Description


In [None]:
# Put the model in "description" mode and run it over every video;
# skip_failures=True keeps going if an individual sample errors out.
model.operation = "description"
dataset.apply_model(
    model,
    label_field="description",
    skip_failures=True,
)

# Show the summary stored on the first sample
sample = dataset.first()
print(sample.description_summary)


## Comprehensive Analysis


In [None]:
# Switch to the "comprehensive" operation and store results under "analysis"
model.operation = "comprehensive"
dataset.apply_model(
    model,
    label_field="analysis",
    skip_failures=True,
)

print("Analysis complete!")

# Report every dataset field whose name begins with the "analysis" prefix
analysis_fields = [
    name for name in dataset.get_field_schema() if name.startswith("analysis")
]
print(f"Fields added: {analysis_fields}")


## Temporal Event Detection


In [None]:
# Run the "temporal_localization" operation; results are written under "events"
model.operation = "temporal_localization"
dataset.apply_model(
    model,
    label_field="events",
    skip_failures=True,
)


## Video OCR


In [None]:
# Run the "ocr" operation; results are written under "ocr"
model.operation = "ocr"
dataset.apply_model(
    model,
    label_field="ocr",
    skip_failures=True,
)

print("OCR complete!")


## Video Embeddings


In [None]:
# Select the "mean" pooling strategy on the model before embedding
model.pooling_strategy = "mean"

# Embed each video and store the result in the "qwen_embeddings" field;
# samples that fail are skipped rather than aborting the run.
dataset.compute_embeddings(
    model, embeddings_field="qwen_embeddings", skip_failures=True
)

print("Embeddings computed!")


## Video Similarity


In [None]:
import fiftyone.brain as fob

# Build a similarity index over the embeddings stored in "qwen_embeddings"
fob.compute_similarity(
    dataset,
    brain_key="qwen_similarity",
    embeddings="qwen_embeddings",
)

# Find the videos most similar to the first sample.
# sort_by_similarity() expects the query as a sample ID (or an embedding
# vector / text prompt), not a Sample object, so pass the sample's ID.
query_sample = dataset.first()
similar_view = dataset.sort_by_similarity(
    query_sample.id,
    k=5,
    brain_key="qwen_similarity",
)

print(f"Found {len(similar_view)} similar videos")


## UMAP Visualization


In [None]:
# Project the stored "qwen_embeddings" down to 2 dimensions with UMAP and
# register the run under the "qwen_viz" brain key.
results = fob.compute_visualization(
    dataset,
    embeddings="qwen_embeddings",
    method="umap",
    num_dims=2,
    brain_key="qwen_viz",
)

print("UMAP visualization computed!")


## Launch FiftyOne App


In [None]:
# Launch the FiftyOne App on the dataset. auto=False is passed so the App is
# not shown automatically (NOTE(review): confirm exact semantics of `auto`
# against the FiftyOne session docs).
session = fo.launch_app(dataset, auto=False)
# Bare last expression: the notebook displays the App's URL as the cell output.
session.url


## Custom Prompts (Optional)


In [None]:
# Use the "custom" operation with a user-supplied prompt that sketches the
# JSON structure the model is asked to produce.
model.operation = "custom"
model.custom_prompt = """Analyze this video and provide:
{
  "content_type": "educational/entertainment/promotional/other",
  "mood": "calm/energetic/dramatic/neutral"
}
"""

# Run the custom prompt over the dataset; output goes under "custom_analysis"
dataset.apply_model(
    model,
    label_field="custom_analysis",
    skip_failures=True,
)

# Inspect the result stored on the first sample
sample = dataset.first()
print(sample.custom_analysis_result)
