# Search Notebook
Use this notebook when you only need to load the previously built scene metadata and FAISS index
and run CLIP + FAISS text searches, without re-extracting scene embeddings.


## Environment
Ensure the environment matches the one used to build the index (PyTorch, transformers,
faiss-cpu, etc.). Uncomment the install cell below if you need to install the packages.


In [None]:
# Optional dependency install
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# %pip install transformers faiss-cpu pillow


In [1]:
import json
from pathlib import Path
from typing import Dict, List

import numpy as np
import torch
import torch.nn.functional as F
from transformers import CLIPModel, CLIPProcessor
import faiss


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# All artifact locations are anchored to the resolved project root so they keep
# pointing at the same files even if the working directory changes later.
PROJECT_ROOT = Path('.').resolve()
INDEX_ROOT = PROJECT_ROOT / 'notebook_artifacts' / 'index'

# FAISS artifacts live in `<project root>/faiss`, not under INDEX_ROOT.
# NOTE(review): confirm this split from the other index artifacts is intentional.
FAISS_ROOT = PROJECT_ROOT / 'faiss'
FAISS_INDEX_PATH = FAISS_ROOT / 'faiss.index'
FAISS_META_PATH = FAISS_ROOT / 'faiss_scene_meta.json'
FAISS_SCENE_IDS_PATH = FAISS_ROOT / 'faiss_scene_ids.json'

# JSON artifacts read at notebook start-up.
VIDEO_METADATA_PATH = INDEX_ROOT / 'video_metadata.json'
SCENE_RECORDS_PATH = INDEX_ROOT / 'scene_records.json'

print('Index root:', INDEX_ROOT)


Index root: /home/khushpatel/shortform-video-ranker/notebook_artifacts/index


In [3]:
def load_json(path: Path) -> Dict:
    """Read and parse a JSON file.

    Args:
        path: Location of the JSON file (a ``Path`` or a plain string).

    Returns:
        The decoded JSON value, or ``{}`` when the file is empty or contains
        only whitespace.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If the file has non-blank content that is not valid JSON.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(path)
    # Decode explicitly as UTF-8 instead of relying on the platform's locale encoding.
    text = path.read_text(encoding='utf-8')
    # A blank file (e.g. just a trailing newline) is treated the same as an empty one.
    return json.loads(text) if text.strip() else {}

# Load the JSON artifacts written alongside the index.
video_metadata, scene_records = load_json(VIDEO_METADATA_PATH), load_json(SCENE_RECORDS_PATH)

# Empty placeholders for the FAISS state; `load_faiss_artifacts` replaces them.
faiss_scene_meta, faiss_scene_ids, faiss_index = [], [], None


In [4]:
# Run on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Processor and model weights are loaded from the same checkpoint name.
clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
clip_model = clip_model.to(device)
print('Loaded CLIP on', device)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loaded CLIP on cpu


In [5]:
def load_faiss_artifacts(index_path=FAISS_INDEX_PATH, meta_path=FAISS_META_PATH, ids_path=FAISS_SCENE_IDS_PATH):
    """Load the FAISS index and its JSON side files into the notebook globals.

    Everything is read into locals first and only published to
    ``faiss_index`` / ``faiss_scene_meta`` / ``faiss_scene_ids`` once all three
    artifacts have loaded and been checked, so a failed load never leaves a
    new index paired with stale metadata.

    Args:
        index_path: Path to the serialized FAISS index.
        meta_path: Path to the JSON scene metadata, looked up by FAISS row
            position in ``search_with_faiss``.
        ids_path: Path to the JSON scene-id list.

    Returns:
        ``{'entries': <number of scene ids loaded>}``.

    Raises:
        ValueError: If the metadata does not have exactly one entry per
            indexed vector.
    """
    global faiss_index, faiss_scene_meta, faiss_scene_ids

    loaded_index = faiss.read_index(str(index_path))
    loaded_meta = json.loads(Path(meta_path).read_text(encoding='utf-8'))
    loaded_ids = json.loads(Path(ids_path).read_text(encoding='utf-8'))

    # `search_with_faiss` does `faiss_scene_meta[row]` for every returned row,
    # so a length mismatch would raise IndexError or mislabel hits at query time.
    if len(loaded_meta) != loaded_index.ntotal:
        raise ValueError(
            f'FAISS index holds {loaded_index.ntotal} vectors but '
            f'{meta_path} has {len(loaded_meta)} metadata entries.'
        )

    # Publish all three together now that the load is known to be consistent.
    faiss_index, faiss_scene_meta, faiss_scene_ids = loaded_index, loaded_meta, loaded_ids
    return {'entries': len(faiss_scene_ids)}

faiss_stats = load_faiss_artifacts()
faiss_stats


{'entries': 23820}

In [6]:
def search_with_faiss(text: str, top_scene_hits: int = 30, top_videos: int = 5) -> Dict:
    """Search the loaded FAISS index with a CLIP text embedding and rank videos.

    The query is embedded with CLIP, L2-normalised and sent to the index.
    Scene hits are then collapsed to one entry per video (the best-scoring
    scene) and the videos are sorted by that score, highest first.

    Args:
        text: Free-text query. Text longer than the tokenizer's maximum length
            is truncated rather than failing inside the text encoder.
        top_scene_hits: Maximum number of scenes to request from the index.
        top_videos: Maximum number of videos to return; values <= 0 yield none.

    Returns:
        A dict with two keys:
        ``'scene_hits'``: one dict per returned scene (``scene_id``,
        ``video_id``, ``score``, ``start_frame``, ``end_frame``), in the order
        the index returned them.
        ``'video_results'``: one dict per ranked video (``video_id``,
        ``video_uri``, ``video_duration``, ``score``, ``scene_id``,
        ``scene_frames``).
        Both lists are empty when the index is empty or ``top_scene_hits <= 0``.

    Raises:
        ValueError: If the FAISS index has not been loaded yet.
    """
    if faiss_index is None:
        raise ValueError('FAISS index not loaded.')

    # FAISS rejects k <= 0, so answer "no results" directly, before paying for
    # the text encoder, when the index is empty or no hits were requested.
    k = min(top_scene_hits, faiss_index.ntotal)
    if k <= 0:
        return {'scene_hits': [], 'video_results': []}

    # truncation=True keeps over-long queries within the tokenizer's maximum length.
    text_inputs = clip_processor(
        text=[text], return_tensors='pt', padding=True, truncation=True
    ).to(device)
    with torch.inference_mode():
        text_features = clip_model.get_text_features(**text_inputs)
    # Unit-length float32 query vector, moved to the CPU for the index search.
    text_features = F.normalize(text_features, dim=-1).cpu().numpy().astype('float32')
    scores, indices = faiss_index.search(text_features, k)

    scene_hits = []
    for score, idx in zip(scores[0], indices[0]):
        # A negative row id marks a result slot the index could not fill.
        if idx < 0:
            continue
        meta = faiss_scene_meta[int(idx)]
        scene_hits.append({
            'scene_id': meta['scene_id'],
            'video_id': meta['video_id'],
            'score': float(score),
            'start_frame': meta.get('start_frame'),
            'end_frame': meta.get('end_frame'),
        })

    # Keep only the best-scoring scene for each video.
    best_hit_per_video = {}
    for hit in scene_hits:
        best = best_hit_per_video.get(hit['video_id'])
        if best is None or hit['score'] > best['score']:
            best_hit_per_video[hit['video_id']] = hit

    ranked = sorted(best_hit_per_video.values(), key=lambda item: item['score'], reverse=True)
    # Clamp so a negative top_videos means "none" instead of slicing from the end.
    ranked = ranked[:max(top_videos, 0)]

    results = []
    for hit in ranked:
        # Videos missing from video_metadata still appear, with None for uri/duration.
        md = video_metadata.get(hit['video_id'], {})
        results.append({
            'video_id': hit['video_id'],
            'video_uri': md.get('video_uri'),
            'video_duration': md.get('video_duration'),
            'score': hit['score'],
            'scene_id': hit['scene_id'],
            'scene_frames': (hit['start_frame'], hit['end_frame']),
        })

    return {'scene_hits': scene_hits, 'video_results': results}


In [8]:
# Example text query: request up to 30 scene hits and show at most 10 videos.
query = 'a person talking to the camera'
faiss_search_results = search_with_faiss(
    query,
    top_scene_hits=30,
    top_videos=10,
)
faiss_search_results['video_results']


[{'video_id': '3ad0ae0c-6ae8-47e1-9753-5ca7caa89b54',
  'video_uri': '/home/khushpatel/shortform-video-ranker/data/1/TrainValVideo/video1758.mp4',
  'video_duration': 0.0,
  'score': 0.30197227001190186,
  'scene_id': '83cb72d8-e80b-4673-92ba-5877c12b0db5',
  'scene_frames': (0, 499)},
 {'video_id': '6a722f97-bc48-4548-ba7d-ac3530ed6f5a',
  'video_uri': '/home/khushpatel/shortform-video-ranker/data/1/TrainValVideo/video5990.mp4',
  'video_duration': 0.0,
  'score': 0.3013308644294739,
  'scene_id': '6015a9f8-58df-4eca-bfa8-b9cb9021e682',
  'scene_frames': (356, 360)},
 {'video_id': '958959dc-2f63-4799-a2fc-86a9a9c80290',
  'video_uri': '/home/khushpatel/shortform-video-ranker/data/1/TrainValVideo/video1050.mp4',
  'video_duration': 0.0,
  'score': 0.30076122283935547,
  'scene_id': '2e4f7788-cb01-4102-82df-b42341f2920a',
  'scene_frames': (418, 521)},
 {'video_id': '2f1b9870-0c66-45be-abd7-d3e0b5204948',
  'video_uri': '/home/khushpatel/shortform-video-ranker/data/1/TrainValVideo/video