# Search Notebook
Use this notebook when you just need to load the previously indexed scene metadata + FAISS index
and run CLIP+FAISS searches without re-extracting scene embeddings.


## Environment
Ensure the environment matches whatever you used to build the index (PyTorch, transformers,
faiss-cpu, etc.). Uncomment the pip cell if you need to install packages quickly.


In [None]:
# Optional dependency install
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# %pip install transformers faiss-cpu pillow google-generativeai python-dotenv


In [22]:
import os
import json
from pathlib import Path
from typing import Dict, List

import numpy as np
import torch
import torch.nn.functional as F
from transformers import CLIPModel, CLIPProcessor
import faiss
import google.generativeai as genai


In [23]:
# Resolve the project root once so every artifact path below is absolute and
# keeps pointing at the same files even if the working directory changes later.
PROJECT_ROOT = Path('.').resolve()
INDEX_ROOT = PROJECT_ROOT / 'notebook_artifacts' / 'index'
FAISS_ROOT = PROJECT_ROOT / 'faiss'

# FAISS artifacts: the serialized index plus the JSON side files read by
# load_faiss_artifacts.
FAISS_INDEX_PATH = FAISS_ROOT / 'faiss.index'
FAISS_META_PATH = FAISS_ROOT / 'faiss_scene_meta.json'
FAISS_SCENE_IDS_PATH = FAISS_ROOT / 'faiss_scene_ids.json'

# JSON artifacts stored under the index directory, and the captions file.
VIDEO_METADATA_PATH = INDEX_ROOT / 'video_metadata.json'
SCENE_RECORDS_PATH = INDEX_ROOT / 'scene_records.json'
MSRVTT_CAPTIONS_PATH = PROJECT_ROOT / 'msrvtt_train_7k.json'

print('Index root:', INDEX_ROOT)


Index root: /home/khushpatel/shortform-video-ranker/notebook_artifacts/index


In [24]:
def load_json(path: Path) -> Dict:
    """Read and parse a JSON file.

    Args:
        path: Location of the JSON file. A plain string is accepted as well
            and converted to a ``Path``.

    Returns:
        The parsed JSON value, or an empty dict when the file is empty or
        contains only whitespace.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If the file has non-blank content that is not
            valid JSON.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(path)
    # Read as UTF-8 explicitly so parsing does not depend on the locale default.
    text = path.read_text(encoding='utf-8')
    # Treat a whitespace-only file like an empty one instead of letting
    # json.loads fail on it.
    return json.loads(text) if text.strip() else {}

# JSON artifacts read from the index directory; load_json raises
# FileNotFoundError if either file is missing.
video_metadata = load_json(VIDEO_METADATA_PATH)
scene_records = load_json(SCENE_RECORDS_PATH)
# Module-level state filled in later: the three FAISS names are assigned by
# load_faiss_artifacts and gemini_model by configure_gemini.
# Re-running this cell resets them to their empty/None values.
faiss_scene_meta = []
faiss_scene_ids = []
faiss_index = None
gemini_model = None


In [25]:
def load_msrvtt_captions(path=None):
    """Build a caption lookup from a JSON list of caption records.

    Each record may carry ``video_id``, ``video`` (a file name or path) and
    ``caption`` (a single string or a list of strings). The captions of a
    record are filed under up to three keys: the stringified ``video_id``,
    the file name of ``video``, and that file name without its extension.

    Args:
        path: Captions file as a ``Path`` or string. When omitted,
            ``MSRVTT_CAPTIONS_PATH`` is looked up at call time, so re-running
            the configuration cell takes effect without redefining this
            function.

    Returns:
        Dict mapping each lookup key to its captions, de-duplicated with
        first-seen order preserved. Empty when the file does not exist (a
        warning is printed in that case).
    """
    path = Path(MSRVTT_CAPTIONS_PATH if path is None else path)
    if not path.exists():
        print(f'Warning: captions file not found at {path}')
        return {}
    # Read as UTF-8 explicitly so parsing does not depend on the locale default.
    records = json.loads(path.read_text(encoding='utf-8'))

    caption_index = {}
    for item in records:
        captions = item.get('caption') or []
        if isinstance(captions, str):
            captions = [captions]
        # A set avoids extending twice when video_id equals the file stem.
        keys = set()
        video_id = item.get('video_id')
        if video_id:
            keys.add(str(video_id))
        video_name = item.get('video')
        if video_name:
            name = Path(video_name).name
            keys.update((name, Path(name).stem))
        for key in keys:
            caption_index.setdefault(key, []).extend(captions)

    # dict.fromkeys drops repeats while keeping the first occurrence order.
    return {key: list(dict.fromkeys(values)) for key, values in caption_index.items()}

# Build the caption lookup once for the rest of the notebook.
# The number printed is the count of lookup keys (ids, file names and stems),
# which can exceed the number of distinct videos.
msrvtt_captions = load_msrvtt_captions()
print(f'Loaded captions for {len(msrvtt_captions)} videos')


Loaded captions for 14020 videos


In [26]:
def configure_gemini(api_key: str | None = None, model_name: str = 'gemini-1.5-pro'):
    global gemini_model
    key = api_key or os.getenv('GEMINI_API_KEY')
    if not key:
        raise ValueError('Provide a Gemini API key via argument or GEMINI_API_KEY env var.')
    genai.configure(api_key=key)
    gemini_model = genai.GenerativeModel(model_name)
    return {'model_name': model_name}


In [27]:
# Use the GPU when PyTorch reports one, otherwise stay on the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# Processor and model are loaded from the same checkpoint name; the model is
# then moved to the selected device.
clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
clip_model = clip_model.to(device)
print('Loaded CLIP on', device)


Loaded CLIP on cpu


In [28]:
def load_faiss_artifacts(index_path=FAISS_INDEX_PATH, meta_path=FAISS_META_PATH, ids_path=FAISS_SCENE_IDS_PATH):
    """Load the FAISS index and its JSON side files into module-level state.

    All three artifacts are read into locals first and only then assigned to
    ``faiss_index``, ``faiss_scene_meta`` and ``faiss_scene_ids``. A failure
    while reading any file therefore leaves the previously loaded state
    intact instead of pairing a new index with stale metadata.

    Args:
        index_path: Serialized FAISS index file.
        meta_path: JSON file with per-scene metadata; search_with_faiss
            indexes it by FAISS row position.
        ids_path: JSON file with the scene ids.

    Returns:
        Dict with ``entries``, the number of loaded scene ids.

    Raises:
        Whatever ``faiss.read_index``, ``Path.read_text`` or ``json.loads``
        raise for missing or malformed files.
    """
    import warnings  # local import: not among the notebook's top-level imports

    global faiss_index, faiss_scene_meta, faiss_scene_ids
    loaded_index = faiss.read_index(str(index_path))
    loaded_meta = json.loads(Path(meta_path).read_text(encoding='utf-8'))
    loaded_ids = json.loads(Path(ids_path).read_text(encoding='utf-8'))

    # search_with_faiss looks up metadata by row position, so differing sizes
    # suggest the artifacts come from different index builds.
    if not (loaded_index.ntotal == len(loaded_meta) == len(loaded_ids)):
        warnings.warn(
            'FAISS artifacts differ in size: '
            f'index={loaded_index.ntotal}, meta={len(loaded_meta)}, ids={len(loaded_ids)}'
        )

    faiss_index, faiss_scene_meta, faiss_scene_ids = loaded_index, loaded_meta, loaded_ids
    return {'entries': len(faiss_scene_ids)}

# Load the index and side files from their default paths; the bare name on
# the last line displays the returned entry count as the cell output.
faiss_stats = load_faiss_artifacts()
faiss_stats


{'entries': 23820}

In [29]:
def search_with_faiss(text: str, top_scene_hits: int = 30, top_videos: int = 5) -> Dict:
    """Search the loaded FAISS index with a CLIP text embedding.

    The query is encoded with the CLIP text tower, L2-normalized and used to
    retrieve the nearest scenes. Scene hits are then collapsed to one entry
    per video (its best-scoring scene) and the videos are ranked by that
    score.

    Args:
        text: Free-text query.
        top_scene_hits: Maximum number of scenes requested from FAISS; capped
            at the number of vectors in the index.
        top_videos: Maximum number of videos returned.

    Returns:
        Dict with ``scene_hits`` (all retrieved scenes, in FAISS order) and
        ``video_results`` (one entry per video, best first). Both lists are
        empty when the index is empty or ``top_scene_hits`` is not positive.

    Raises:
        ValueError: If no FAISS index has been loaded.
    """
    if faiss_index is None:
        raise ValueError('FAISS index not loaded.')

    # Nothing to retrieve: skip the encoder and avoid calling FAISS with k <= 0.
    k = min(top_scene_hits, faiss_index.ntotal)
    if k <= 0:
        return {'scene_hits': [], 'video_results': []}

    # truncation=True clips over-long queries to the tokenizer's maximum
    # length; without it the text encoder fails on inputs longer than its
    # position-embedding table.
    text_inputs = clip_processor(
        text=[text], return_tensors='pt', padding=True, truncation=True
    ).to(device)
    with torch.inference_mode():
        text_features = clip_model.get_text_features(**text_inputs)
    text_features = F.normalize(text_features, dim=-1).cpu().numpy().astype('float32')
    scores, indices = faiss_index.search(text_features, k)

    scene_hits = []
    for score, idx in zip(scores[0], indices[0]):
        # -1 entries are skipped: they carry no metadata row.
        if idx == -1:
            continue
        meta = faiss_scene_meta[int(idx)]
        scene_hits.append({
            'scene_id': meta['scene_id'],
            'video_id': meta['video_id'],
            'score': float(score),
            'start_frame': meta.get('start_frame'),
            'end_frame': meta.get('end_frame'),
        })

    # Keep only the best-scoring scene of each video.
    # NOTE(review): assumes larger scores mean better matches (similarity
    # index rather than a distance index) - confirm against the index build.
    best_per_video = {}
    for hit in scene_hits:
        current = best_per_video.get(hit['video_id'])
        if current is None or hit['score'] > current['score']:
            best_per_video[hit['video_id']] = hit

    ranked = sorted(best_per_video.values(), key=lambda item: item['score'], reverse=True)[:top_videos]
    results = []
    for hit in ranked:
        md = video_metadata.get(hit['video_id'], {})
        results.append({
            'video_id': hit['video_id'],
            'video_uri': md.get('video_uri'),
            'video_duration': md.get('video_duration'),
            'score': hit['score'],
            'scene_id': hit['scene_id'],
            'scene_frames': (hit['start_frame'], hit['end_frame']),
        })

    return {'scene_hits': scene_hits, 'video_results': results}


In [30]:
def search_with_faiss_and_captions(text: str, top_scene_hits: int = 30, top_videos: int = 5):
    """Run ``search_with_faiss`` and attach captions to every video result.

    For each video the caption lookup is tried with, in order, the
    stringified ``video_id``, the file name of ``video_uri`` and that file
    name without extension. The first key with a non-empty caption list wins;
    when none matches, ``captions`` is an empty list.

    Args:
        text: Free-text query.
        top_scene_hits: Forwarded to ``search_with_faiss``.
        top_videos: Forwarded to ``search_with_faiss``.

    Returns:
        Dict with the untouched ``scene_hits`` and a ``video_results`` list
        whose entries are copies of the originals plus a ``captions`` field.
    """
    base = search_with_faiss(text, top_scene_hits=top_scene_hits, top_videos=top_videos)

    def _captions_for(video):
        # Candidate lookup keys, most specific first.
        candidates = []
        video_id = video.get('video_id')
        if video_id:
            candidates.append(str(video_id))
        uri = video.get('video_uri')
        if uri:
            file_name = Path(uri).name
            candidates.extend((file_name, Path(file_name).stem))
        for candidate in candidates:
            found = msrvtt_captions.get(candidate)
            if found:
                return found
        return []

    videos_with_captions = [
        {**video, 'captions': _captions_for(video)}
        for video in base['video_results']
    ]
    return {'scene_hits': base['scene_hits'], 'video_results': videos_with_captions}


In [40]:
def _strip_code_fences(text: str) -> str:
    trimmed = text.strip()
    if trimmed.startswith('```') and trimmed.endswith('```'):
        trimmed = trimmed[3:-3].strip()
        if trimmed.lower().startswith('json'):
            trimmed = trimmed[4:].strip()
    return trimmed

def rerank_with_gemini(search_output: Dict, query: str, top_results: int = 5, api_key: str | None = None, model_name: str = 'gemini-2.5-pro'):
    global gemini_model
    videos = search_output.get('video_results', [])
    if not videos:
        return {'reranked': [], 'raw_response': '', 'prompt': ''}
    if gemini_model is None or api_key is not None:
        configure_gemini(api_key=api_key, model_name=model_name)
    payload = []
    for idx, video in enumerate(videos, 1):
        payload.append({
            'rank': idx,
            'video_id': video.get('video_id'),
            'video_uri': video.get('video_uri'),
            'score': video.get('score'),
            'scene_frames': video.get('scene_frames'),
            'captions': video.get('captions', []),
        })
    instructions = f'Rank the best {top_results} video candidates for the query: "{query}".'
    prompt = (
        instructions +
        "\nReturn a JSON list named reranked, sorted best to worst. Each item must include video_id, video_uri, reason, and optionally cite captions.\n" +
        'Consider the semantic match between the query and provided captions. Here are the candidates:' +
        f"\n{json.dumps(payload, indent=2)}"
    )
    response = gemini_model.generate_content(prompt)
    text = getattr(response, 'text', '')
    if not text and hasattr(response, 'candidates'):
        text = ''.join(getattr(part, 'text', '') for part in response.candidates)
    cleaned = _strip_code_fences(text)
    reranked = []
    try:
        parsed = json.loads(cleaned)
        if isinstance(parsed, dict) and 'reranked' in parsed:
            reranked = parsed['reranked']
        elif isinstance(parsed, list):
            reranked = parsed
    except json.JSONDecodeError:
        print('Gemini response was not valid JSON. Returning raw text.')
    if reranked is None:
        reranked = []
    return {'reranked': reranked[:top_results], 'raw_response': text, 'prompt': prompt}


In [41]:
# Load variables from a .env file, if one exists, before reading the key.
from dotenv import load_dotenv
load_dotenv()
# Read the Gemini key from the environment; os.getenv returns None when the
# variable is not set.
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')

In [42]:
# Example query: retrieve up to 30 scenes and keep the 10 best videos.
# The bare expression on the last line displays the ranked videos.
query = 'a person talking to the camera'
faiss_search_results = search_with_faiss(query, top_scene_hits=30, top_videos=10)
faiss_search_results['video_results']


[{'video_id': '3ad0ae0c-6ae8-47e1-9753-5ca7caa89b54',
  'video_uri': '/home/khushpatel/shortform-video-ranker/data/1/TrainValVideo/video1758.mp4',
  'video_duration': 0.0,
  'score': 0.30197227001190186,
  'scene_id': '83cb72d8-e80b-4673-92ba-5877c12b0db5',
  'scene_frames': (0, 499)},
 {'video_id': '6a722f97-bc48-4548-ba7d-ac3530ed6f5a',
  'video_uri': '/home/khushpatel/shortform-video-ranker/data/1/TrainValVideo/video5990.mp4',
  'video_duration': 0.0,
  'score': 0.3013308644294739,
  'scene_id': '6015a9f8-58df-4eca-bfa8-b9cb9021e682',
  'scene_frames': (356, 360)},
 {'video_id': '958959dc-2f63-4799-a2fc-86a9a9c80290',
  'video_uri': '/home/khushpatel/shortform-video-ranker/data/1/TrainValVideo/video1050.mp4',
  'video_duration': 0.0,
  'score': 0.30076122283935547,
  'scene_id': '2e4f7788-cb01-4102-82df-b42341f2920a',
  'scene_frames': (418, 521)},
 {'video_id': '2f1b9870-0c66-45be-abd7-d3e0b5204948',
  'video_uri': '/home/khushpatel/shortform-video-ranker/data/1/TrainValVideo/video

In [43]:
# Same query and limits as the plain search; this call runs the search again
# and adds a captions list to every video result.
captioned_results = search_with_faiss_and_captions(query, top_scene_hits=30, top_videos=10)
captioned_results['video_results']


[{'video_id': '3ad0ae0c-6ae8-47e1-9753-5ca7caa89b54',
  'video_uri': '/home/khushpatel/shortform-video-ranker/data/1/TrainValVideo/video1758.mp4',
  'video_duration': 0.0,
  'score': 0.30197227001190186,
  'scene_id': '83cb72d8-e80b-4673-92ba-5877c12b0db5',
  'scene_frames': (0, 499),
  'captions': ['a student is talking about how to ask questions in class or privately with the teacher',
   'a man with glasses and brown hair is giving advice for asking questions in class',
   'i see a man with glasses talking about important things',
   'the man wearing eyeglasses and blue shirt talks',
   'a young man with brown hair and glasses giving points of in formation for new students',
   'a student with glasses giving advice to new students',
   'a man wearing a gray shirt and glasses discusses about asking questions',
   'a man wearing glasses and a gray collared shirt talking to a reporter',
   'a man wears specs talking of something to media',
   'a young man in a blue shirt wearing glasse

In [44]:
# Example: rerank with Gemini once captioned_results is available.
# A non-None api_key makes rerank_with_gemini reconfigure the Gemini client
# before sending the request.
gemini_rerank = rerank_with_gemini(captioned_results, query, top_results=5, api_key=GEMINI_API_KEY)
gemini_rerank['reranked']


[{'video_id': 'd7fd3d26-8c92-419e-aa54-d658a79c7fc6',
  'video_uri': '/home/khushpatel/shortform-video-ranker/data/1/TrainValVideo/video1074.mp4',
  'reason': 'This is the best candidate because the captions explicitly state the person is talking directly to the viewer or camera. This is a perfect semantic match for the query.',
  'cite captions': ['a plate of food appears a woman in a green shirt is speaking toward a camera',
   'a woman in a blue shirt with brunette hair is talking to her program viewers',
   'a spokeswoman tells her viewing audience to make the dish that she earlier presents']},
 {'video_id': '3ad0ae0c-6ae8-47e1-9753-5ca7caa89b54',
  'video_uri': '/home/khushpatel/shortform-video-ranker/data/1/TrainValVideo/video1758.mp4',
  'reason': 'This video is a very strong match as it consistently describes a single person talking and giving advice, a format that inherently involves addressing the camera.',
  'cite captions': ['a student is talking about how to ask questions 