In [None]:
# ============================================================
# Cell 1: Install core libraries + basic imports
# ------------------------------------------------------------
# This cell:
#   1. Installs all the external Python packages we need.
#   2. Imports commonly used standard libraries.
#   3. Does NOT touch your data or Google Drive yet.
#
# You should run this first in a fresh Colab runtime.
# ============================================================

# ---- 1. Install external packages (quietly) -----------------
# faiss-cpu           -> vector search / ANN index
# sentence-transformers -> text embeddings for lectures + papers
# clip-anytorch       -> CLIP model for image <-> text alignment
# openai-whisper      -> automatic speech recognition for lectures
# PyMuPDF             -> robust PDF text extraction
# tqdm                -> progress bars for long loops

!pip -q install faiss-cpu sentence-transformers clip-anytorch \
                 openai-whisper PyMuPDF==1.24.10 tqdm

print("‚úÖ Installed external packages.")

# ---- 2. Standard library imports ----------------------------
import os
import json
import math
from pathlib import Path
from typing import List, Dict, Any, Optional

# ---- 3. Numeric + ML utilities ------------------------------
import numpy as np
import torch

# We won't load any heavy models yet. That comes later in the
# embeddings & transcription stages. For now, just confirm setup.
print("‚úÖ Core Python modules imported.")
print("Python version:", torch.__version__, "(PyTorch version printed here)")


[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m803.2/803.2 kB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m3.5/3.5 MB[0m [31m93.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m15.9/15.9 MB[0m [31m91.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚î

In [None]:
# ============================================================
# Cell 2: Mount Google Drive + create UNISEARCH_MASTER project
# ------------------------------------------------------------
# This cell:
#   1. Mounts your Google Drive.
#   2. Creates a NEW isolated project directory:
#           /MyDrive/UNISEARCH_MASTER
#      so nothing interferes with previous attempts.
#   3. Sets up the full folder structure for the entire pipeline.
#   4. Scans raw/videos and raw/papers to show what data is present.
#   5. Writes a config.json "master reference" for Colab 1 & 2.
# ============================================================

from google.colab import drive
from pathlib import Path
import json

# ---- 1. Mount Google Drive ---------------------------------
print("üîå Mounting Google Drive...")
drive.mount('/content/drive')

# ---- 2. Define the NEW unique project root -----------------
BASE_DIR = Path("/content/drive/MyDrive/UNISEARCH_MASTER")

# ---- 3. Define subdirectories ------------------------------
RAW_DIR         = BASE_DIR / "raw"
RAW_VIDEOS      = RAW_DIR / "videos"
RAW_PAPERS      = RAW_DIR / "papers"

PROC_DIR        = BASE_DIR / "processed"
KEYFRAMES_DIR   = PROC_DIR / "keyframes"
TRANSCRIPTS_DIR = PROC_DIR / "transcripts"
MANIFESTS_DIR   = PROC_DIR / "manifests"
EMB_DIR         = PROC_DIR / "embeddings"
INDICES_DIR     = PROC_DIR / "indices"

# Create all directories
for d in [
    RAW_VIDEOS, RAW_PAPERS,
    KEYFRAMES_DIR, TRANSCRIPTS_DIR,
    MANIFESTS_DIR, EMB_DIR, INDICES_DIR
]:
    d.mkdir(parents=True, exist_ok=True)

print("üìÅ Project root directory:", BASE_DIR)
print("üìÅ Raw videos directory   :", RAW_VIDEOS)
print("üìÅ Raw papers directory   :", RAW_PAPERS)
print("üìÅ Processed directory    :", PROC_DIR)

# ---- 4. Inventory of raw data ------------------------------
VIDEO_EXTS = [".mp4", ".mkv", ".avi", ".mov", ".webm", ".m4v"]

video_files = [
    p for p in RAW_VIDEOS.iterdir()
    if p.is_file() and p.suffix.lower() in VIDEO_EXTS
]

paper_files = list(RAW_PAPERS.glob("*.pdf"))

print(f"\nüé• Found {len(video_files)} video files in raw/videos")
for v in video_files[:10]:
    print("   -", v.name)
if len(video_files) > 10:
    print(f"   ... and {len(video_files) - 10} more")

print(f"\nüìÑ Found {len(paper_files)} PDF files in raw/papers")
for p in paper_files[:10]:
    print("   -", p.name)
if len(paper_files) > 10:
    print(f"   ... and {len(paper_files) - 10} more")

# ---- 5. Write a fresh config.json ---------------------------
config_path = BASE_DIR / "config.json"

config = {
    "paths": {
        "base_dir": str(BASE_DIR),
        "raw_videos": str(RAW_VIDEOS),
        "raw_papers": str(RAW_PAPERS),
        "processed": str(PROC_DIR),
        "keyframes": str(KEYFRAMES_DIR),
        "transcripts": str(TRANSCRIPTS_DIR),
        "manifests": str(MANIFESTS_DIR),
        "embeddings": str(EMB_DIR),
        "indices": str(INDICES_DIR),
    },
    "models": {
        # Primary semantic text model (for lecture + paper text retrieval)
        "text_semantic_model": "sentence-transformers/all-mpnet-base-v2",
        # Cross-modal CLIP model
        "clip_model": "openai/clip-vit-base-patch32"
    },
    "embedding_dims": {
        # We will fill these in AFTER loading models (later cells)
        "text_semantic_dim": None,
        "clip_dim": None
    }
}

with open(config_path, "w") as f:
    json.dump(config, f, indent=2)

print("\n‚úÖ config.json written to:", config_path)


üîå Mounting Google Drive...
Mounted at /content/drive
üìÅ Project root directory: /content/drive/MyDrive/UNISEARCH_MASTER
üìÅ Raw videos directory   : /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos
üìÅ Raw papers directory   : /content/drive/MyDrive/UNISEARCH_MASTER/raw/papers
üìÅ Processed directory    : /content/drive/MyDrive/UNISEARCH_MASTER/processed

üé• Found 0 video files in raw/videos

üìÑ Found 42 PDF files in raw/papers
   - alexnet_2012_imagenet.pdf
   - vgg_2014_very_deep.pdf
   - googlenet_2014_inception.pdf
   - resnet_2015_deep_residual.pdf
   - densenet_2016.pdf
   - fcn_2014_fully_conv_networks.pdf
   - mask_rcnn_2017.pdf
   - yolov1_2015.pdf
   - yolov3_2018.pdf
   - mobilenet_v1_2017.pdf
   ... and 32 more

‚úÖ config.json written to: /content/drive/MyDrive/UNISEARCH_MASTER/config.json


In [None]:
# ==============================================
# CELL 3 ‚Äî FULL DATASET LISTING
# ----------------------------------------------
# This cell:
#   - Loads config.json
#   - Lists ALL lecture videos (every filename) in non-empty course folders
#   - Lists ALL research paper PDFs (every filename)
# ==============================================

from pathlib import Path
import json

PROJECT_ROOT = Path("/content/drive/MyDrive/UNISEARCH_MASTER")
VIDEOS_ROOT = PROJECT_ROOT / "raw/videos"
PAPERS_ROOT = PROJECT_ROOT / "raw/papers"
CONFIG_PATH = PROJECT_ROOT / "config.json"

print("üìÅ PROJECT ROOT:", PROJECT_ROOT)
print("üìÅ VIDEOS ROOT :", VIDEOS_ROOT)
print("üìÅ PAPERS ROOT :", PAPERS_ROOT)

# -------------------------------
# 1. Load config.json
# -------------------------------
print("\nüìÑ Checking config.json...")
if CONFIG_PATH.exists():
    with open(CONFIG_PATH, "r") as f:
        config = json.load(f)
    print("   ‚úì config.json loaded successfully")
else:
    print("   ‚ùå config.json NOT found ‚Äî go back to Cell 2")

# -------------------------------
# 2. List ALL videos (by course)
# -------------------------------
print("\nüé• Listing ALL lecture videos (non-empty course folders only)...")

if not VIDEOS_ROOT.exists():
    print("   ‚ùå raw/videos folder missing!")
else:
    course_folders = [d for d in VIDEOS_ROOT.iterdir() if d.is_dir()]
    valid_courses = []

    for course in sorted(course_folders):
        vids = sorted([v for v in course.iterdir() if v.is_file()])
        if len(vids) > 0:
            valid_courses.append((course, vids))

    if not valid_courses:
        print("   ‚ùå No NON-EMPTY course folders found under raw/videos")
    else:
        print(f"   ‚úì Found {len(valid_courses)} non-empty course folders.\n")
        for course, vids in valid_courses:
            print(f"==============================")
            print(f"‚ñ∫ {course.name} ‚Äî {len(vids)} videos")
            print(f"==============================")
            for v in vids:
                print("   ‚Ä¢", v.name)
            print()  # blank line between courses

# -------------------------------
# 3. List ALL research papers
# -------------------------------
print("\nüìö Listing ALL research papers (PDFs)...")

if not PAPERS_ROOT.exists():
    print("   ‚ùå raw/papers folder missing!")
else:
    papers = sorted([p for p in PAPERS_ROOT.iterdir() if p.suffix.lower() == ".pdf"])
    print(f"   ‚úì Found {len(papers)} papers:\n")
    for p in papers:
        print("   ‚Ä¢", p.name)

print("\n‚úÖ FULL DATASET LISTING COMPLETE.")


üìÅ PROJECT ROOT: /content/drive/MyDrive/UNISEARCH_MASTER
üìÅ VIDEOS ROOT : /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos
üìÅ PAPERS ROOT : /content/drive/MyDrive/UNISEARCH_MASTER/raw/papers

üìÑ Checking config.json...
   ‚úì config.json loaded successfully

üé• Listing ALL lecture videos (non-empty course folders only)...
   ‚úì Found 2 non-empty course folders.

‚ñ∫ CS229 ‚Äî 20 videos
   ‚Ä¢ Stanford CS229Ôºö Machine Learning Full Course taught by Andrew Ng ÔΩú Autumn 2018_01_jGwO_UgTS7I.webm
   ‚Ä¢ Stanford CS229Ôºö Machine Learning Full Course taught by Andrew Ng ÔΩú Autumn 2018_02_4b4MUYve_U8.webm
   ‚Ä¢ Stanford CS229Ôºö Machine Learning Full Course taught by Andrew Ng ÔΩú Autumn 2018_03_het9HFqo1TQ.webm
   ‚Ä¢ Stanford CS229Ôºö Machine Learning Full Course taught by Andrew Ng ÔΩú Autumn 2018_04_iZTeva0WSTQ.webm
   ‚Ä¢ Stanford CS229Ôºö Machine Learning Full Course taught by Andrew Ng ÔΩú Autumn 2018_05_nt63k3bfXS0.webm
   ‚Ä¢ Stanford CS229Ôºö Machine Learning Full C

In [8]:
# ==============================================
# CELL 4 ‚Äî Build Manifests for Lectures & Papers
# ----------------------------------------------
# This cell scans:
#   - raw/videos/<course>/<video_file>
#   - raw/papers/*.pdf
#
# And builds:
#   - processed/manifests/video_manifest.jsonl
#   - processed/manifests/paper_manifest.jsonl
#
# Each line is a JSON object (JSONL format) with a stable ID and
# a relative file path (from the project root).
# ==============================================

from pathlib import Path
import json
import re

PROJECT_ROOT = Path("/content/drive/MyDrive/UNISEARCH_MASTER")
RAW_VIDEOS = PROJECT_ROOT / "raw" / "videos"
RAW_PAPERS = PROJECT_ROOT / "raw" / "papers"
MANIFEST_DIR = PROJECT_ROOT / "processed" / "manifests"
MANIFEST_DIR.mkdir(parents=True, exist_ok=True)

VIDEO_MANIFEST_PATH = MANIFEST_DIR / "video_manifest.jsonl"
PAPER_MANIFEST_PATH = MANIFEST_DIR / "paper_manifest.jsonl"

def slugify(text: str) -> str:
    """
    Turn arbitrary text (like a filename stem) into a safe ID:
    - lowercase
    - spaces -> _
    - keep letters, numbers, _, -
    - drop everything else (colons, weird unicode, etc.)
    """
    text = text.lower()
    text = text.replace(" ", "_")
    text = re.sub(r"[^a-z0-9_\-]+", "", text)
    return text

# -------------------------------
# 1. Build video_manifest.jsonl
# -------------------------------
video_records = []
video_exts = {".mp4", ".mkv", ".avi", ".mov", ".webm", ".m4v"}

if not RAW_VIDEOS.exists():
    raise FileNotFoundError(f"‚ùå raw/videos folder not found at {RAW_VIDEOS}")

for course_dir in sorted(RAW_VIDEOS.iterdir()):
    if not course_dir.is_dir():
        continue
    course_name = course_dir.name  # e.g. "MIT_6_034" or "CS229"

    videos = sorted(
        [v for v in course_dir.iterdir() if v.is_file() and v.suffix.lower() in video_exts]
    )
    for idx, v in enumerate(videos, start=1):
        # Create a stable ID like: mit_6_034__01_tjzbt...
        base_stem = slugify(v.stem)
        video_id = f"{slugify(course_name)}__{idx:02d}_{base_stem[:40]}"

        # Store path relative to project root (portable)
        rel_path = v.relative_to(PROJECT_ROOT)

        record = {
            "video_id": video_id,
            "course": course_name,
            "file_path": str(rel_path),
            "file_name": v.name,
            "index_in_course": idx,
        }
        video_records.append(record)

with VIDEO_MANIFEST_PATH.open("w", encoding="utf-8") as f:
    for rec in video_records:
        f.write(json.dumps(rec) + "\n")

print(f"üé• Wrote video manifest: {VIDEO_MANIFEST_PATH}")
print(f"   Total videos indexed: {len(video_records)}")

# -------------------------------
# 2. Build paper_manifest.jsonl
# -------------------------------
paper_records = []

if not RAW_PAPERS.exists():
    raise FileNotFoundError(f"‚ùå raw/papers folder not found at {RAW_PAPERS}")

papers = sorted([p for p in RAW_PAPERS.iterdir() if p.suffix.lower() == ".pdf"])

for idx, p in enumerate(papers, start=1):
    # Example: paper_id = "resnet_2015_deep_residual"
    base_stem = slugify(p.stem)
    paper_id = f"paper_{idx:03d}_{base_stem[:40]}"

    rel_path = p.relative_to(PROJECT_ROOT)

    # A simple "title" derived from filename (underscores -> spaces)
    naive_title = p.stem.replace("_", " ")

    record = {
        "paper_id": paper_id,
        "file_path": str(rel_path),
        "file_name": p.name,
        "title_guess": naive_title,
        "index": idx,
    }
    paper_records.append(record)

with PAPER_MANIFEST_PATH.open("w", encoding="utf-8") as f:
    for rec in paper_records:
        f.write(json.dumps(rec) + "\n")

print(f"üìö Wrote paper manifest: {PAPER_MANIFEST_PATH}")
print(f"   Total papers indexed: {len(paper_records)}")

# -------------------------------
# 3. Quick peek at a few records
# -------------------------------
print("\nüîé Sample video records:")
for rec in video_records[:3]:
    print("  ", rec)

print("\nüîé Sample paper records:")
for rec in paper_records[:3]:
    print("  ", rec)

print("\n‚úÖ CELL 4 complete (manifests built).")


üé• Wrote video manifest: /content/drive/MyDrive/UNISEARCH_MASTER/processed/manifests/video_manifest.jsonl
   Total videos indexed: 44
üìö Wrote paper manifest: /content/drive/MyDrive/UNISEARCH_MASTER/processed/manifests/paper_manifest.jsonl
   Total papers indexed: 42

üîé Sample video records:
   {'video_id': 'cs229__01_stanford_cs229_machine_learning_full_cou', 'course': 'CS229', 'file_path': 'raw/videos/CS229/Stanford CS229Ôºö Machine Learning Full Course taught by Andrew Ng ÔΩú Autumn 2018_01_jGwO_UgTS7I.webm', 'file_name': 'Stanford CS229Ôºö Machine Learning Full Course taught by Andrew Ng ÔΩú Autumn 2018_01_jGwO_UgTS7I.webm', 'index_in_course': 1}
   {'video_id': 'cs229__02_stanford_cs229_machine_learning_full_cou', 'course': 'CS229', 'file_path': 'raw/videos/CS229/Stanford CS229Ôºö Machine Learning Full Course taught by Andrew Ng ÔΩú Autumn 2018_02_4b4MUYve_U8.webm', 'file_name': 'Stanford CS229Ôºö Machine Learning Full Course taught by Andrew Ng ÔΩú Autumn 2018_02_4b4MUYve_

In [9]:
# ==============================================
# CELL 5 ‚Äî Transcribe Lectures with Whisper
# ----------------------------------------------
# Uses:
#   - processed/manifests/video_manifest.jsonl
#
# Produces:
#   - processed/transcripts/{video_id}.json
#
# Each transcript file is a JSON with:
#   {
#     "video_id": ...,
#     "course": ...,
#     "file_path": ...,
#     "whisper_model": ...,
#     "segments": [
#        {"id": 0, "start": ..., "end": ..., "text": "..."},
#        ...
#     ]
#   }
#
# Safe to re-run:
#   - Skips videos that already have a transcript JSON file.
# ==============================================

import json
import subprocess
from pathlib import Path
import whisper

PROJECT_ROOT = Path("/content/drive/MyDrive/UNISEARCH_MASTER")
VIDEO_MANIFEST_PATH = PROJECT_ROOT / "processed" / "manifests" / "video_manifest.jsonl"
TRANSCRIPTS_DIR = PROJECT_ROOT / "processed" / "transcripts"
TRANSCRIPTS_DIR.mkdir(parents=True, exist_ok=True)

AUDIO_CACHE = Path("/content/audio_cache")
AUDIO_CACHE.mkdir(exist_ok=True)

# Choose Whisper model:
#   "base"  ‚Üí faster, lower quality
#   "small" ‚Üí balance
#   "medium"/"large" ‚Üí slower, best quality (expensive)
WHISPER_MODEL_NAME = "small"

print(f"üéß Loading Whisper model: {WHISPER_MODEL_NAME} ...")
whisper_model = whisper.load_model(WHISPER_MODEL_NAME)
print("   ‚úì Model loaded")

# -------------------------------
# 1. Load video manifest
# -------------------------------
if not VIDEO_MANIFEST_PATH.exists():
    raise FileNotFoundError(f"‚ùå video_manifest.jsonl not found at {VIDEO_MANIFEST_PATH}")

video_records = []
with VIDEO_MANIFEST_PATH.open("r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        video_records.append(json.loads(line))

print(f"\nüìù Loaded {len(video_records)} video records from manifest.")

# -------------------------------
# 2. Helper: extract audio with ffmpeg
# -------------------------------
def extract_audio(input_video: Path, output_audio: Path):
    """
    Extracts audio track from the video using ffmpeg.
    Output is a mono 16kHz wav, which Whisper likes.
    """
    if output_audio.exists():
        return

    cmd = [
        "ffmpeg",
        "-y",  # overwrite
        "-i", str(input_video),
        "-ac", "1",          # mono
        "-ar", "16000",      # 16kHz
        "-vn",               # no video
        "-f", "wav",
        str(output_audio),
    ]
    print(f"   üéôÔ∏è  Extracting audio: {input_video.name} -> {output_audio.name}")
    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
        print("   ‚ùå ffmpeg failed")
        print("   STDERR (truncated):", result.stderr[:400])
        raise RuntimeError(f"ffmpeg failed for {input_video}")

# -------------------------------
# 3. Main loop: transcribe each video
# -------------------------------
for i, rec in enumerate(video_records, start=1):
    video_id = rec["video_id"]
    rel_path = rec["file_path"]
    video_path = PROJECT_ROOT / rel_path

    out_json = TRANSCRIPTS_DIR / f"{video_id}.json"

    print(f"\n[{i}/{len(video_records)}] üé¨ {video_id}")
    print(f"   Video file:", video_path)

    if out_json.exists():
        print("   ‚è© Transcript already exists, skipping.")
        continue

    if not video_path.exists():
        print("   ‚ùå Video file missing, skipping.")
        continue

    # 3.1 Extract audio to /content/audio_cache
    audio_path = AUDIO_CACHE / f"{video_id}.wav"
    extract_audio(video_path, audio_path)

    # 3.2 Run Whisper transcription
    print("   üß† Running Whisper transcription...")
    result = whisper_model.transcribe(str(audio_path), language="en", verbose=False)

    # 3.3 Build a compact transcript structure
    segments = []
    for seg in result.get("segments", []):
        segments.append(
            {
                "id": seg.get("id"),
                "start": seg.get("start"),
                "end": seg.get("end"),
                "text": seg.get("text"),
            }
        )

    transcript_record = {
        "video_id": video_id,
        "course": rec.get("course"),
        "file_path": rel_path,
        "whisper_model": WHISPER_MODEL_NAME,
        "segments": segments,
    }

    with out_json.open("w", encoding="utf-8") as f:
        json.dump(transcript_record, f, ensure_ascii=False, indent=2)

    print(f"   ‚úì Saved transcript:", out_json.name)
    print(f"   ‚Ü™ Segments: {len(segments)}")

print("\n‚úÖ CELL 5 complete ‚Äî transcripts generated (or skipped if existing).")


üéß Loading Whisper model: small ...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 461M/461M [00:01<00:00, 315MiB/s]


   ‚úì Model loaded

üìù Loaded 44 video records from manifest.

[1/44] üé¨ cs229__01_stanford_cs229_machine_learning_full_cou
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/CS229/Stanford CS229Ôºö Machine Learning Full Course taught by Andrew Ng ÔΩú Autumn 2018_01_jGwO_UgTS7I.webm
   ‚è© Transcript already exists, skipping.

[2/44] üé¨ cs229__02_stanford_cs229_machine_learning_full_cou
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/CS229/Stanford CS229Ôºö Machine Learning Full Course taught by Andrew Ng ÔΩú Autumn 2018_02_4b4MUYve_U8.webm
   ‚è© Transcript already exists, skipping.

[3/44] üé¨ cs229__03_stanford_cs229_machine_learning_full_cou
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/CS229/Stanford CS229Ôºö Machine Learning Full Course taught by Andrew Ng ÔΩú Autumn 2018_03_het9HFqo1TQ.webm
   ‚è© Transcript already exists, skipping.

[4/44] üé¨ cs229__04_stanford_cs229_machine_learning_full_cou
   Video file: /conten

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 289619/289619 [02:43<00:00, 1776.26frames/s]


   ‚úì Saved transcript: mit_6_034__05_mit_6034_artificial_intelligence_fall_20.json
   ‚Ü™ Segments: 867

[26/44] üé¨ mit_6_034__06_mit_6034_artificial_intelligence_fall_20
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/MIT_6_034/MIT 6.034 Artificial Intelligence, Fall 2010_07_l-tzjenXrvI.mkv
   üéôÔ∏è  Extracting audio: MIT 6.034 Artificial Intelligence, Fall 2010_07_l-tzjenXrvI.mkv -> mit_6_034__06_mit_6034_artificial_intelligence_fall_20.wav
   üß† Running Whisper transcription...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 295210/295210 [02:35<00:00, 1893.22frames/s]


   ‚úì Saved transcript: mit_6_034__06_mit_6034_artificial_intelligence_fall_20.json
   ‚Ü™ Segments: 631

[27/44] üé¨ mit_6_034__07_mit_6034_artificial_intelligence_fall_20
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/MIT_6_034/MIT 6.034 Artificial Intelligence, Fall 2010_08_dARl_gGrS4o.mkv
   üéôÔ∏è  Extracting audio: MIT 6.034 Artificial Intelligence, Fall 2010_08_dARl_gGrS4o.mkv -> mit_6_034__07_mit_6034_artificial_intelligence_fall_20.wav
   üß† Running Whisper transcription...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 270589/270589 [02:14<00:00, 2016.36frames/s]


   ‚úì Saved transcript: mit_6_034__07_mit_6034_artificial_intelligence_fall_20.json
   ‚Ü™ Segments: 616

[28/44] üé¨ mit_6_034__08_mit_6034_artificial_intelligence_fall_20
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/MIT_6_034/MIT 6.034 Artificial Intelligence, Fall 2010_09_gvmfbePC2pc.mkv
   üéôÔ∏è  Extracting audio: MIT 6.034 Artificial Intelligence, Fall 2010_09_gvmfbePC2pc.mkv -> mit_6_034__08_mit_6034_artificial_intelligence_fall_20.wav
   üß† Running Whisper transcription...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 308548/309191 [02:44<00:00, 1881.29frames/s]


   ‚úì Saved transcript: mit_6_034__08_mit_6034_artificial_intelligence_fall_20.json
   ‚Ü™ Segments: 605

[29/44] üé¨ mit_6_034__09_mit_6034_artificial_intelligence_fall_20
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/MIT_6_034/MIT 6.034 Artificial Intelligence, Fall 2010_10_09mb78oiPkA.mkv
   üéôÔ∏è  Extracting audio: MIT 6.034 Artificial Intelligence, Fall 2010_10_09mb78oiPkA.mkv -> mit_6_034__09_mit_6034_artificial_intelligence_fall_20.wav
   üß† Running Whisper transcription...


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 298022/299531 [02:54<00:00, 1711.53frames/s]


   ‚úì Saved transcript: mit_6_034__09_mit_6034_artificial_intelligence_fall_20.json
   ‚Ü™ Segments: 678

[30/44] üé¨ mit_6_034__10_mit_6034_artificial_intelligence_fall_20
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/MIT_6_034/MIT 6.034 Artificial Intelligence, Fall 2010_11_SXBG3RGr_Rc.mkv
   üéôÔ∏è  Extracting audio: MIT 6.034 Artificial Intelligence, Fall 2010_11_SXBG3RGr_Rc.mkv -> mit_6_034__10_mit_6034_artificial_intelligence_fall_20.wav
   üß† Running Whisper transcription...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 297667/297667 [02:39<00:00, 1868.80frames/s]


   ‚úì Saved transcript: mit_6_034__10_mit_6034_artificial_intelligence_fall_20.json
   ‚Ü™ Segments: 596

[31/44] üé¨ mit_6_034__11_mit_6034_artificial_intelligence_fall_20
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/MIT_6_034/MIT 6.034 Artificial Intelligence, Fall 2010_12_uXt8qF2Zzfo.mkv
   üéôÔ∏è  Extracting audio: MIT 6.034 Artificial Intelligence, Fall 2010_12_uXt8qF2Zzfo.mkv -> mit_6_034__11_mit_6034_artificial_intelligence_fall_20.wav
   üß† Running Whisper transcription...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 304246/304246 [02:45<00:00, 1841.20frames/s]


   ‚úì Saved transcript: mit_6_034__11_mit_6034_artificial_intelligence_fall_20.json
   ‚Ü™ Segments: 883

[32/44] üé¨ mit_6_034__12_mit_6034_artificial_intelligence_fall_20
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/MIT_6_034/MIT 6.034 Artificial Intelligence, Fall 2010_13_VrMHA3yX_QI.mkv
   üéôÔ∏è  Extracting audio: MIT 6.034 Artificial Intelligence, Fall 2010_13_VrMHA3yX_QI.mkv -> mit_6_034__12_mit_6034_artificial_intelligence_fall_20.wav
   üß† Running Whisper transcription...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 294528/294528 [02:34<00:00, 1904.67frames/s]


   ‚úì Saved transcript: mit_6_034__12_mit_6034_artificial_intelligence_fall_20.json
   ‚Ü™ Segments: 814

[33/44] üé¨ mit_6_034__13_mit_6034_artificial_intelligence_fall_20
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/MIT_6_034/MIT 6.034 Artificial Intelligence, Fall 2010_14_kHyNqSnzP8Y.mkv
   üéôÔ∏è  Extracting audio: MIT 6.034 Artificial Intelligence, Fall 2010_14_kHyNqSnzP8Y.mkv -> mit_6_034__13_mit_6034_artificial_intelligence_fall_20.wav
   üß† Running Whisper transcription...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 283595/283595 [02:20<00:00, 2012.74frames/s]


   ‚úì Saved transcript: mit_6_034__13_mit_6034_artificial_intelligence_fall_20.json
   ‚Ü™ Segments: 579

[34/44] üé¨ mit_6_034__14_mit_6034_artificial_intelligence_fall_20
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/MIT_6_034/MIT 6.034 Artificial Intelligence, Fall 2010_15_L73hY1pBcQI.mkv
   üéôÔ∏è  Extracting audio: MIT 6.034 Artificial Intelligence, Fall 2010_15_L73hY1pBcQI.mkv -> mit_6_034__14_mit_6034_artificial_intelligence_fall_20.wav
   üß† Running Whisper transcription...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 286888/286888 [02:19<00:00, 2063.06frames/s]


   ‚úì Saved transcript: mit_6_034__14_mit_6034_artificial_intelligence_fall_20.json
   ‚Ü™ Segments: 494

[35/44] üé¨ mit_6_034__15_mit_6034_artificial_intelligence_fall_20
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/MIT_6_034/MIT 6.034 Artificial Intelligence, Fall 2010_16_sh3EPjhhd40.mkv
   üéôÔ∏è  Extracting audio: MIT 6.034 Artificial Intelligence, Fall 2010_16_sh3EPjhhd40.mkv -> mit_6_034__15_mit_6034_artificial_intelligence_fall_20.wav
   üß† Running Whisper transcription...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 281348/281348 [02:42<00:00, 1732.72frames/s]


   ‚úì Saved transcript: mit_6_034__15_mit_6034_artificial_intelligence_fall_20.json
   ‚Ü™ Segments: 737

[36/44] üé¨ mit_6_034__16_mit_6034_artificial_intelligence_fall_20
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/MIT_6_034/MIT 6.034 Artificial Intelligence, Fall 2010_17__PwhiWxHK8o.webm
   üéôÔ∏è  Extracting audio: MIT 6.034 Artificial Intelligence, Fall 2010_17__PwhiWxHK8o.webm -> mit_6_034__16_mit_6034_artificial_intelligence_fall_20.wav
   üß† Running Whisper transcription...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 297386/297386 [02:30<00:00, 1975.19frames/s]


   ‚úì Saved transcript: mit_6_034__16_mit_6034_artificial_intelligence_fall_20.json
   ‚Ü™ Segments: 598

[37/44] üé¨ mit_6_034__17_mit_6034_artificial_intelligence_fall_20
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/MIT_6_034/MIT 6.034 Artificial Intelligence, Fall 2010_18_UHBmv7qCey4.mp4
   üéôÔ∏è  Extracting audio: MIT 6.034 Artificial Intelligence, Fall 2010_18_UHBmv7qCey4.mp4 -> mit_6_034__17_mit_6034_artificial_intelligence_fall_20.wav
   üß† Running Whisper transcription...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 309772/310000 [02:48<00:00, 1837.56frames/s]


   ‚úì Saved transcript: mit_6_034__17_mit_6034_artificial_intelligence_fall_20.json
   ‚Ü™ Segments: 578

[38/44] üé¨ mit_6_034__18_mit_6034_artificial_intelligence_fall_20
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/MIT_6_034/MIT 6.034 Artificial Intelligence, Fall 2010_19_bQI0OmJPby4.mkv
   üéôÔ∏è  Extracting audio: MIT 6.034 Artificial Intelligence, Fall 2010_19_bQI0OmJPby4.mkv -> mit_6_034__18_mit_6034_artificial_intelligence_fall_20.wav
   üß† Running Whisper transcription...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 293756/293756 [02:22<00:00, 2067.11frames/s]


   ‚úì Saved transcript: mit_6_034__18_mit_6034_artificial_intelligence_fall_20.json
   ‚Ü™ Segments: 559

[39/44] üé¨ mit_6_034__19_mit_6034_artificial_intelligence_fall_20
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/MIT_6_034/MIT 6.034 Artificial Intelligence, Fall 2010_20_PimSbFGrwXM.mkv
   üéôÔ∏è  Extracting audio: MIT 6.034 Artificial Intelligence, Fall 2010_20_PimSbFGrwXM.mkv -> mit_6_034__19_mit_6034_artificial_intelligence_fall_20.wav
   üß† Running Whisper transcription...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 294561/294561 [02:21<00:00, 2076.62frames/s]


   ‚úì Saved transcript: mit_6_034__19_mit_6034_artificial_intelligence_fall_20.json
   ‚Ü™ Segments: 518

[40/44] üé¨ mit_6_034__20_mit_6034_artificial_intelligence_fall_20
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/MIT_6_034/MIT 6.034 Artificial Intelligence, Fall 2010_21_A6Ud6oUCRak.mkv
   üéôÔ∏è  Extracting audio: MIT 6.034 Artificial Intelligence, Fall 2010_21_A6Ud6oUCRak.mkv -> mit_6_034__20_mit_6034_artificial_intelligence_fall_20.wav
   üß† Running Whisper transcription...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 290910/290910 [02:24<00:00, 2013.54frames/s]


   ‚úì Saved transcript: mit_6_034__20_mit_6034_artificial_intelligence_fall_20.json
   ‚Ü™ Segments: 456

[41/44] üé¨ mit_6_034__21_mit_6034_artificial_intelligence_fall_20
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/MIT_6_034/MIT 6.034 Artificial Intelligence, Fall 2010_22_EC6bf8JCpDQ.mkv
   üéôÔ∏è  Extracting audio: MIT 6.034 Artificial Intelligence, Fall 2010_22_EC6bf8JCpDQ.mkv -> mit_6_034__21_mit_6034_artificial_intelligence_fall_20.wav
   üß† Running Whisper transcription...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 292543/292543 [02:46<00:00, 1755.95frames/s]


   ‚úì Saved transcript: mit_6_034__21_mit_6034_artificial_intelligence_fall_20.json
   ‚Ü™ Segments: 604

[42/44] üé¨ mit_6_034__22_mit_6034_artificial_intelligence_fall_20
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/MIT_6_034/MIT 6.034 Artificial Intelligence, Fall 2010_23_XPEJg_6Cg6o.mkv
   üéôÔ∏è  Extracting audio: MIT 6.034 Artificial Intelligence, Fall 2010_23_XPEJg_6Cg6o.mkv -> mit_6_034__22_mit_6034_artificial_intelligence_fall_20.wav
   üß† Running Whisper transcription...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 297018/297018 [02:54<00:00, 1704.65frames/s]


   ‚úì Saved transcript: mit_6_034__22_mit_6034_artificial_intelligence_fall_20.json
   ‚Ü™ Segments: 940

[43/44] üé¨ mit_6_034__23_mit_6034_artificial_intelligence_fall_20
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/MIT_6_034/MIT 6.034 Artificial Intelligence, Fall 2010_24_iusTmgQyZ44.mkv
   üéôÔ∏è  Extracting audio: MIT 6.034 Artificial Intelligence, Fall 2010_24_iusTmgQyZ44.mkv -> mit_6_034__23_mit_6034_artificial_intelligence_fall_20.wav
   üß† Running Whisper transcription...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 281769/281769 [03:19<00:00, 1409.01frames/s]


   ‚úì Saved transcript: mit_6_034__23_mit_6034_artificial_intelligence_fall_20.json
   ‚Ü™ Segments: 1098

[44/44] üé¨ mit_6_034__24_mit_6034_artificial_intelligence_fall_20
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/MIT_6_034/MIT 6.034 Artificial Intelligence, Fall 2010_25_Tl_p5pgBsyM.mp4
   üéôÔ∏è  Extracting audio: MIT 6.034 Artificial Intelligence, Fall 2010_25_Tl_p5pgBsyM.mp4 -> mit_6_034__24_mit_6034_artificial_intelligence_fall_20.wav
   üß† Running Whisper transcription...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 311565/311565 [03:42<00:00, 1402.01frames/s]

   ‚úì Saved transcript: mit_6_034__24_mit_6034_artificial_intelligence_fall_20.json
   ‚Ü™ Segments: 1137

‚úÖ CELL 5 complete ‚Äî transcripts generated (or skipped if existing).





In [10]:
# ==============================================
# CELL 6 ‚Äî Extract Keyframes from Lectures
# ----------------------------------------------
# Uses:
#   - processed/manifests/video_manifest.jsonl
#
# Produces:
#   - processed/keyframes/{video_id}/frame_000001.jpg, frame_000002.jpg, ...
#   - processed/manifests/keyframes_manifest.jsonl
#
# Design:
#   - Sample 1 frame every KEYFRAME_EVERY_SEC seconds (we use 5s).
#   - For each frame we store:
#       video_id, frame_id, image_path, approx_timestamp_sec, index_in_video
#
# Resume behavior:
#   - If processed/keyframes/{video_id} already has frame_*.jpg,
#     we SKIP extraction for that video (but still write manifest entries).
# ==============================================

import json
import subprocess
from pathlib import Path
import re

PROJECT_ROOT = Path("/content/drive/MyDrive/UNISEARCH_MASTER")

VIDEO_MANIFEST_PATH = PROJECT_ROOT / "processed" / "manifests" / "video_manifest.jsonl"
KEYFRAMES_ROOT = PROJECT_ROOT / "processed" / "keyframes"
KEYFRAMES_ROOT.mkdir(parents=True, exist_ok=True)

KEYFRAMES_MANIFEST_PATH = PROJECT_ROOT / "processed" / "manifests" / "keyframes_manifest.jsonl"

# We agreed on 5 seconds between keyframes
KEYFRAME_EVERY_SEC = 5

# -------------------------------
# 1. Load video manifest
# -------------------------------
if not VIDEO_MANIFEST_PATH.exists():
    raise FileNotFoundError(f"‚ùå video_manifest.jsonl not found at {VIDEO_MANIFEST_PATH}")

video_records = []
with VIDEO_MANIFEST_PATH.open("r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        video_records.append(json.loads(line))

print(f"üé¨ Loaded {len(video_records)} videos from manifest.")

# -------------------------------
# 2. Helper: extract frames with ffmpeg
# -------------------------------
def extract_keyframes_for_video(video_path: Path, out_dir: Path, every_sec: int):
    """
    Use ffmpeg to sample 1 frame every 'every_sec' seconds.
    Saves frames as: frame_000001.jpg, frame_000002.jpg, ...
    If frames already exist, we skip re-extraction for this video.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    existing = list(out_dir.glob("frame_*.jpg"))
    if existing:
        print(f"   ‚è© {out_dir.name}: keyframes already exist ({len(existing)} frames)")
        return

    print(f"   üñºÔ∏è  Extracting keyframes into: {out_dir}")
    # fps=1/every_sec ‚Üí 1 frame every N seconds
    cmd = [
        "ffmpeg",
        "-y",
        "-i", str(video_path),
        "-vf", f"fps=1/{every_sec}",
        "-qscale:v", "2",  # JPEG quality (2 = high)
        str(out_dir / "frame_%06d.jpg"),
    ]
    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
        print("   ‚ùå ffmpeg failed for", video_path.name)
        print("   STDERR (truncated):", result.stderr[:400])
        raise RuntimeError(f"ffmpeg failed for {video_path}")

# -------------------------------
# 3. Helper: natural sort frames
# -------------------------------
def natural_sort_key(path_obj: Path):
    # Sort frame_000001.jpg, frame_000010.jpg numerically by the number
    s = path_obj.name
    m = re.search(r"(\d+)", s)
    if m:
        return int(m.group(1))
    return s

# -------------------------------
# 4. Main loop: per-video extraction + manifest build
# -------------------------------
manifest_entries = []

for i, rec in enumerate(video_records, start=1):
    video_id = rec["video_id"]
    rel_path = rec["file_path"]
    video_path = PROJECT_ROOT / rel_path

    print(f"\n[{i}/{len(video_records)}] üé• {video_id}")
    print("   Video file:", video_path)

    if not video_path.exists():
        print("   ‚ùå Video missing on Drive, skipping.")
        continue

    # Folder for this video's keyframes
    video_keyframe_dir = KEYFRAMES_ROOT / video_id

    # 4.1 Extract frames (or skip if already there)
    extract_keyframes_for_video(video_path, video_keyframe_dir, KEYFRAME_EVERY_SEC)

    # 4.2 Enumerate frames and create manifest rows
    frames = sorted(video_keyframe_dir.glob("frame_*.jpg"), key=natural_sort_key)
    if not frames:
        print("   ‚ö†Ô∏è No frames found after extraction, continuing.")
        continue

    print(f"   ‚úì Found {len(frames)} keyframes.")

    for idx, frame_path in enumerate(frames, start=1):
        # Approximate timestamp based on sampling interval
        timestamp_sec = (idx - 1) * KEYFRAME_EVERY_SEC

        frame_id = f"{video_id}_frame_{idx:06d}"
        rel_img_path = frame_path.relative_to(PROJECT_ROOT)

        entry = {
            "video_id": video_id,
            "frame_id": frame_id,
            "image_path": str(rel_img_path),
            "index_in_video": idx,
            "approx_timestamp_sec": timestamp_sec,
        }
        manifest_entries.append(entry)

# -------------------------------
# 5. Write keyframes_manifest.jsonl
# -------------------------------
with KEYFRAMES_MANIFEST_PATH.open("w", encoding="utf-8") as f:
    for e in manifest_entries:
        f.write(json.dumps(e) + "\n")

print(f"\nüñºÔ∏è Wrote keyframes manifest: {KEYFRAMES_MANIFEST_PATH}")
print(f"   Total keyframes indexed: {len(manifest_entries)}")
print("\n‚úÖ CELL 6 complete ‚Äî keyframes extracted & indexed.")


üé¨ Loaded 44 videos from manifest.

[1/44] üé• cs229__01_stanford_cs229_machine_learning_full_cou
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/CS229/Stanford CS229Ôºö Machine Learning Full Course taught by Andrew Ng ÔΩú Autumn 2018_01_jGwO_UgTS7I.webm
   üñºÔ∏è  Extracting keyframes into: /content/drive/MyDrive/UNISEARCH_MASTER/processed/keyframes/cs229__01_stanford_cs229_machine_learning_full_cou
   ‚úì Found 904 keyframes.

[2/44] üé• cs229__02_stanford_cs229_machine_learning_full_cou
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos/CS229/Stanford CS229Ôºö Machine Learning Full Course taught by Andrew Ng ÔΩú Autumn 2018_02_4b4MUYve_U8.webm
   üñºÔ∏è  Extracting keyframes into: /content/drive/MyDrive/UNISEARCH_MASTER/processed/keyframes/cs229__02_stanford_cs229_machine_learning_full_cou
   ‚úì Found 939 keyframes.

[3/44] üé• cs229__03_stanford_cs229_machine_learning_full_cou
   Video file: /content/drive/MyDrive/UNISEARCH_MASTER/raw/videos

In [11]:
# ==============================================
# CELL 7 ‚Äî Align Keyframes with Transcripts
# ----------------------------------------------
# Uses:
#   - processed/manifests/keyframes_manifest.jsonl
#   - processed/transcripts/{video_id}.json  (from Whisper)
#
# Produces:
#   - processed/manifests/aligned_keyframes_with_snippets.jsonl
#
# For each keyframe (with approx_timestamp_sec = T), we gather transcript
# segments that overlap [T - WINDOW_SEC, T + WINDOW_SEC], concatenate them
# into a readable snippet, and save one JSONL row per keyframe:
#
# {
#   "video_id": "...",
#   "frame_id": "...",
#   "image_path": "processed/keyframes/.../frame_000123.jpg",
#   "approx_timestamp_sec": 180,
#   "transcript_snippet": "Professor explains ...",
#   "segment_ids": [23, 24]
# }
#
# You can safely re-run this cell: it rebuilds the aligned manifest based
# on whatever transcripts exist at the moment (great while Whisper is still running).
# ==============================================

from pathlib import Path
import json
import bisect
import re

PROJECT_ROOT = Path("/content/drive/MyDrive/UNISEARCH_MASTER")
TRANSCRIPTS_DIR = PROJECT_ROOT / "processed" / "transcripts"
MANIFEST_DIR = PROJECT_ROOT / "processed" / "manifests"
KEYFRAMES_MANIFEST_PATH = MANIFEST_DIR / "keyframes_manifest.jsonl"
ALIGNED_OUT_PATH = MANIFEST_DIR / "aligned_keyframes_with_snippets.jsonl"

# Alignment knobs
WINDOW_SEC = 7.5          # gather transcript text from [T - WINDOW_SEC, T + WINDOW_SEC]
SNIPPET_MAX_CHARS = 420   # trim very long snippets for readability
MIN_CHARS_TO_KEEP = 60    # if concatenated text is shorter, keep it as-is

# -------------------------------
# Helpers
# -------------------------------
def load_json(path: Path):
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)

def load_jsonl(path: Path):
    data = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                data.append(json.loads(line))
    return data

def clean_spaces(s: str) -> str:
    # collapse whitespace, keep punctuation spacing readable
    s = re.sub(r"\s+", " ", s).strip()
    return s

def build_snippet(segments, t_center: float, window: float):
    """
    Given a video's transcript 'segments' (list of dicts with 'start','end','text'),
    collect text overlapping [t_center - window, t_center + window].
    If nothing overlaps (rare), fall back to the single nearest segment by start time.
    """
    t0, t1 = t_center - window, t_center + window

    selected = []
    for seg in segments:
        s = float(seg.get("start", 0.0))
        e = float(seg.get("end", 0.0))
        if e >= t0 and s <= t1:  # overlap
            selected.append(seg)

    # Fallback: nearest segment by start time (if no overlap)
    if not selected and segments:
        starts = [float(seg.get("start", 0.0)) for seg in segments]
        i = bisect.bisect_left(starts, t_center)
        # pick closest among i-1 and i
        cand = []
        if 0 <= i < len(segments):
            cand.append(segments[i])
        if 0 <= i-1 < len(segments):
            cand.append(segments[i-1])
        # choose the closer start time
        if cand:
            selected = [min(cand, key=lambda s: abs(float(s.get("start", 0.0)) - t_center))]

    if not selected:
        return "", []

    # Concatenate texts in chronological order
    selected = sorted(selected, key=lambda x: float(x.get("start", 0.0)))
    txt = " ".join(clean_spaces(seg.get("text", "")) for seg in selected if seg.get("text"))
    txt = clean_spaces(txt)

    # Trim very long snippets, but avoid cutting mid-sentence harshly
    if len(txt) > SNIPPET_MAX_CHARS:
        # try to cut at a period near the limit for nicer UX
        cut = txt.rfind(".", 0, SNIPPET_MAX_CHARS)
        if cut == -1 or cut < MIN_CHARS_TO_KEEP:
            cut = SNIPPET_MAX_CHARS
        txt = txt[:cut].rstrip() + " ‚Ä¶"

    seg_ids = [seg.get("id") for seg in selected if "id" in seg]
    return txt, seg_ids

# -------------------------------
# Load inputs
# -------------------------------
if not KEYFRAMES_MANIFEST_PATH.exists():
    raise FileNotFoundError(f"‚ùå keyframes_manifest.jsonl not found at {KEYFRAMES_MANIFEST_PATH}")

keyframes = load_jsonl(KEYFRAMES_MANIFEST_PATH)
print(f"üñºÔ∏è Loaded keyframes: {len(keyframes)}")

# Preload transcript index: video_id -> {"segments": [...], "ok": bool}
transcripts = {}
missing_videos = set()

# We'll lazily load transcripts per video_id on demand to keep memory reasonable
def get_transcript(video_id: str):
    if video_id in transcripts:
        return transcripts[video_id]

    t_path = TRANSCRIPTS_DIR / f"{video_id}.json"
    if not t_path.exists():
        transcripts[video_id] = {"segments": [], "ok": False}
        missing_videos.add(video_id)
        return transcripts[video_id]

    data = load_json(t_path)
    segs = data.get("segments", [])
    # ensure numeric times & clean text
    norm = []
    for seg in segs:
        s = float(seg.get("start", 0.0))
        e = float(seg.get("end", s))
        text = clean_spaces(seg.get("text", ""))
        norm.append({"id": seg.get("id"), "start": s, "end": e, "text": text})
    # sort by start (Whisper usually already sorted)
    norm.sort(key=lambda x: x["start"])

    transcripts[video_id] = {"segments": norm, "ok": True}
    return transcripts[video_id]

# -------------------------------
# Align all keyframes
# -------------------------------
aligned_rows = []
count_no_transcript = 0
count_no_snippet = 0

for i, kf in enumerate(keyframes, start=1):
    if i % 2000 == 0:
        print(f"   ‚Ä¶ aligned {i} keyframes so far")

    video_id = kf["video_id"]
    t = float(kf.get("approx_timestamp_sec", 0.0))

    tr = get_transcript(video_id)
    if not tr["ok"]:
        count_no_transcript += 1
        continue

    snippet, seg_ids = build_snippet(tr["segments"], t_center=t, window=WINDOW_SEC)
    if not snippet:
        count_no_snippet += 1
        # We still write the row (without snippet) so downstream can decide to filter or backfill later
        # If you prefer to skip such rows entirely, just "continue" here.

    aligned_rows.append({
        "video_id": video_id,
        "frame_id": kf["frame_id"],
        "image_path": kf["image_path"],
        "approx_timestamp_sec": t,
        "transcript_snippet": snippet,
        "segment_ids": seg_ids,
        "window_sec": WINDOW_SEC
    })

# -------------------------------
# Write output
# -------------------------------
with ALIGNED_OUT_PATH.open("w", encoding="utf-8") as f:
    for row in aligned_rows:
        f.write(json.dumps(row) + "\n")

print(f"\nüß© Wrote aligned manifest: {ALIGNED_OUT_PATH}")
print(f"   Total keyframes processed : {len(keyframes)}")
print(f"   Aligned rows written      : {len(aligned_rows)}")
print(f"   Missing transcripts (kf)  : {count_no_transcript}")
print(f"   No-snippet rows (fallback): {count_no_snippet}")

if missing_videos:
    sample = list(sorted(missing_videos))[:5]
    print(f"\n‚ÑπÔ∏è Transcripts missing for {len(missing_videos)} videos (showing up to 5):")
    for v in sample:
        print("   -", v)
print("\n‚úÖ CELL 7 complete ‚Äî keyframes linked to transcript snippets.")


üñºÔ∏è Loaded keyframes: 33212
   ‚Ä¶ aligned 2000 keyframes so far
   ‚Ä¶ aligned 4000 keyframes so far
   ‚Ä¶ aligned 6000 keyframes so far
   ‚Ä¶ aligned 8000 keyframes so far
   ‚Ä¶ aligned 10000 keyframes so far
   ‚Ä¶ aligned 12000 keyframes so far
   ‚Ä¶ aligned 14000 keyframes so far
   ‚Ä¶ aligned 16000 keyframes so far
   ‚Ä¶ aligned 18000 keyframes so far
   ‚Ä¶ aligned 20000 keyframes so far
   ‚Ä¶ aligned 22000 keyframes so far
   ‚Ä¶ aligned 24000 keyframes so far
   ‚Ä¶ aligned 26000 keyframes so far
   ‚Ä¶ aligned 28000 keyframes so far
   ‚Ä¶ aligned 30000 keyframes so far
   ‚Ä¶ aligned 32000 keyframes so far

üß© Wrote aligned manifest: /content/drive/MyDrive/UNISEARCH_MASTER/processed/manifests/aligned_keyframes_with_snippets.jsonl
   Total keyframes processed : 33212
   Aligned rows written      : 33212
   Missing transcripts (kf)  : 0
   No-snippet rows (fallback): 0

‚úÖ CELL 7 complete ‚Äî keyframes linked to transcript snippets.


In [12]:
# CELL 7.1 ‚Äî Alignment sanity check
from pathlib import Path
import json, itertools

PROJECT_ROOT = Path("/content/drive/MyDrive/UNISEARCH_MASTER")
KF_MANIFEST = PROJECT_ROOT / "processed" / "manifests" / "keyframes_manifest.jsonl"
ALIGNED = PROJECT_ROOT / "processed" / "manifests" / "aligned_keyframes_with_snippets.jsonl"

def load_jsonl(p):
    with p.open("r", encoding="utf-8") as f:
        for line in f:
            line=line.strip()
            if line: yield json.loads(line)

kfs_total = sum(1 for _ in load_jsonl(KF_MANIFEST))
aligned = list(load_jsonl(ALIGNED))
with_snip = sum(1 for r in aligned if r.get("transcript_snippet"))
no_snip  = len(aligned) - with_snip

print(f"Keyframes total (from KF manifest): {kfs_total}")
print(f"Aligned rows written              : {len(aligned)}")
print(f" ‚îú‚îÄ with snippet                   : {with_snip}")
print(f" ‚îî‚îÄ without snippet                : {no_snip}")

# show a few sample aligned rows for eyeballing
print("\nSample aligned rows:")
for r in itertools.islice((r for r in aligned if r.get('transcript_snippet')), 3):
    print({
        "video_id": r["video_id"],
        "frame_id": r["frame_id"],
        "t": r["approx_timestamp_sec"],
        "snippet_preview": (r["transcript_snippet"][:140] + "‚Ä¶") if len(r["transcript_snippet"])>140 else r["transcript_snippet"]
    })


Keyframes total (from KF manifest): 33212
Aligned rows written              : 33212
 ‚îú‚îÄ with snippet                   : 33212
 ‚îî‚îÄ without snippet                : 0

Sample aligned rows:
{'video_id': 'cs229__01_stanford_cs229_machine_learning_full_cou', 'frame_id': 'cs229__01_stanford_cs229_machine_learning_full_cou_frame_000001', 't': 0.0, 'snippet_preview': 'Welcome to CS229 Machine Learning. Uh, some of you know that this is a class that taught at Stanford for a long time,'}
{'video_id': 'cs229__01_stanford_cs229_machine_learning_full_cou', 'frame_id': 'cs229__01_stanford_cs229_machine_learning_full_cou_frame_000002', 't': 5.0, 'snippet_preview': 'Welcome to CS229 Machine Learning. Uh, some of you know that this is a class that taught at Stanford for a long time, and this is often the ‚Ä¶'}
{'video_id': 'cs229__01_stanford_cs229_machine_learning_full_cou', 'frame_id': 'cs229__01_stanford_cs229_machine_learning_full_cou_frame_000003', 't': 10.0, 'snippet_preview': 'Welcome t

In [13]:
# ==============================================
# CELL 8 ‚Äî Build Lecture Passages from Transcripts
# ----------------------------------------------
# Uses:
#   - processed/transcripts/{video_id}.json
#   - processed/manifests/video_manifest.jsonl
#
# Produces:
#   - processed/manifests/lecture_passages.jsonl
#
# Design:
#   - Concatenate transcript segments for each video in order
#   - Create overlapping text chunks (by characters) for retrieval
#   - Keep timing: start_sec/end_sec from covered segments
#   - Fields per row:
#       {
#         "source_type": "lecture",
#         "video_id": "...",
#         "course": "CS229",
#         "video_index": 7,
#         "chunk_id": "cs229__07_...__chunk_0001",
#         "start_sec": 120.3,
#         "end_sec": 245.1,
#         "text": "cleaned text ...",
#       }
# ==============================================

from pathlib import Path
import json
import re

PROJECT_ROOT = Path("/content/drive/MyDrive/UNISEARCH_MASTER")
TRANSCRIPTS_DIR = PROJECT_ROOT / "processed" / "transcripts"
MANIFEST_DIR = PROJECT_ROOT / "processed" / "manifests"
VIDEO_MANIFEST_PATH = MANIFEST_DIR / "video_manifest.jsonl"
LECTURE_PASSAGES_PATH = MANIFEST_DIR / "lecture_passages.jsonl"

# Chunking knobs (tune if you like)
CHUNK_CHARS = 1200     # target chunk length (characters)
CHUNK_OVERLAP = 600    # overlap between chunks (characters)
MIN_KEEP_CHARS = 200   # drop tiny crumbs below this

# -------------------------------
# Helpers
# -------------------------------
def load_json(path: Path):
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)

def load_jsonl(path: Path):
    out = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            line=line.strip()
            if line:
                out.append(json.loads(line))
    return out

def clean_spaces(s: str) -> str:
    s = re.sub(r"[ \t]+", " ", s)
    s = re.sub(r"\s+\n", "\n", s)
    s = re.sub(r"\n\s+", "\n", s)
    return s.strip()

def merge_segments(segments):
    """
    Return a list of tokens with (text, start, end).
    We'll chunk over the concatenated text but keep per-segment timing,
    so we can compute chunk start/end from covered segments.
    """
    merged = []
    for seg in segments:
        txt = seg.get("text") or ""
        if not txt.strip():
            continue
        merged.append({
            "text": clean_spaces(txt),
            "start": float(seg.get("start", 0.0)),
            "end": float(seg.get("end", 0.0)),
        })
    return merged

def build_chunks(merged_segs, video_prefix):
    """
    Chunk by character length with overlap, but align boundaries to segment edges
    so each chunk's start/end_sec come from covered segments.
    """
    # Build a flat string with segment boundaries recorded
    parts = []
    offsets = []  # [(global_char_start, global_char_end, seg_index)]
    total = 0
    for i, seg in enumerate(merged_segs):
        t = seg["text"]
        if parts:
            parts.append(" ")  # ensure space between segments
            total += 1
        start_off = total
        parts.append(t)
        total += len(t)
        end_off = total
        offsets.append((start_off, end_off, i))
    full_text = "".join(parts)

    chunks = []
    if not full_text.strip():
        return chunks

    L = len(full_text)
    cursor = 0
    chunk_idx = 0

    while cursor < L:
        # target window
        end = min(L, cursor + CHUNK_CHARS)

        # try to end at a sentence boundary near the target end
        window_text = full_text[cursor:end]
        cut = window_text.rfind(". ")
        if cut != -1 and (end - (cursor + cut + 1)) <= 200 and (cut + 1) >= MIN_KEEP_CHARS:
            end = cursor + cut + 1  # cut after period

        # determine covered segments using offsets
        covered = [ix for (s,e,ix) in offsets if not (e <= cursor or s >= end)]
        if not covered:
            # extend slightly to capture at least one segment
            # or break if we really can't (shouldn't happen often)
            # move cursor forward safely
            cursor = min(L, cursor + CHUNK_CHARS - CHUNK_OVERLAP)
            if cursor >= L:
                break
            continue

        seg_start_ix = min(covered)
        seg_end_ix = max(covered)
        start_sec = merged_segs[seg_start_ix]["start"]
        end_sec   = merged_segs[seg_end_ix]["end"]

        chunk_text = clean_spaces(full_text[cursor:end])
        if len(chunk_text) >= MIN_KEEP_CHARS:
            chunk_idx += 1
            chunks.append({
                "chunk_id": f"{video_prefix}__chunk_{chunk_idx:04d}",
                "start_sec": float(start_sec),
                "end_sec": float(end_sec),
                "text": chunk_text
            })

        if end >= L:
            break
        # move cursor forward with overlap
        cursor = max(end - CHUNK_OVERLAP, cursor + 1)

    return chunks

# -------------------------------
# Load video manifest for course/meta
# -------------------------------
if not VIDEO_MANIFEST_PATH.exists():
    raise FileNotFoundError(f"‚ùå Missing {VIDEO_MANIFEST_PATH}")

video_manifest = load_jsonl(VIDEO_MANIFEST_PATH)
video_meta = {}  # video_id -> {"course":..., "index_in_course":...}
for rec in video_manifest:
    video_meta[rec["video_id"]] = {
        "course": rec.get("course"),
        "video_index": rec.get("index_in_course")
    }

# -------------------------------
# Iterate transcripts and create passages
# -------------------------------
written = 0
videos_done = 0
videos_skipped = 0

with LECTURE_PASSAGES_PATH.open("w", encoding="utf-8") as out_f:
    for video_id, meta in video_meta.items():
        t_path = TRANSCRIPTS_DIR / f"{video_id}.json"
        if not t_path.exists():
            videos_skipped += 1
            continue

        data = load_json(t_path)
        segs = data.get("segments", [])
        # normalize + sort segments
        norm = []
        for i, s in enumerate(segs):
            txt = s.get("text") or ""
            if not txt.strip():
                continue
            norm.append({
                "id": s.get("id", i),
                "start": float(s.get("start", 0.0)),
                "end": float(s.get("end", 0.0)),
                "text": clean_spaces(txt),
            })
        norm.sort(key=lambda x: x["start"])

        merged = merge_segments(norm)
        video_prefix = video_id
        chunks = build_chunks(merged, video_prefix)

        for c in chunks:
            row = {
                "source_type": "lecture",
                "video_id": video_id,
                "course": meta.get("course"),
                "video_index": meta.get("video_index"),
                "chunk_id": c["chunk_id"],
                "start_sec": c["start_sec"],
                "end_sec": c["end_sec"],
                "text": c["text"],
            }
            out_f.write(json.dumps(row) + "\n")
            written += 1

        videos_done += 1

print(f"üßæ Wrote lecture passages: {LECTURE_PASSAGES_PATH}")
print(f"   Videos with transcripts processed : {videos_done}")
print(f"   Videos without transcripts        : {videos_skipped}")
print(f"   Total chunks written              : {written}")
print("\n‚úÖ CELL 8 complete ‚Äî lecture passages ready for embeddings.")


üßæ Wrote lecture passages: /content/drive/MyDrive/UNISEARCH_MASTER/processed/manifests/lecture_passages.jsonl
   Videos with transcripts processed : 44
   Videos without transcripts        : 0
   Total chunks written              : 20624

‚úÖ CELL 8 complete ‚Äî lecture passages ready for embeddings.


In [14]:
# ==============================================
# CELL 9 ‚Äî Build Paper Passages from PDFs
# ----------------------------------------------
# Uses:
#   - raw/papers/*.pdf
#   - processed/manifests/paper_manifest.jsonl
#
# Produces:
#   - processed/manifests/paper_passages.jsonl
#
# Each passage row looks like:
# {
#   "source_type": "paper",
#   "paper_id": "...",
#   "file_name": "resnet_2015_deep_residual.pdf",
#   "title": "resnet 2015 deep residual",
#   "chunk_id": "paper_031_resnet_2015_deep_residual__chunk_0001",
#   "page_start": 0,
#   "page_end": 1,
#   "text": "chunk text ..."
# }
#
# Design:
#   - Extract text from each PDF using PyMuPDF.
#   - Concatenate pages but remember page boundaries.
#   - Create overlapping character-based chunks, like we did for lectures.
#   - For each chunk, record which pages it covers.
# ==============================================

from pathlib import Path
import json
import re
import sys
import subprocess

PROJECT_ROOT = Path("/content/drive/MyDrive/UNISEARCH_MASTER")
PAPERS_ROOT = PROJECT_ROOT / "raw" / "papers"
MANIFEST_DIR = PROJECT_ROOT / "processed" / "manifests"
PAPER_MANIFEST_PATH = MANIFEST_DIR / "paper_manifest.jsonl"
PAPER_PASSAGES_PATH = MANIFEST_DIR / "paper_passages.jsonl"

# ---------- Install & import PyMuPDF (fitz) ----------
try:
    import fitz  # PyMuPDF
except ImportError:
    print("üì¶ Installing PyMuPDF (fitz) for PDF parsing...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "pymupdf"])
    import fitz


# ---------- Chunking knobs (similar to lectures, maybe slightly larger) ----------
CHUNK_CHARS = 1400      # target chunk length (characters)
CHUNK_OVERLAP = 700     # overlap between chunks
MIN_KEEP_CHARS = 250    # drop tiny chunks below this length


# ---------- Helpers ----------
def load_jsonl(path: Path):
    out = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                out.append(json.loads(line))
    return out


def clean_text(s: str) -> str:
    """
    Light cleanup for PDF text:
      - collapse repeated whitespace
      - remove very long sequences of dots
    """
    s = s.replace("\u00a0", " ")  # non-breaking spaces
    s = re.sub(r"[ \t]+", " ", s)
    s = re.sub(r"\s+\n", "\n", s)
    s = re.sub(r"\n\s+", "\n", s)
    # collapse crazy dot leaders ("...........") often found in PDFs
    s = re.sub(r"\.{4,}", "...", s)
    return s.strip()


def extract_pages(pdf_path: Path):
    """
    Return list of (page_index, page_text) for a PDF.
    Skips pages that have almost no text.
    """
    doc = fitz.open(pdf_path)
    pages = []
    for i in range(len(doc)):
        page = doc[i]
        txt = page.get_text("text")
        txt = clean_text(txt)
        if len(txt) < 20:
            continue
        pages.append((i, txt))
    doc.close()
    return pages


def build_chunks_from_pages(pages, chunk_chars, overlap, min_keep):
    """
    Given pages = [(page_index, text), ...],
    build overlapping chunks over the concatenated text,
    while remembering which pages each chunk spans.
    """
    if not pages:
        return []

    parts = []
    page_offsets = []  # [(start_off, end_off, page_index)]
    total = 0

    for page_idx, txt in pages:
        if parts:
            parts.append("\n\n")  # separate pages
            total += 2
        start_off = total
        parts.append(txt)
        total += len(txt)
        end_off = total
        page_offsets.append((start_off, end_off, page_idx))

    full_text = "".join(parts)
    full_text = clean_text(full_text)

    if not full_text:
        return []

    chunks = []
    L = len(full_text)
    cursor = 0
    chunk_idx = 0

    while cursor < L:
        target_end = min(L, cursor + chunk_chars)
        window_text = full_text[cursor:target_end]

        # Try to break near a sentence boundary close to target_end
        cut = window_text.rfind(". ")
        if cut != -1 and (target_end - (cursor + cut + 1)) <= 200 and (cut + 1) >= min_keep:
            end = cursor + cut + 1
        else:
            end = target_end

        # Determine which pages this chunk covers
        covered_pages = [pidx for (start, stop, pidx) in page_offsets if not (stop <= cursor or start >= end)]
        if not covered_pages:
            # Move cursor forward and continue
            cursor = min(L, cursor + chunk_chars - overlap)
            if cursor >= L:
                break
            continue

        page_start = min(covered_pages)
        page_end = max(covered_pages)

        chunk_text = clean_text(full_text[cursor:end])
        if len(chunk_text) >= min_keep:
            chunk_idx += 1
            chunks.append({
                "chunk_local_index": chunk_idx,
                "text": chunk_text,
                "page_start": int(page_start),
                "page_end": int(page_end),
            })

        if end >= L:
            break
        cursor = max(end - overlap, cursor + 1)

    return chunks


# ---------- Main: iterate paper manifest & build passages ----------
if not PAPER_MANIFEST_PATH.exists():
    raise FileNotFoundError(f"‚ùå paper_manifest.jsonl not found at {PAPER_MANIFEST_PATH}")

paper_manifest = load_jsonl(PAPER_MANIFEST_PATH)
print(f"üìö Loaded {len(paper_manifest)} papers from manifest.")

total_chunks = 0
papers_done = 0
papers_skipped = 0

with PAPER_PASSAGES_PATH.open("w", encoding="utf-8") as out_f:
    for rec in paper_manifest:
        paper_id = rec["paper_id"]
        file_path = rec["file_path"]   # relative to PROJECT_ROOT
        file_name = rec.get("file_name")
        title_guess = rec.get("title_guess", "")

        pdf_path = PROJECT_ROOT / file_path

        print(f"\nüìÑ [{papers_done + papers_skipped + 1}/{len(paper_manifest)}] {paper_id}")
        print(f"   File: {pdf_path}")

        if not pdf_path.exists():
            print("   ‚ùå Missing PDF on Drive, skipping.")
            papers_skipped += 1
            continue

        # Extract per-page text
        try:
            pages = extract_pages(pdf_path)
        except Exception as e:
            print(f"   ‚ùå Failed to parse PDF: {e}")
            papers_skipped += 1
            continue

        if not pages:
            print("   ‚ö†Ô∏è No usable text extracted, skipping.")
            papers_skipped += 1
            continue

        # Build overlapping chunks
        chunks = build_chunks_from_pages(pages, CHUNK_CHARS, CHUNK_OVERLAP, MIN_KEEP_CHARS)
        print(f"   ‚úì Built {len(chunks)} chunks.")

        for c in chunks:
            chunk_id = f"{paper_id}__chunk_{c['chunk_local_index']:04d}"
            row = {
                "source_type": "paper",
                "paper_id": paper_id,
                "file_name": file_name,
                "title": title_guess,
                "chunk_id": chunk_id,
                "page_start": c["page_start"],
                "page_end": c["page_end"],
                "text": c["text"],
            }
            out_f.write(json.dumps(row) + "\n")
            total_chunks += 1

        papers_done += 1

print(f"\nüßæ Wrote paper passages: {PAPER_PASSAGES_PATH}")
print(f"   Papers processed          : {papers_done}")
print(f"   Papers skipped            : {papers_skipped}")
print(f"   Total paper chunks written: {total_chunks}")
print("\n‚úÖ CELL 9 complete ‚Äî paper passages ready for embeddings.")


üìö Loaded 42 papers from manifest.

üìÑ [1/42] paper_001_adam_2014
   File: /content/drive/MyDrive/UNISEARCH_MASTER/raw/papers/adam_2014.pdf
   ‚úì Built 66 chunks.

üìÑ [2/42] paper_002_albef_2021
   File: /content/drive/MyDrive/UNISEARCH_MASTER/raw/papers/albef_2021.pdf
   ‚úì Built 83 chunks.

üìÑ [3/42] paper_003_albert_2019
   File: /content/drive/MyDrive/UNISEARCH_MASTER/raw/papers/albert_2019.pdf
   ‚úì Built 546 chunks.

üìÑ [4/42] paper_004_alexnet_2012_imagenet
   File: /content/drive/MyDrive/UNISEARCH_MASTER/raw/papers/alexnet_2012_imagenet.pdf
   ‚úì Built 68 chunks.

üìÑ [5/42] paper_005_align_2021
   File: /content/drive/MyDrive/UNISEARCH_MASTER/raw/papers/align_2021.pdf
   ‚úì Built 548 chunks.

üìÑ [6/42] paper_006_attention_is_all_you_need_2017
   File: /content/drive/MyDrive/UNISEARCH_MASTER/raw/papers/attention_is_all_you_need_2017.pdf
   ‚úì Built 512 chunks.

üìÑ [7/42] paper_007_bahdanau_attention_2014
   File: /content/drive/MyDrive/UNISEARCH_MASTER/raw/

In [16]:
# ============================================
# CELL 10 ‚Äî Build ONLY BGE Text Embeddings
# (lecture_passages + paper_passages)
# ============================================

import os
import json
import torch
import numpy as np
from tqdm import tqdm
from pathlib import Path
from sentence_transformers import SentenceTransformer

# ------------------------------------------------
# CONFIG
# ------------------------------------------------
ROOT = Path("/content/drive/MyDrive/UNISEARCH_MASTER")
MANIFEST = ROOT / "processed/manifests"
EMB_ROOT = ROOT / "processed/embeddings"

EMB_ROOT.mkdir(parents=True, exist_ok=True)

LECTURE_PASSAGES = MANIFEST / "lecture_passages.jsonl"
PAPER_PASSAGES = MANIFEST / "paper_passages.jsonl"

# ------------------------------------------------
# LOAD MODEL (BGE-large-en-v1.5)
# ------------------------------------------------
print("üî∑ Loading BGE-large-en-v1.5 (text encoder)...")

bge = SentenceTransformer("BAAI/bge-large-en-v1.5")
bge.max_seq_length = 512
device = "cuda" if torch.cuda.is_available() else "cpu"
bge.to(device)

# ------------------------------------------------
# HELPERS
# ------------------------------------------------
def load_jsonl(path):
    data = []
    with open(path, "r") as f:
        for line in f:
            data.append(json.loads(line))
    return data

def embed_texts(texts, model):
    return model.encode(
        texts,
        convert_to_numpy=True,
        batch_size=32,
        show_progress_bar=True
    )

# ------------------------------------------------
# LOAD PASSAGES
# ------------------------------------------------
print("üìö Loading lecture passages...")
lecture_data = load_jsonl(LECTURE_PASSAGES)

print("üìö Loading paper passages...")
paper_data = load_jsonl(PAPER_PASSAGES)

# ------------------------------------------------
# BUILD BGE TEXT EMBEDDINGS
# ------------------------------------------------
print("\nüî∑ Building BGE text embeddings...")

all_texts = []
all_meta = []

for row in tqdm(lecture_data, desc="Lecture passages"):
    all_texts.append(row["text"])
    all_meta.append({"type": "lecture", **row})

for row in tqdm(paper_data, desc="Paper passages"):
    all_texts.append(row["text"])
    all_meta.append({"type": "paper", **row})

text_embeddings = embed_texts(all_texts, bge)

# ------------------------------------------------
# SAVE
# ------------------------------------------------
np.save(EMB_ROOT / "text_embeddings.npy", text_embeddings)

with open(EMB_ROOT / "text_meta.jsonl", "w") as f:
    for m in all_meta:
        f.write(json.dumps(m) + "\n")

print("\n‚úÖ Saved BGE text embeddings + metadata.")
print("‚úÖ CELL 10 COMPLETE ‚Äî text embeddings ready.")


üî∑ Loading BGE-large-en-v1.5 (text encoder)...
üìö Loading lecture passages...
üìö Loading paper passages...

üî∑ Building BGE text embeddings...


Lecture passages: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20624/20624 [00:00<00:00, 938947.18it/s]
Paper passages: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 17497/17497 [00:00<00:00, 1019684.83it/s]


Batches:   0%|          | 0/1192 [00:00<?, ?it/s]


‚úÖ Saved BGE text embeddings + metadata.
‚úÖ CELL 10 COMPLETE ‚Äî text embeddings ready.


In [18]:
# ============================================
# CELL 11 ‚Äî Build SigLIP Image Embeddings (FIXED)
# (Keyframes ‚Üí Dense Vectors)
#
# - Reads keyframes_manifest.jsonl
# - Uses SigLIP to embed each keyframe image
# - Saves:
#     /processed/embeddings/image_embeddings.npy
#     /processed/embeddings/image_meta.jsonl
# - Resumable: if files already exist, only new frames are added
# - Robust: handles different path field names in manifest
# ============================================

import os
import json
import torch
import numpy as np
from tqdm import tqdm
from pathlib import Path
from PIL import Image

from transformers import AutoProcessor, AutoModel

# -------------------------------
# CONFIG & PATHS
# -------------------------------
ROOT = Path("/content/drive/MyDrive/UNISEARCH_MASTER")
MANIFEST_ROOT = ROOT / "processed" / "manifests"
EMB_ROOT = ROOT / "processed" / "embeddings"

EMB_ROOT.mkdir(parents=True, exist_ok=True)

KEYFRAMES_MANIFEST = MANIFEST_ROOT / "keyframes_manifest.jsonl"

IMG_EMB_PATH = EMB_ROOT / "image_embeddings.npy"
IMG_META_PATH = EMB_ROOT / "image_meta.jsonl"

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"üíª Using device: {device}")

# -------------------------------
# LOAD SIGLIP MODEL
# -------------------------------
print("üñºÔ∏è Loading SigLIP model (google/siglip-base-patch16-384)...")
processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-384")
model = AutoModel.from_pretrained("google/siglip-base-patch16-384")
model.to(device)
model.eval()

# -------------------------------
# UTILS
# -------------------------------
def load_jsonl(path):
    data = []
    with open(path, "r") as f:
        for line in f:
            data.append(json.loads(line))
    return data

def load_existing_image_embeddings():
    """
    If image_embeddings.npy + image_meta.jsonl already exist,
    load them so we can resume instead of recomputing everything.
    """
    if not (IMG_EMB_PATH.exists() and IMG_META_PATH.exists()):
        return None, []

    print(f"üîÅ Found existing image embeddings at: {IMG_EMB_PATH}")
    emb = np.load(IMG_EMB_PATH)

    meta = []
    with open(IMG_META_PATH, "r") as f:
        for line in f:
            meta.append(json.loads(line))

    if emb.shape[0] != len(meta):
        print("‚ö†Ô∏è WARNING: embedding rows != meta rows. "
              "You may want to delete and recompute.")
    else:
        print(f"   Loaded {emb.shape[0]} existing image embeddings.")

    return emb, meta

def get_image_rel_path(row):
    """
    Try to determine the relative image path from a manifest row.
    Handles multiple possible key names and falls back to a constructed path.
    """
    # 1) Direct fields if they exist
    for key in ["file_path", "image_path", "frame_path"]:
        if key in row:
            return row[key]

    # 2) Fallback: construct from video_id + frame_id.
    #    This matches the typical pattern:
    #    processed/keyframes/{video_id}/{frame_id}.jpg
    video_id = row.get("video_id")
    frame_id = row.get("frame_id")
    if video_id is None or frame_id is None:
        raise KeyError(
            "Cannot determine image path: no file_path/image_path/frame_path "
            "and missing video_id/frame_id in row."
        )

    # If your keyframes are .png instead of .jpg, change this extension.
    return f"processed/keyframes/{video_id}/{frame_id}.jpg"

def embed_image_batch(paths):
    """
    Given a list of image paths, load them, run through SigLIP,
    and return a numpy array of shape (batch_size, dim) plus
    the list of successfully processed paths.
    """
    images = []
    for p in paths:
        try:
            img = Image.open(p).convert("RGB")
            images.append(img)
        except Exception as e:
            print(f"   ‚ö†Ô∏è Failed to open image: {p} ({e})")
            images.append(None)

    valid_indices = [i for i, img in enumerate(images) if img is not None]
    if not valid_indices:
        return None, []

    valid_images = [images[i] for i in valid_indices]
    valid_paths  = [paths[i] for i in valid_indices]

    inputs = processor(images=valid_images, return_tensors="pt").to(device)
    with torch.no_grad():
        feats = model.get_image_features(**inputs)  # (B, D)
    feats = feats.cpu().numpy()

    # L2 normalize for cosine similarity
    norms = np.linalg.norm(feats, axis=1, keepdims=True) + 1e-12
    feats = feats / norms

    return feats, valid_paths

# -------------------------------
# LOAD KEYFRAME METADATA
# -------------------------------
print(f"üìÑ Loading keyframe manifest from: {KEYFRAMES_MANIFEST}")
keyframe_rows = load_jsonl(KEYFRAMES_MANIFEST)
print(f"   Total keyframes listed: {len(keyframe_rows)}")

# -------------------------------
# RESUME SUPPORT
# -------------------------------
existing_emb, existing_meta = load_existing_image_embeddings()
processed_ids = set()

if existing_meta:
    processed_ids = {m["frame_id"] for m in existing_meta if "frame_id" in m}
    print(f"   Already processed frame_ids: {len(processed_ids)}")
else:
    print("   No existing image embeddings found ‚Äî starting fresh.")

# -------------------------------
# BUILD NEW EMBEDDINGS
# -------------------------------
BATCH_SIZE = 64
new_emb_list = []
new_meta_list = []

batch_paths = []
batch_meta = []

print("\nüöÄ Embedding keyframes with SigLIP...")

for row in tqdm(keyframe_rows, desc="Keyframes"):
    frame_id = row.get("frame_id")

    # If we have frame_id and it's already processed, skip
    if frame_id is not None and frame_id in processed_ids:
        continue

    try:
        rel_path = get_image_rel_path(row)
    except KeyError as e:
        print(f"   ‚ö†Ô∏è Skipping row due to missing path info: {e}")
        continue

    img_path = ROOT / rel_path
    batch_paths.append(img_path)
    batch_meta.append(row)

    if len(batch_paths) >= BATCH_SIZE:
        feats, valid_paths = embed_image_batch(batch_paths)

        if feats is not None:
            path_to_feat_idx = {str(p): i for i, p in enumerate(valid_paths)}
            for meta_row, p in zip(batch_meta, batch_paths):
                p_str = str(p)
                if p_str in path_to_feat_idx:
                    idx = path_to_feat_idx[p_str]
                    new_emb_list.append(feats[idx])
                    new_meta_list.append(meta_row)

        batch_paths = []
        batch_meta = []

# Process any leftover images in the final batch
if batch_paths:
    feats, valid_paths = embed_image_batch(batch_paths)
    if feats is not None:
        path_to_feat_idx = {str(p): i for i, p in enumerate(valid_paths)}
        for meta_row, p in zip(batch_meta, batch_paths):
            p_str = str(p)
            if p_str in path_to_feat_idx:
                idx = path_to_feat_idx[p_str]
                new_emb_list.append(feats[idx])
                new_meta_list.append(meta_row)

# -------------------------------
# COMBINE WITH EXISTING (IF ANY)
# -------------------------------
if new_emb_list:
    new_emb = np.vstack(new_emb_list)
    print(f"\nüßÆ New embeddings computed this run: {new_emb.shape[0]}")

    if existing_emb is not None and existing_emb.shape[0] > 0:
        combined_emb = np.vstack([existing_emb, new_emb])
        combined_meta = existing_meta + new_meta_list
    else:
        combined_emb = new_emb
        combined_meta = new_meta_list
else:
    print("\n‚ÑπÔ∏è No new keyframes needed embedding (everything was already done).")
    combined_emb = existing_emb
    combined_meta = existing_meta

# -------------------------------
# SAVE TO DISK
# -------------------------------
if combined_emb is not None and len(combined_meta) > 0:
    np.save(IMG_EMB_PATH, combined_emb)

    with open(IMG_META_PATH, "w") as f:
        for m in combined_meta:
            f.write(json.dumps(m) + "\n")

    print(f"\n‚úÖ Saved image embeddings to: {IMG_EMB_PATH}")
    print(f"‚úÖ Saved image metadata to : {IMG_META_PATH}")
    print(f"   Total image embeddings: {combined_emb.shape[0]}")
else:
    print("\n‚ö†Ô∏è No image embeddings to save (something might be wrong).")

print("\n‚úÖ CELL 11 COMPLETE ‚Äî SigLIP image embeddings ready.")


üíª Using device: cuda
üñºÔ∏è Loading SigLIP model (google/siglip-base-patch16-384)...
üìÑ Loading keyframe manifest from: /content/drive/MyDrive/UNISEARCH_MASTER/processed/manifests/keyframes_manifest.jsonl
   Total keyframes listed: 33212
   No existing image embeddings found ‚Äî starting fresh.

üöÄ Embedding keyframes with SigLIP...


Keyframes: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 33212/33212 [27:29<00:00, 20.13it/s]



üßÆ New embeddings computed this run: 33212

‚úÖ Saved image embeddings to: /content/drive/MyDrive/UNISEARCH_MASTER/processed/embeddings/image_embeddings.npy
‚úÖ Saved image metadata to : /content/drive/MyDrive/UNISEARCH_MASTER/processed/embeddings/image_meta.jsonl
   Total image embeddings: 33212

‚úÖ CELL 11 COMPLETE ‚Äî SigLIP image embeddings ready.


In [20]:
# === CELL 12: Build FAISS IVF indices for text (BGE) and images (SigLIP) ========
# This cell:
#   1. Loads BGE text embeddings (lectures + papers) + their metadata.
#   2. Loads SigLIP image embeddings (keyframes) + their metadata.
#   3. Builds FAISS IndexIVFFlat (inverted file index, inner product) for both.
#   4. Saves the indices to Drive for fast ANN (approximate nearest neighbor) search.
#
# Assumed files (from previous cells):
#   - /processed/embeddings/text_embeddings_bge.npy
#   - /processed/embeddings/text_meta_bge.jsonl
#   - /processed/embeddings/image_embeddings.npy
#   - /processed/embeddings/image_meta.jsonl
#
# NOTE:
#   - We assume embeddings are already L2-normalized (so IP ‚âà cosine).
#   - IVF is approximate but much faster than flat search, and more scalable.

import os
import json
from pathlib import Path

import numpy as np

# --- Try to import FAISS, install if missing (Colab-friendly) -------------------
try:
    import faiss
except ImportError:
    print("üì¶ faiss not found, installing faiss-cpu...")
    %pip install -q faiss-cpu
    import faiss

# --- Paths ----------------------------------------------------------------------
PROJECT_ROOT   = Path("/content/drive/MyDrive/UNISEARCH_MASTER")
PROCESSED_ROOT = PROJECT_ROOT / "processed"
EMB_ROOT       = PROCESSED_ROOT / "embeddings"
INDICES_ROOT   = PROCESSED_ROOT / "indices"

INDICES_ROOT.mkdir(parents=True, exist_ok=True)

# Text (BGE) embeddings ‚Äî use the ones from your original Cell 10
TEXT_EMB_PATH   = EMB_ROOT / "text_embeddings.npy"
TEXT_META_PATH  = EMB_ROOT / "text_meta.jsonl"

INDEX_TEXT_PATH = INDICES_ROOT / "index_text_bge_ivf.faiss"

# Image (SigLIP) embeddings
IMG_EMB_PATH     = EMB_ROOT / "image_embeddings.npy"
IMG_META_PATH    = EMB_ROOT / "image_meta.jsonl"
INDEX_IMAGE_PATH = INDICES_ROOT / "index_image_siglip_ivf.faiss"

print(f"üìÅ EMBEDDINGS ROOT: {EMB_ROOT}")
print(f"üìÅ INDICES ROOT   : {INDICES_ROOT}")

# Helper: choose a reasonable nlist (number of IVF clusters)
def choose_nlist(n_vectors: int) -> int:
    # Heuristic: about sqrt(N), clipped into [64, 4096]
    import math
    nlist = int(math.sqrt(max(1, n_vectors)))
    nlist = max(64, min(4096, nlist))
    return nlist

# === 1) Build / load TEXT IVF index (BGE) ======================================

if not TEXT_EMB_PATH.exists():
    raise FileNotFoundError(
        f"‚ùå BGE text embeddings not found at: {TEXT_EMB_PATH}\n"
        "Make sure the BGE embeddings cell ran successfully."
    )

print("\nüìö Loading BGE text embeddings...")
text_embs = np.load(TEXT_EMB_PATH)  # shape: [N_text, dim]
num_text, dim_text = text_embs.shape
print(f"   ‚Üí Loaded {num_text} text embeddings with dim={dim_text}")

if not TEXT_META_PATH.exists():
    raise FileNotFoundError(
        f"‚ùå Text meta file not found at: {TEXT_META_PATH}\n"
        "It should have been written alongside the BGE embeddings."
    )

with TEXT_META_PATH.open("r", encoding="utf-8") as f:
    meta_lines = sum(1 for _ in f)
if meta_lines != num_text:
    print(
        f"‚ö†Ô∏è WARNING: text_meta_bge.jsonl line count ({meta_lines}) "
        f"!= embeddings rows ({num_text}). "
        "Index will still be built, but check your pipeline consistency."
    )

if INDEX_TEXT_PATH.exists():
    print(f"\nüì¶ Existing BGE IVF text index found at: {INDEX_TEXT_PATH}")
    index_text = faiss.read_index(str(INDEX_TEXT_PATH))
    if index_text.ntotal != num_text:
        print(
            f"‚ö†Ô∏è Index contains {index_text.ntotal} vectors, "
            f"but we have {num_text} text embeddings."
        )
        print("   If this is stale, delete the .faiss file and re-run this cell.")
    else:
        print("   ‚úì Text IVF index loaded and matches embedding count.")
else:
    print("\nüßÆ Building new FAISS IVF index for BGE text embeddings...")

    # L2 normalize again just in case (cheap and safe)
    text_embs = text_embs.astype("float32")
    norms = np.linalg.norm(text_embs, axis=1, keepdims=True) + 1e-10
    text_embs = text_embs / norms

    nlist_text = choose_nlist(num_text)
    print(f"   ‚Üí Using nlist (clusters) for text: {nlist_text}")

    # Quantizer for IVF: flat inner-product index
    quantizer_text = faiss.IndexFlatIP(dim_text)
    index_text = faiss.IndexIVFFlat(quantizer_text, dim_text, nlist_text, faiss.METRIC_INNER_PRODUCT)

    # Train IVF on the text embeddings
    print("   ‚Üí Training text IVF index...")
    index_text.train(text_embs)
    print("   ‚úì Training complete.")

    # Add all text vectors
    index_text.add(text_embs)
    print(f"   ‚Üí IVF text index built with {index_text.ntotal} vectors.")

    faiss.write_index(index_text, str(INDEX_TEXT_PATH))
    print(f"‚úÖ Saved text IVF index to: {INDEX_TEXT_PATH}")

# === 2) Build / load IMAGE IVF index (SigLIP) ==================================

if not IMG_EMB_PATH.exists():
    raise FileNotFoundError(
        f"\n‚ùå SigLIP image embeddings not found at: {IMG_EMB_PATH}\n"
        "Make sure the SigLIP keyframe embedding cell ran successfully."
    )

print("\nüñºÔ∏è Loading SigLIP image embeddings...")
img_embs = np.load(IMG_EMB_PATH)  # shape: [N_img, dim]
num_img, dim_img = img_embs.shape
print(f"   ‚Üí Loaded {num_img} image embeddings with dim={dim_img}")

if not IMG_META_PATH.exists():
    raise FileNotFoundError(
        f"‚ùå Image meta file not found at: {IMG_META_PATH}\n"
        "It should have been written in the SigLIP cell."
    )

with IMG_META_PATH.open("r", encoding="utf-8") as f:
    img_meta_lines = sum(1 for _ in f)
if img_meta_lines != num_img:
    print(
        f"‚ö†Ô∏è WARNING: image_meta.jsonl line count ({img_meta_lines}) "
        f"!= embeddings rows ({num_img}). "
        "Index will still be built, but check your pipeline consistency."
    )

if INDEX_IMAGE_PATH.exists():
    print(f"\nüì¶ Existing SigLIP IVF image index found at: {INDEX_IMAGE_PATH}")
    index_img = faiss.read_index(str(INDEX_IMAGE_PATH))
    if index_img.ntotal != num_img:
        print(
            f"‚ö†Ô∏è Index contains {index_img.ntotal} vectors, "
            f"but we have {num_img} image embeddings."
        )
        print("   If this is stale, delete the .faiss file and re-run this cell.")
    else:
        print("   ‚úì Image IVF index loaded and matches embedding count.")
else:
    print("\nüßÆ Building new FAISS IVF index for SigLIP image embeddings...")

    img_embs = img_embs.astype("float32")
    norms = np.linalg.norm(img_embs, axis=1, keepdims=True) + 1e-10
    img_embs = img_embs / norms

    nlist_img = choose_nlist(num_img)
    print(f"   ‚Üí Using nlist (clusters) for images: {nlist_img}")

    quantizer_img = faiss.IndexFlatIP(dim_img)
    index_img = faiss.IndexIVFFlat(quantizer_img, dim_img, nlist_img, faiss.METRIC_INNER_PRODUCT)

    print("   ‚Üí Training image IVF index...")
    index_img.train(img_embs)
    print("   ‚úì Training complete.")

    index_img.add(img_embs)
    print(f"   ‚Üí IVF image index built with {index_img.ntotal} vectors.")

    faiss.write_index(index_img, str(INDEX_IMAGE_PATH))
    print(f"‚úÖ Saved image IVF index to: {INDEX_IMAGE_PATH}")

print("\n‚úÖ CELL 12 COMPLETE ‚Äî IVF FAISS indices for text (BGE) and images (SigLIP) are ready.")


üìÅ EMBEDDINGS ROOT: /content/drive/MyDrive/UNISEARCH_MASTER/processed/embeddings
üìÅ INDICES ROOT   : /content/drive/MyDrive/UNISEARCH_MASTER/processed/indices

üìö Loading BGE text embeddings...
   ‚Üí Loaded 38121 text embeddings with dim=1024

üßÆ Building new FAISS IVF index for BGE text embeddings...
   ‚Üí Using nlist (clusters) for text: 195
   ‚Üí Training text IVF index...
   ‚úì Training complete.
   ‚Üí IVF text index built with 38121 vectors.
‚úÖ Saved text IVF index to: /content/drive/MyDrive/UNISEARCH_MASTER/processed/indices/index_text_bge_ivf.faiss

üñºÔ∏è Loading SigLIP image embeddings...
   ‚Üí Loaded 33212 image embeddings with dim=768

üßÆ Building new FAISS IVF index for SigLIP image embeddings...
   ‚Üí Using nlist (clusters) for images: 182
   ‚Üí Training image IVF index...
   ‚úì Training complete.
   ‚Üí IVF image index built with 33212 vectors.
‚úÖ Saved image IVF index to: /content/drive/MyDrive/UNISEARCH_MASTER/processed/indices/index_image_siglip_i