In [1]:
# --- Self-contained profiler for YT-RAG indexing ---
import hashlib
import json
import os
import shutil
import time
from pathlib import Path
from typing import Any, Dict, List, Optional

import numpy as np

# Embeddings / Vector store (LangChain community)
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import FAISS as LCFAISS
from langchain.docstore.document import Document

# Transcript API
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled

# Try to import faiss (CPU or GPU)
try:
    import faiss  # faiss-cpu or faiss-gpu
except Exception as e:
    raise RuntimeError(
        "FAISS is required. Install with `pip install faiss-cpu` (or faiss-gpu on Linux)."
    ) from e

# -------------------- Config --------------------
# Root folder for cached artifacts; _vid_dir() appends the video id to it.
CACHE_DIR = Path(".yt_rag_cache")
# Default window length (seconds) used by group_segments().
TARGET_WINDOW_SECONDS = 75  # ~60-90s is good for long videos

# -------------------- Helpers -------------------
def _vid_dir(video_id: str) -> Path:
    """Return the cache sub-directory path for ``video_id``.

    The path is ``CACHE_DIR`` joined with the id; nothing is created on disk.
    """
    return CACHE_DIR.joinpath(video_id)

def _ts(sec: float) -> str:
    sec = int(sec)
    h = sec // 3600
    m = (sec % 3600) // 60
    s = sec % 60
    return f"{h:d}:{m:02d}:{s:02d}" if h else f"{m:d}:{s:02d}"

def _dir_size_bytes(p: Path) -> int:
    if not p.exists():
        return 0
    total = 0
    for child in p.rglob("*"):
        if child.is_file():
            try:
                total += child.stat().st_size
            except FileNotFoundError:
                pass
    return total

def fetch_transcript(video_id: str, languages: Optional[List[str]] = None) -> List[Dict[str, Any]]:
    """Fetch raw transcript segments (start, duration, text).

    Args:
        video_id: YouTube video id handed to ``YouTubeTranscriptApi.fetch``.
        languages: Language codes forwarded to the API. ``None`` (the default)
            means ``["en"]``.

    Returns:
        The result of ``to_raw_data()`` on the fetched transcript.

    Raises:
        RuntimeError: If captions are disabled for the video, or if any other
            error occurs while fetching. The original exception is attached
            as ``__cause__``.
    """
    # Build the default per call instead of sharing one mutable list object
    # across every invocation.
    if languages is None:
        languages = ["en"]
    try:
        t0 = time.time()
        api = YouTubeTranscriptApi()
        fetched = api.fetch(video_id, languages=languages)
        data = fetched.to_raw_data()
        print(f"‚úÖ Transcript fetched: {len(data)} segments in {time.time()-t0:.2f}s")
        return data
    except TranscriptsDisabled as e:
        raise RuntimeError("No captions available for this video.") from e
    except Exception as e:
        # Deliberately broad: any failure is surfaced as one RuntimeError type,
        # with the underlying error chained for debugging.
        raise RuntimeError(f"Failed to fetch transcript: {e}") from e

def group_segments(segments: List[Dict[str, Any]], target_window_s: int = TARGET_WINDOW_SECONDS) -> List[Dict[str, Any]]:
    """Merge consecutive transcript segments into windows of about ``target_window_s`` seconds.

    Segments are consumed in order. A window is closed as soon as the span from
    its first segment's start to the current segment's end reaches the target;
    any leftover segments form a final, shorter window.

    Args:
        segments: Dicts with ``"start"`` and ``"text"`` keys; ``"duration"`` is
            optional and treated as 0 when absent.
        target_window_s: Minimum span (seconds) before a window is closed.

    Returns:
        A list of ``{"start", "end", "text"}`` dicts, one per window, where
        ``text`` is the stripped segment texts joined by single spaces.
    """
    windows = []
    pieces = []
    win_start = None
    win_end = None

    for seg in segments:
        seg_start = seg["start"]
        win_end = seg_start + seg.get("duration", 0)
        if win_start is None:
            win_start = seg_start
        pieces.append(seg["text"].strip())

        # Keep accumulating until the window is long enough.
        if (win_end - win_start) < target_window_s:
            continue

        windows.append({"start": win_start, "end": win_end, "text": " ".join(pieces).strip()})
        pieces = []
        win_start = None
        win_end = None

    # Flush whatever is left as a trailing (shorter) window.
    if pieces:
        windows.append({"start": win_start, "end": win_end, "text": " ".join(pieces).strip()})

    print(f"üß© Windows created: {len(windows)} (‚âà{target_window_s}s each)")
    return windows

def make_docs(video_id: str, windows: List[Dict[str, Any]]) -> List[Document]:
    """Turn each window into a ``Document`` whose content is the window text.

    Each document's metadata carries ``video_id``, the window's ``start`` and
    ``end``, and ``window_id`` (the window's position in ``windows``).
    """
    return [
        Document(
            metadata={
                "video_id": video_id,
                "start": window["start"],
                "end": window["end"],
                "window_id": idx,
            },
            page_content=window["text"],
        )
        for idx, window in enumerate(windows)
    ]

# -------------------- Profiler -------------------
def profile_build_index(video_id: str, time_langchain_save: bool = True):
    """Build a fresh index for one video and report per-phase timings and sizes.

    Phases, each timed separately:
      1. transcript fetch, window grouping, ``Document`` creation
      2. embedding with ``FastEmbedEmbeddings``
      3. adding the vectors to a native ``faiss.IndexFlatIP``
      4. saving the native index plus a ``metas.jsonl`` metadata file
      5. (optional) building a LangChain FAISS store and calling ``save_local``

    Side effects: removes and recreates ``<cache>/<video_id>/faiss_prof``, writes
    ``index.faiss`` and ``metas.jsonl`` there (plus an ``lc/`` folder when
    ``time_langchain_save`` is true), writes ``meta_prof.json`` into the video's
    cache folder, and prints a report.

    Args:
        video_id: Video id passed to ``fetch_transcript``.
        time_langchain_save: Also time the LangChain ``save_local`` path. Any
            failure in that phase is reported and its timing is recorded as 0.0.

    Returns:
        A dict with the timing keys ``t_fetch``, ``t_group``, ``t_docs``,
        ``t_embed_only``, ``t_faiss_add``, ``t_save_native``, ``t_save_lc``,
        ``total`` (seconds) and the size keys ``size_native``, ``size_lc`` (bytes).
    """
    vdir = _vid_dir(video_id)
    idx_dir = vdir / "faiss_prof"
    lc_dir = idx_dir / "lc"  # LangChain save_local target
    meta_fp = vdir / "meta_prof.json"

    # Start from an empty directory so the measured sizes belong to this run only.
    if idx_dir.exists():
        shutil.rmtree(idx_dir, ignore_errors=True)

    # Creates the cache root and the video sub-folder as needed.
    idx_dir.mkdir(parents=True, exist_ok=True)

    print("üîé Profiling fresh index build‚Ä¶")
    # perf_counter is monotonic and high-resolution, which suits interval timing
    # better than the wall clock.
    clock = time.perf_counter
    t0 = clock()

    # ---- Phase: transcript -> windows -> docs
    t = clock()
    segs = fetch_transcript(video_id)
    t_fetch = clock() - t

    t = clock()
    wins = group_segments(segs)
    t_group = clock() - t

    t = clock()
    docs = make_docs(video_id, wins)
    t_docs = clock() - t

    # ---- Phase: embeddings (timed explicitly)
    # NOTE: the FastEmbedEmbeddings() constructor runs inside the timed region,
    # so this figure includes whatever work construction performs.
    t = clock()
    embeddings = FastEmbedEmbeddings()
    texts = [doc.page_content for doc in docs]
    vecs = embeddings.embed_documents(texts) if texts else []
    t_embed_only = clock() - t

    # ---- Phase: FAISS add (native, timed)
    t = clock()
    arr = np.array(vecs, dtype="float32") if len(vecs) else np.zeros((0, 384), dtype="float32")
    if arr.ndim != 2:
        # Ensure correct shape even for a single vector
        arr = np.atleast_2d(arr).astype("float32")
    dim = arr.shape[1] if arr.size else 384  # default dim guess if empty
    index = faiss.IndexFlatIP(dim)
    if arr.size:
        index.add(arr)
    t_faiss_add = clock() - t

    # ---- Phase: Save (native faiss + simple metadata)
    t = clock()
    idx_bin = idx_dir / "index.faiss"
    faiss.write_index(index, str(idx_bin))
    meta_out = idx_dir / "metas.jsonl"
    with meta_out.open("w", encoding="utf-8") as f:
        f.writelines(json.dumps(doc.metadata, ensure_ascii=False) + "\n" for doc in docs)
    t_save_native = clock() - t

    # ---- Phase: Also time LangChain's save_local (optional; never crashes the run)
    t_save_lc = 0.0
    if time_langchain_save:
        t = clock()
        try:
            lc_dir.mkdir(parents=True, exist_ok=True)
            try:
                # Reuse the precomputed vectors. from_embeddings expects
                # (text, vector) pairs under the `text_embeddings` keyword.
                vs = LCFAISS.from_embeddings(
                    text_embeddings=list(zip(texts, vecs)),
                    embedding=embeddings,
                    metadatas=[doc.metadata for doc in docs],
                )
            except Exception:
                # Fallback (e.g. no documents, or an LC version without a
                # compatible from_embeddings): assemble the store by hand,
                # again without re-embedding.
                from langchain.docstore.in_memory import InMemoryDocstore
                from langchain.docstore.document import Document as LCDocument
                import uuid

                ids = [str(uuid.uuid4()) for _ in docs]
                docstore = InMemoryDocstore({
                    doc_id: LCDocument(page_content=doc.page_content, metadata=doc.metadata)
                    for doc_id, doc in zip(ids, docs)
                })
                index2 = faiss.IndexFlatIP(dim)
                if arr.size:
                    index2.add(arr)
                vs = LCFAISS(
                    embedding_function=embeddings,
                    index=index2,
                    docstore=docstore,
                    index_to_docstore_id=dict(enumerate(ids)),
                )

            vs.save_local(str(lc_dir))
            t_save_lc = clock() - t
        except Exception as e:
            print(f"‚ÑπÔ∏è Skipping LangChain save_local timing (not supported in this LC version): {e}")
            t_save_lc = 0.0

    # ---- Sizes
    size_native = idx_bin.stat().st_size if idx_bin.exists() else 0
    size_lc = _dir_size_bytes(lc_dir)

    total = clock() - t0

    print(f"\n‚è±Ô∏è  Timings (s):"
          f"\n  transcript fetch     : {t_fetch:.2f}"
          f"\n  window grouping      : {t_group:.2f}"
          f"\n  docs creation        : {t_docs:.2f}"
          f"\n  EMBEDDING (only)     : {t_embed_only:.2f}"
          f"\n  FAISS add (flat)     : {t_faiss_add:.3f}"
          f"\n  Save native faiss    : {t_save_native:.2f}"
          f"\n  Save LangChain local : {t_save_lc:.2f}"
          f"\n  ----------------------------"
          f"\n  TOTAL (this profiler): {total:.2f}")

    print(f"\nüì¶ Sizes:"
          f"\n  native index.faiss   : {size_native/1e6:.2f} MB"
          f"\n  LC save_local folder : {size_lc/1e6:.2f} MB")

    # Single source of truth for the timings used by both the summary file and
    # the return value.
    timings = {
        "t_fetch": t_fetch,
        "t_group": t_group,
        "t_docs": t_docs,
        "t_embed_only": t_embed_only,
        "t_faiss_add": t_faiss_add,
        "t_save_native": t_save_native,
        "t_save_lc": t_save_lc,
        "total": total,
    }

    # Persist a tiny meta summary. Best-effort: a failure is reported but does
    # not abort the run or change the returned stats.
    try:
        vdir.mkdir(parents=True, exist_ok=True)
        with meta_fp.open("w", encoding="utf-8") as f:
            json.dump({
                "video_id": video_id,
                "num_windows": len(wins),
                "dim": int(arr.shape[1]) if arr.size else 0,
                "native_index_path": str(idx_bin),
                "lc_dir": str(lc_dir),
                "timings": timings,
            }, f, ensure_ascii=False, indent=2)
    except Exception as e:
        print(f"Warning: could not write profiler summary to {meta_fp}: {e}")

    return {**timings, "size_native": size_native, "size_lc": size_lc}

# -------- Example run --------
# Executes the profiler once when this cell runs: it fetches the transcript for
# the given video id, builds the index under CACHE_DIR, and prints the report.
# You can change the video id if you like.
stats = profile_build_index("3qHkcs3kG44", time_langchain_save=True)
# `stats` is the dict of timings (seconds) and sizes (bytes) returned above.
print("\nReturned stats:", stats)


üîé Profiling fresh index build‚Ä¶
‚úÖ Transcript fetched: 4076 segments in 2.76s
üß© Windows created: 108 (‚âà75s each)

‚è±Ô∏è  Timings (s):
  transcript fetch     : 2.77
  window grouping      : 0.00
  docs creation        : 0.00
  EMBEDDING (only)     : 48.09
  FAISS add (flat)     : 0.034
  Save native faiss    : 0.01
  Save LangChain local : 0.07
  ----------------------------
  TOTAL (this profiler): 50.97

üì¶ Sizes:
  native index.faiss   : 0.17 MB
  LC save_local folder : 0.32 MB

Returned stats: {'t_fetch': 2.7651867866516113, 't_group': 0.0010025501251220703, 't_docs': 0.0010073184967041016, 't_embed_only': 48.09089159965515, 't_faiss_add': 0.03351998329162598, 't_save_native': 0.01379251480102539, 't_save_lc': 0.06844496726989746, 'total': 50.97384572029114, 'size_native': 165933, 'size_lc': 324431}


---