# YouTube RAG Pipeline

Pipeline
1. **Loading Data**
2. **Chunking**
3. **Embeddings** (OpenAI-Embeddings)
4. **VectorDB** (Chroma)
5. **Retriever**
6. **LLM** (OpenAI Chat-Model)
7. **Chain** (Conversational Retrieval)
8. **Memory**


## 0. YouTube videos / transcripts

Imports

In [None]:
!pip uninstall -y youtube-transcript-api
!pip install youtube-transcript-api==0.6.2

In [None]:
import os
from google.colab import userdata  # this is what reads Colab secrets

# Read the OpenAI key from Colab Secrets.
openai_key = userdata.get("OPENAI_API_KEY")

# Reject a missing secret as well as an empty / whitespace-only one: a blank
# value would otherwise be exported and only fail later, at the first API call.
if not openai_key or not openai_key.strip():
    raise ValueError("OPENAI_API_KEY not found in Colab secrets. Check the name.")

# Drop stray whitespace (e.g. a trailing newline from copy/paste) before exporting.
openai_key = openai_key.strip()

# Set as environment variable so LangChain & others can use it
os.environ["OPENAI_API_KEY"] = openai_key

# Only report that the key is present; never print the key itself.
print("Loaded OPENAI_API_KEY from Colab secrets:", os.environ["OPENAI_API_KEY"] is not None)


In [None]:
!pip install youtube-transcript-api==0.6.2 chromadb sentence-transformers transformers accelerate torch

In [None]:
from urllib.parse import urlparse, parse_qs

from youtube_transcript_api import (
    YouTubeTranscriptApi,
    TranscriptsDisabled,
    NoTranscriptFound,
)

import pandas as pd
import chromadb
from chromadb.utils import embedding_functions

YouTube ingestion
- URL -> video_id
- video_id -> transcript (list)
- transcript -> plain text

In [None]:
def extract_video_id(url: str) -> str:
    """Extract the YouTube video ID from a video URL.

    Recognised forms (host match is case-insensitive, a leading ``www.`` is
    ignored, and a missing scheme is tolerated):

    - ``youtu.be/<id>``
    - ``youtube.com/watch?v=<id>`` (also ``m.`` and ``music.`` hosts)
    - ``youtube.com/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``, ``/v/<id>``

    Args:
        url: The video URL.

    Returns:
        The video ID string (never empty).

    Raises:
        ValueError: If no video ID can be found in ``url``.
    """
    failure = f"Could not extract video_id from URL: {url}"
    if not isinstance(url, str):
        raise ValueError(failure)

    candidate = url.strip()
    # Without a scheme urlparse puts the host into .path, so add one.
    if "://" not in candidate:
        candidate = "https://" + candidate.lstrip("/")

    try:
        parsed = urlparse(candidate)
        # .hostname is lower-cased and excludes any port / credentials.
        host = (parsed.hostname or "").lower()
    except ValueError as exc:
        raise ValueError(failure) from exc

    host = host.removeprefix("www.")
    segments = [part for part in parsed.path.split("/") if part]

    # Short youtu.be links: the ID is the first path segment.
    if host == "youtu.be":
        if segments:
            return segments[0]

    # Regular youtube.com links
    elif host in (
        "youtube.com",
        "m.youtube.com",
        "music.youtube.com",
        "youtube-nocookie.com",
    ):
        vid = parse_qs(parsed.query).get("v", [None])[0]
        if vid:
            return vid

        # Path-based forms such as /shorts/<id> or /embed/<id>.
        if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
            return segments[1]

    raise ValueError(failure)


def transcript_to_text(transcript, include_timestamps: bool = False) -> str:
    """
    Flatten transcript segments into one space-separated string.

    transcript: iterable of dict-like segments, e.g.
        [{'text': '...', 'start': 0.0, 'duration': 3.2}, ...]
    include_timestamps: when True, each segment is prefixed with "[MM:SS] "
        computed from its 'start' value (0 when the key is absent).

    Segments whose 'text' is missing or empty are dropped.
    """

    def render(segment) -> str:
        # Only reached for segments that carry non-empty text.
        body = segment.get("text", "")
        if not include_timestamps:
            return body

        offset = segment.get("start", 0)
        mins = int(offset // 60)
        secs = int(offset % 60)
        return f"[{mins:02d}:{secs:02d}] " + body

    return " ".join(
        render(segment) for segment in transcript if segment.get("text", "")
    )


def fetch_transcript_text(
    video_id: str,
    preferred_languages=None,
    include_timestamps: bool = False,
) -> str:
    """
    Fetch a video's transcript and return it as plain text.

    Lookup order:
    1) the preferred_languages (if provided)
    2) otherwise English-ish defaults ("en", "en-US", "en-GB")
    3) if that fails, fall back to the first available transcript

    Args:
        video_id: YouTube video ID.
        preferred_languages: Language codes as a list/tuple or as a single
            comma-separated string. ``None`` or an empty value selects the
            English defaults.
        include_timestamps: Forwarded to ``transcript_to_text``.

    Returns:
        The transcript text produced by ``transcript_to_text``.

    Raises:
        RuntimeError: If transcripts are disabled, none exists, or any other
            error occurs while fetching; the original exception is chained
            as ``__cause__``.
    """
    default_languages = ["en", "en-US", "en-GB"]

    # Normalize a comma-separated string to a list of codes.
    if isinstance(preferred_languages, str):
        preferred_languages = [
            lang.strip() for lang in preferred_languages.split(",") if lang.strip()
        ]

    # None, "" and [] all mean "no preference"; never pass an empty language
    # list to the API.
    preferred_languages = list(preferred_languages or default_languages)

    try:
        try:
            # First: try preferred languages
            transcript = YouTubeTranscriptApi.get_transcript(
                video_id,
                languages=preferred_languages,
            )
        except NoTranscriptFound:
            print(
                f"  No transcript in preferred languages {preferred_languages} "
                f"for {video_id}. Trying any available transcript..."
            )

            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
            # Just pick the first available transcript; an empty listing is
            # reported explicitly instead of leaking a bare StopIteration.
            first_available = next(iter(transcript_list), None)
            if first_available is None:
                raise RuntimeError(
                    f"No transcript found at all for video_id={video_id} "
                    f"(even after fallback)."
                ) from None
            transcript = first_available.fetch()

        return transcript_to_text(transcript, include_timestamps=include_timestamps)

    except TranscriptsDisabled as exc:
        raise RuntimeError(
            f"Transcripts are DISABLED for video_id={video_id}"
        ) from exc
    except NoTranscriptFound as exc:
        raise RuntimeError(
            f"No transcript found at all for video_id={video_id} "
            f"(even after fallback)."
        ) from exc
    except RuntimeError:
        # Already in the shape callers expect; do not wrap it a second time.
        raise
    except Exception as exc:
        raise RuntimeError(
            f"Error fetching transcript for {video_id}: {exc}"
        ) from exc


def ingest_youtube_videos(
    urls,
    languages=None,  # passed to preferred_languages
) -> pd.DataFrame:
    """
    Fetch transcripts for a batch of YouTube URLs.

    For each URL:
    - extract video_id
    - fetch transcript
    - store in DataFrame

    URLs that fail at any step are reported and skipped (best effort).

    Args:
        urls: An iterable of URL strings (list, tuple, generator, ...) or a
            single URL string.
        languages: Preferred transcript languages, forwarded to
            ``fetch_transcript_text`` as ``preferred_languages``.

    Returns:
        A DataFrame with columns ``video_id``, ``url`` and ``transcript``,
        one row per successfully ingested video. The columns are present
        even when no video could be ingested.
    """
    columns = ["video_id", "url", "transcript"]

    # A bare string would otherwise be iterated character by character.
    if isinstance(urls, str):
        urls = [urls]
    # Materialize once so generators work and can still be counted below.
    urls = list(urls)

    rows = []

    for url in urls:
        print(f"\n=== Processing URL: {url} ===")
        try:
            video_id = extract_video_id(url)
            print(f"  video_id: {video_id}")

            transcript = fetch_transcript_text(
                video_id,
                preferred_languages=languages,
            )
            print(f"  ✅ Transcript length: {len(transcript)} characters")

            rows.append(
                {
                    "video_id": video_id,
                    "url": url,
                    "transcript": transcript,
                }
            )

        except Exception as e:
            # Deliberate best effort: one bad URL must not abort the batch.
            print(f"  ⚠️ Skipping {url}: {e}")

    # Explicit columns keep the schema stable when `rows` is empty, so
    # downstream code can still index df["transcript"] etc.
    df = pd.DataFrame(rows, columns=columns)
    print(f"\nIngested {len(df)} videos out of {len(urls)} URLs.")
    return df

Store transcripts in Chroma

In [None]:
def build_chroma_collection_from_df(
    df: pd.DataFrame,
    collection_name: str = "youtube_videos",
):
    """Create a Chroma collection from a DataFrame of transcripts.

    Any existing collection with the same name is deleted and recreated, so
    the result only contains the rows of ``df``.

    Args:
        df: DataFrame with columns 'video_id', 'url', 'transcript'. May be
            empty, in which case an empty collection is returned.
        collection_name: Name of the collection to (re)create.

    Returns:
        The Chroma collection, with one document per unique ``video_id``.
    """
    # 2.1 Set up Chroma client (in-memory for now; for persistence use PersistentClient)
    client = chromadb.Client()

    # 2.2 Define an embedding function (SentenceTransformer)
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-MiniLM-L6-v2"
    )

    # 2.3 Create (or recreate) the collection.
    # list_collections() yields Collection objects in some Chroma releases and
    # plain names in others; getattr() copes with both.
    existing = {getattr(c, "name", c) for c in client.list_collections()}
    if collection_name in existing:
        client.delete_collection(collection_name)

    collection = client.create_collection(
        name=collection_name,
        embedding_function=embedding_func,
    )

    # 2.4 Add documents to collection
    if df.empty:
        # Nothing to add (and an empty frame may not even have the columns).
        print(
            f"Added 0 transcripts to Chroma collection "
            f"'{collection_name}'."
        )
        return collection

    # video_id doubles as the Chroma ID, which must be unique within one add().
    unique_rows = df.drop_duplicates(subset="video_id", keep="first")

    documents = unique_rows["transcript"].tolist()
    ids = unique_rows["video_id"].tolist()
    metadatas = unique_rows[["video_id", "url"]].to_dict(orient="records")

    collection.add(
        documents=documents,
        ids=ids,
        metadatas=metadatas,
    )

    print(
        f"Added {len(documents)} transcripts to Chroma collection "
        f"'{collection_name}'."
    )
    return collection

Transcribe

In [None]:
# Videos to ingest for this run.
video_urls = [
    "https://www.youtube.com/watch?v=enD8mK9Zvwo",
    "https://www.youtube.com/watch?v=ZdjJdoEwCY4",
]

df_videos = ingest_youtube_videos(video_urls, languages=["en"])

print("\nIngested videos DataFrame:")
print(df_videos.head())

if df_videos.empty:
    # Only warns, so the notebook keeps running; raise SystemExit here
    # instead if later cells should not execute without data.
    print("❌ No videos ingested successfully – check URLs or transcripts settings.")
else:
    print("✅ At least one transcript ingested.")

In [None]:
df_videos.head()

URL ingestion for YouTube links

Add chunking + LangChain Documents on top of your DataFrame

In [None]:
!pip install langchain langchain-openai langchain-community langsmith

In [None]:
!pip install -q -U langchain langchain-openai langchain-core langchain-community langchain-text-splitters
!pip install pytube


In [None]:
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pytube import YouTube  # to enrich metadata from YouTube


def df_to_documents(
    df: pd.DataFrame,
    chunk_size: int = 1000,
    chunk_overlap: int = 150,
) -> list[Document]:
    """Turn each (video_id, url, transcript) row into chunked LangChain Documents.

    Every transcript is split with a RecursiveCharacterTextSplitter; each
    chunk becomes one Document whose metadata carries the video_id, url,
    title, author, description and the chunk's index within its video.
    Title / author / description come from pytube and stay empty strings
    when that lookup fails.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    def lookup_video_info(url) -> dict:
        # Best effort: whatever was fetched before a failure is kept,
        # everything else stays "".
        info = {"title": "", "author": "", "description": ""}
        try:
            yt = YouTube(url)
            info["title"] = yt.title or ""
            info["author"] = yt.author or ""
            info["description"] = yt.description or ""
        except Exception as e:
            print(f"Could not fetch metadata for {url}: {e}")
        return info

    collected: list[Document] = []

    for _, record in df.iterrows():
        video_id = record["video_id"]
        video_url = record["url"]

        # Chunk first, then enrich: same order as the metadata is used below.
        pieces = text_splitter.split_text(record["transcript"])
        info = lookup_video_info(video_url)

        collected.extend(
            Document(
                page_content=piece,
                metadata={
                    "video_id": video_id,
                    "url": video_url,
                    **info,
                    "chunk_index": position,
                },
            )
            for position, piece in enumerate(pieces)
        )

    return collected

In [None]:
# Chunk every ingested transcript into LangChain Documents and report how
# many chunks the videos produced in total.
documents = df_to_documents(df_videos)
print(f"Created {len(documents)} chunks from {len(df_videos)} videos.")

Build a LangChain VectorStore (Chroma) from Documents

In [None]:
import os
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings  # requires OPENAI_API_KEY

def build_vectorstore_from_documents(
    docs: list[Document],
    collection_name: str = "youtube_rag",
    persist_directory: str | None = None,
):
    """
    Build a Chroma vector store from LangChain Documents.

    Uses OpenAI embeddings (model "text-embedding-3-small"), which requires
    OPENAI_API_KEY to be set.

    Args:
        docs: Documents to embed and index. May be empty.
        collection_name: Name of the Chroma collection.
        persist_directory: Directory passed through to Chroma; ``None`` is
            passed through unchanged.

    Returns:
        The Chroma vector store. For an empty ``docs`` list this is an empty
        store bound to the same collection and embedding function.
    """
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    if not docs:
        # Nothing to embed: hand back an empty store rather than pushing an
        # empty batch through from_documents.
        print(f"No documents supplied; returning empty collection '{collection_name}'.")
        return Chroma(
            collection_name=collection_name,
            embedding_function=embeddings,
            persist_directory=persist_directory,
        )

    vectorstore = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        collection_name=collection_name,
        persist_directory=persist_directory,
    )
    return vectorstore

In [None]:
# Chunk the ingested transcripts, then embed them into the "youtube_rag"
# collection stored under ./chroma_youtube_rag.
documents = df_to_documents(df_videos)
vectorstore = build_vectorstore_from_documents(
    docs=documents,
    persist_directory="./chroma_youtube_rag",
    collection_name="youtube_rag",
)