<a href="https://colab.research.google.com/github/kr5red/Project-4-Business-Case-Multimodal-AI-ChatBot-for-YouTube-Video-QA/blob/main/main_version2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# YouTube RAG Pipeline

Pipeline: YouTube URLs → transcripts → chunked LangChain Documents → Chroma vector store.

## 0. YouTube videos / transcripts

Imports

In [18]:
pip install youtube-transcript-api chromadb



In [19]:
!pip install langchain langchain-openai langchain-community langsmith
!pip install -q -U langchain langchain-openai langchain-core langchain-community
!pip install pytube



In [20]:
from urllib.parse import urlparse, parse_qs

from youtube_transcript_api import (
    YouTubeTranscriptApi,
    TranscriptsDisabled,
    NoTranscriptFound,
)

import pandas as pd

YouTube ingestion
- URL -> video_id
- video_id -> transcript (list)
- transcript -> plain text

In [21]:
def extract_video_id(url: str) -> str:
    """Extract the YouTube video ID from the common YouTube URL formats.

    Supported shapes:
      - ``https://youtu.be/<id>`` (short links, with or without a query string)
      - ``https://www.youtube.com/watch?v=<id>``
      - ``https://www.youtube.com/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``, ``/v/<id>``

    Args:
        url: Full video URL, including the scheme.

    Returns:
        The video ID as a non-empty string.

    Raises:
        ValueError: If the host is not a recognised YouTube host or the URL
            carries no video ID.
    """
    parsed = urlparse(url)

    # `hostname` is lower-cased and has any port stripped, unlike `netloc`.
    host = (parsed.hostname or "").lower()
    # Non-empty path segments, so trailing slashes never end up in the ID.
    segments = [segment for segment in parsed.path.split("/") if segment]

    # Short youtu.be links: the ID is the first path segment.
    if host in ("youtu.be", "www.youtu.be"):
        if segments:
            return segments[0]

    # Regular youtube.com links
    elif host in (
        "www.youtube.com",
        "youtube.com",
        "m.youtube.com",
        "music.youtube.com",
        "www.youtube-nocookie.com",
    ):
        vid = parse_qs(parsed.query).get("v", [None])[0]
        if vid:
            return vid

        # Path-style links such as /embed/<id> or /shorts/<id>.
        if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
            return segments[1]

    raise ValueError(f"Could not extract video_id from URL: {url}")

def transcript_to_text(transcript, include_timestamps: bool = False) -> str:
    """Flatten a transcript into one space-separated string.

    Args:
        transcript: Iterable of mappings with a ``"text"`` key and, when
            timestamps are requested, a numeric ``"start"`` key.
        include_timestamps: When true, prefix each snippet with its start
            time formatted as ``[<seconds>s]`` with one decimal place.

    Returns:
        All snippet texts joined by single spaces (empty string for an
        empty transcript).
    """
    if include_timestamps:
        pieces = (
            f"[{snippet['start']:.1f}s] {snippet['text']}" for snippet in transcript
        )
    else:
        pieces = (snippet["text"] for snippet in transcript)
    return " ".join(pieces)


def fetch_transcript_text(video_id: str, languages=None) -> str:
    """Fetch the transcript for a single video and return it as plain text.

    Args:
        video_id: YouTube video ID.
        languages: Preferred language codes. Either an iterable of codes
            (e.g. ``["en", "de"]``), a comma-separated string (e.g.
            ``"en, de"``), or ``None`` to let the transcript API use its
            own default.

    Returns:
        The transcript snippets joined into one string, without timestamps.

    Raises:
        RuntimeError: If transcripts are disabled, no transcript exists for
            the requested languages, or any other error occurs while
            fetching. The original exception is attached as ``__cause__``.
    """
    # The API expects an iterable of language codes; a bare string would be
    # consumed one character at a time, so split it into proper codes first.
    if isinstance(languages, str):
        languages = [code.strip() for code in languages.split(",") if code.strip()] or None

    try:
        ytt_api = YouTubeTranscriptApi()

        if languages is None:
            fetched = ytt_api.fetch(video_id)
        else:
            fetched = ytt_api.fetch(video_id, languages=languages)

        # `fetched` is a FetchedTranscript object with `.snippets`;
        # convert to the structure transcript_to_text() expects.
        transcript = [
            {"text": s.text, "start": s.start, "duration": s.duration}
            for s in fetched.snippets
        ]

        return transcript_to_text(transcript, include_timestamps=False)

    except TranscriptsDisabled as exc:
        raise RuntimeError(f"Transcripts are disabled for video_id={video_id}") from exc
    except NoTranscriptFound as exc:
        raise RuntimeError(
            f"No transcript found for video_id={video_id} in languages={languages}"
        ) from exc
    except Exception as exc:
        raise RuntimeError(f"Error fetching transcript for {video_id}: {exc}") from exc


Ingest YouTube videos into a DataFrame

In [22]:
def ingest_youtube_videos(urls, languages="en, de") -> pd.DataFrame:
    """Fetch transcripts for several YouTube URLs and collect them in a DataFrame.

    Videos that cannot be processed (bad URL, missing transcript, ...) are
    reported via ``print`` and skipped, so one failure does not abort the batch.

    Args:
        urls: Iterable of YouTube video URLs.
        languages: Preferred transcript languages, either an iterable of
            language codes or a comma-separated string such as ``"en, de"``.

    Returns:
        DataFrame with the columns ``video_id``, ``url`` and ``transcript``,
        one row per successfully ingested video. The columns are present
        even when no video could be ingested.
    """
    # The transcript API expects an iterable of language codes; a bare string
    # would be consumed one character at a time, so split it into codes first.
    if isinstance(languages, str):
        languages = [code.strip() for code in languages.split(",") if code.strip()] or None

    rows = []
    for url in urls:
        try:
            video_id = extract_video_id(url)
            transcript = fetch_transcript_text(video_id, languages=languages)
            rows.append({
                "video_id": video_id,
                "url": url,
                "transcript": transcript,
            })
        except Exception as e:
            print(f"Skipping {url}: {e}")

    # Explicit columns keep the schema stable when every video was skipped.
    return pd.DataFrame(rows, columns=["video_id", "url", "transcript"])


In [23]:
# Ingest the configured videos (append more URLs to the list as needed).
video_urls = ["https://www.youtube.com/watch?v=HG68Ymazo18"]

df_videos = ingest_youtube_videos(video_urls, languages=["en"])

# Show a preview of the table, or a notice when nothing could be ingested.
print("No videos ingested ..." if df_videos.empty else df_videos.head())

      video_id                                          url  \
0  HG68Ymazo18  https://www.youtube.com/watch?v=HG68Ymazo18   

                                          transcript  
0  Arguably, the most crucial\npart of the job se...  


In [24]:
# Display the first rows of the ingested videos as the cell's output.
df_videos.head()

Unnamed: 0,video_id,url,transcript
0,HG68Ymazo18,https://www.youtube.com/watch?v=HG68Ymazo18,"Arguably, the most crucial\npart of the job se..."


URL ingestion for youtube links

Add Chunking + LangChain Documents on Top of the DataFrame

In [10]:
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

from pytube import YouTube  # to enrich metadata from YouTube

def df_to_documents(
    df: pd.DataFrame,
    chunk_size: int = 1000,
    chunk_overlap: int = 150,
) -> list[Document]:
    """Convert each row of ``df`` into chunked LangChain Documents with metadata.

    Args:
        df: DataFrame with the columns ``video_id``, ``url`` and ``transcript``.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to ``RecursiveCharacterTextSplitter``.

    Returns:
        One Document per transcript chunk. Each carries ``video_id``, ``url``,
        ``title``, ``author``, ``description`` and ``chunk_index`` metadata.
        ``title``/``author``/``description`` stay ``None`` when the YouTube
        metadata lookup fails.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    docs: list[Document] = []

    for _, row in df.iterrows():
        video_id = row["video_id"]
        url = row["url"]
        transcript = row["transcript"]

        # Metadata enrichment is best-effort: a failed lookup must not block
        # chunking, but it is reported instead of being silently swallowed.
        title = author = description = None
        try:
            yt = YouTube(url)
            title = yt.title
            author = yt.author
            description = yt.description
        except Exception as exc:
            print(f"Could not fetch YouTube metadata for {url}: {exc}")

        # Split the transcript and wrap every chunk in a Document.
        docs.extend(
            Document(
                page_content=chunk,
                metadata={
                    "video_id": video_id,
                    "url": url,
                    "title": title,
                    "author": author,
                    "description": description,
                    "chunk_index": idx,
                },
            )
            for idx, chunk in enumerate(splitter.split_text(transcript))
        )

    return docs


In [11]:
# Chunk every ingested transcript into LangChain Documents and report the totals.
documents = df_to_documents(df_videos)
print(f"Created {len(documents)} chunks from {len(df_videos)} videos.")

Created 6 chunks from 1 videos.


Build a LangChain VectorStore (Chroma) from Documents

In [15]:
import os
from google.colab import userdata  # this is what reads Colab secrets

# Read the OpenAI key from Colab Secrets.
openai_key = userdata.get("OPENAI_API_KEY")

# Treat an empty value like a missing one, so a blank secret fails here with a
# clear message instead of being exported as an empty API key.
if not openai_key:
    raise ValueError("OPENAI_API_KEY not found in Colab secrets. Check the name.")

# Export as an environment variable so LangChain and the OpenAI client can pick it up.
os.environ["OPENAI_API_KEY"] = openai_key

# Only report presence; never print the key itself.
print("Loaded OPENAI_API_KEY from Colab secrets:", os.environ["OPENAI_API_KEY"] is not None)

Loaded OPENAI_API_KEY from Colab secrets: True


In [16]:
import os
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings  # requires OPENAI_API_KEY

# Or keep using SentenceTransformer embeddings if you prefer local:
# from langchain_community.embeddings import HuggingFaceEmbeddings

def build_vectorstore_from_documents(
    docs: list[Document],
    collection_name: str = "youtube_rag",
    persist_directory: str | None = None,
):
    """Embed ``docs`` with OpenAI and store them in a Chroma collection.

    Args:
        docs: LangChain Documents to index.
        collection_name: Name of the Chroma collection.
        persist_directory: Directory passed to Chroma for persistence;
            ``None`` keeps the store in memory.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    return Chroma.from_documents(
        documents=docs,
        # OpenAI embedding model (requires OPENAI_API_KEY)
        embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
        collection_name=collection_name,
        persist_directory=persist_directory,
    )

In [17]:
# Chunk the ingested transcripts and index them in the "youtube_rag" Chroma
# collection, using ./chroma_youtube_rag as the persist directory.
# NOTE(review): re-running this cell against the same persist directory may add
# the same chunks again — confirm before relying on repeated runs.
documents = df_to_documents(df_videos)
vectorstore = build_vectorstore_from_documents(
    documents,
    collection_name="youtube_rag",
    persist_directory="./chroma_youtube_rag",
)