<a href="https://colab.research.google.com/github/kr5red/Project-4-Business-Case-Multimodal-AI-ChatBot-for-YouTube-Video-QA/blob/main/main_lastversion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# YouTube RAG Pipeline

Pipeline
1. **Loading Data**
2. **Chunking**
3. **Embeddings** (OpenAI-Embeddings)
4. **VectorDB** (Chroma)
5. **Retriever**
6. **LLM** (OpenAI Chat-Model)
7. **Chain** (Conversational Retrieval)
8. **Memory**


## 0. YouTube videos / transcripts

Imports

In [3]:
pip install youtube-transcript-api chromadb sentence-transformers transformers accelerate torch



In [1]:
!pip install langchain langchain-openai langchain-community langsmith
!pip install -q -U langchain langchain-openai langchain-core langchain-community
!pip install pytube

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.8/101.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl.metadata (5.0 kB)
Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytube
Successfully installed pytube-15.0.0


In [2]:
from urllib.parse import urlparse, parse_qs

from youtube_transcript_api import (
    YouTubeTranscriptApi,
    TranscriptsDisabled,
    NoTranscriptFound,
)

import pandas as pd
import chromadb
from chromadb.utils import embedding_functions

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

YouTube ingestion
- URL -> video_id
- video_id -> transcript (list)
- transcript -> plain text

In [3]:
def extract_video_id(url: str) -> str:
    """Extract the YouTube video ID from the common URL formats.

    Supported shapes:
      * ``https://youtu.be/<id>`` (query string and extra path segments are ignored)
      * ``https://www.youtube.com/watch?v=<id>`` (also ``youtube.com`` / ``m.youtube.com``)
      * ``https://www.youtube.com/{embed,shorts,live,v}/<id>``

    Args:
        url: Full video URL including the scheme.

    Returns:
        The non-empty video ID.

    Raises:
        ValueError: If no video ID can be found in ``url``.
    """
    parsed = urlparse(url)
    # `hostname` is lower-cased and has any ":port" removed, unlike `netloc`.
    host = parsed.hostname or ""
    # Non-empty path segments, e.g. "/shorts/abc/" -> ["shorts", "abc"].
    segments = [segment for segment in parsed.path.split("/") if segment]

    # Short youtu.be links: the ID is the first path segment.
    if host in ("youtu.be", "www.youtu.be"):
        if segments:
            return segments[0]

    # Regular youtube.com links
    elif host in ("www.youtube.com", "youtube.com", "m.youtube.com"):
        vid = parse_qs(parsed.query).get("v", [None])[0]
        if vid:
            return vid
        # Path-style links such as /embed/<id> or /shorts/<id>.
        if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
            return segments[1]

    raise ValueError(f"Could not extract video_id from URL: {url}")

def transcript_to_text(transcript, include_timestamps: bool = False) -> str:
    """Flatten transcript entries into a single space-separated string.

    Args:
        transcript: Iterable of mappings with a ``"text"`` key; ``"start"`` is
            read only when ``include_timestamps`` is true.
        include_timestamps: When true, each piece is prefixed with its start
            time formatted as ``[<seconds, one decimal>s]``.

    Returns:
        All pieces joined with single spaces (empty string for no entries).
    """
    if include_timestamps:
        pieces = (f"[{item['start']:.1f}s] {item['text']}" for item in transcript)
    else:
        pieces = (item["text"] for item in transcript)
    return " ".join(pieces)


def fetch_transcript_text(video_id: str, languages=None) -> str:
    """Fetch the transcript for a single video and return it as plain text.

    Args:
        video_id: YouTube video ID.
        languages: Preferred language codes. ``None`` calls ``fetch`` without a
            ``languages`` argument; a list/tuple of codes is passed through
            unchanged; a single string such as ``"en"`` or ``"en, de"`` is split
            on commas into a list of codes.

    Returns:
        The transcript snippets joined into one string, without timestamps.

    Raises:
        RuntimeError: If transcripts are disabled, no transcript matches the
            requested languages, or any other error occurs while fetching.
            The original exception is attached as ``__cause__``.
    """
    # A bare string is itself an iterable of characters, so passing it where a
    # sequence of codes is expected would request "e", "n", ... as languages.
    if isinstance(languages, str):
        languages = [code.strip() for code in languages.split(",") if code.strip()] or None

    try:
        ytt_api = YouTubeTranscriptApi()

        if languages is None:
            fetched = ytt_api.fetch(video_id)
        else:
            fetched = ytt_api.fetch(video_id, languages=languages)

        # `fetched.snippets` items expose .text/.start/.duration; convert them to
        # the list-of-dicts structure transcript_to_text() expects.
        transcript = [
            {"text": s.text, "start": s.start, "duration": s.duration}
            for s in fetched.snippets
        ]

        return transcript_to_text(transcript, include_timestamps=False)

    except TranscriptsDisabled as exc:
        raise RuntimeError(f"Transcripts are disabled for video_id={video_id}") from exc
    except NoTranscriptFound as exc:
        raise RuntimeError(
            f"No transcript found for video_id={video_id} in languages={languages}"
        ) from exc
    except Exception as exc:
        raise RuntimeError(f"Error fetching transcript for {video_id}: {exc}") from exc


Store transcripts in Chroma

In [4]:
def build_chroma_collection_from_df(df: pd.DataFrame, collection_name: str = "youtube_videos"):
    """Create a fresh Chroma collection from a DataFrame of transcripts.

    Args:
        df: DataFrame with the columns ``video_id``, ``url`` and ``transcript``.
            Rows repeating an earlier ``video_id`` are ignored (first one wins).
        collection_name: Name of the collection; an existing collection with
            this name is deleted and recreated.

    Returns:
        The Chroma collection, with one document per unique ``video_id``.
    """
    # In-memory client; for persistence use chromadb.PersistentClient instead.
    client = chromadb.Client()

    # SentenceTransformer embedding function used by the collection.
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-MiniLM-L6-v2"
    )

    # Start from a clean collection. Depending on the chromadb release,
    # list_collections() yields objects with a `.name` or plain name strings;
    # getattr(..., c) accepts both.
    existing = {getattr(c, "name", c) for c in client.list_collections()}
    if collection_name in existing:
        client.delete_collection(collection_name)

    collection = client.create_collection(
        name=collection_name,
        embedding_function=embedding_func,
    )

    # video_id is used as the record id, so it must be unique within one add().
    unique_df = df.drop_duplicates(subset="video_id")
    documents = unique_df["transcript"].tolist()
    ids = unique_df["video_id"].tolist()
    metadatas = unique_df[["video_id", "url"]].to_dict(orient="records")

    # Skip add() for an empty frame; there is nothing to index.
    if documents:
        collection.add(
            documents=documents,
            ids=ids,
            metadatas=metadatas,
        )

    print(f"Added {len(documents)} transcripts to Chroma collection '{collection_name}'.")
    return collection

In [5]:
def ingest_youtube_videos(urls, languages="en, de") -> pd.DataFrame:
    """Fetch transcripts for several YouTube URLs into a DataFrame.

    Args:
        urls: Iterable of video URLs; a single URL string is also accepted.
        languages: Preferred language codes, either as a list/tuple or as a
            comma-separated string such as ``"en, de"``.

    Returns:
        DataFrame with the columns ``video_id``, ``url`` and ``transcript``,
        one row per successfully ingested, unique video. URLs that fail are
        reported with a "Skipping ..." message and left out.
    """
    # A lone string would otherwise be iterated character by character.
    if isinstance(urls, str):
        urls = [urls]

    # Turn "en, de" into ["en", "de"] so each entry is one language code.
    if isinstance(languages, str):
        languages = [code.strip() for code in languages.split(",") if code.strip()] or None

    rows = []
    seen_ids = set()
    for url in urls:
        try:
            video_id = extract_video_id(url)
            # The same video can be referenced by several URL variants; keep it
            # once so video_id stays usable as a unique key downstream.
            if video_id in seen_ids:
                print(f"Skipping {url}: duplicate of already ingested video_id={video_id}")
                continue
            transcript = fetch_transcript_text(video_id, languages=languages)
            rows.append({
                "video_id": video_id,
                "url": url,
                "transcript": transcript,
            })
            seen_ids.add(video_id)
        except Exception as e:
            # Best effort: one bad URL must not abort the whole batch.
            print(f"Skipping {url}: {e}")

    # Explicit columns keep the schema stable even when nothing was ingested.
    return pd.DataFrame(rows, columns=["video_id", "url", "transcript"])


Ingest the videos and build the Chroma collection

In [6]:
if __name__ == "__main__":
    # Step 1: fetch English transcripts for every video listed below.
    video_urls = [
        # add YouTube URLs here:
        "https://www.youtube.com/watch?v=enD8mK9Zvwo",
        "https://www.youtube.com/watch?v=ZdjJdoEwCY4",
    ]
    df_videos = ingest_youtube_videos(video_urls, languages=["en"])

    print("\nIngested videos DataFrame:")
    print(df_videos.head())

    # Nothing to index when every URL failed.
    if df_videos.empty:
        raise SystemExit("No videos ingested successfully – check URLs or transcripts settings.")

    # Step 2: index the whole transcripts in a Chroma collection.
    collection = build_chroma_collection_from_df(
        df_videos,
        collection_name="youtube_videos",
    )
    print("\nChroma collection is ready.")


Ingested videos DataFrame:
      video_id                                          url  \
0  enD8mK9Zvwo  https://www.youtube.com/watch?v=enD8mK9Zvwo   
1  ZdjJdoEwCY4  https://www.youtube.com/watch?v=ZdjJdoEwCY4   

                                          transcript  
0  One of the most important parts of a job appli...  
1  so going on a job interview has got to be one ...  


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Added 2 transcripts to Chroma collection 'youtube_videos'.

Chroma collection is ready.


In [7]:
# Preview the first rows of the ingested transcripts (rich display as the cell's last expression).
df_videos.head(n=5)

Unnamed: 0,video_id,url,transcript
0,enD8mK9Zvwo,https://www.youtube.com/watch?v=enD8mK9Zvwo,One of the most important parts of a job appli...
1,ZdjJdoEwCY4,https://www.youtube.com/watch?v=ZdjJdoEwCY4,so going on a job interview has got to be one ...


URL ingestion for YouTube links

Add chunking and LangChain Documents on top of the DataFrame

In [8]:
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

from pytube import YouTube  # to enrich metadata from YouTube

def _fetch_video_metadata(url: str) -> dict:
    """Best-effort lookup of title/author/description via pytube; fields that could not be read stay None."""
    video_meta = {"title": None, "author": None, "description": None}
    try:
        yt = YouTube(url)
        video_meta["title"] = yt.title
        video_meta["author"] = yt.author
        video_meta["description"] = yt.description
    except Exception as exc:
        # Metadata is optional enrichment: report the failure instead of hiding
        # it, but keep going with whatever fields were already read.
        print(f"Could not fetch YouTube metadata for {url}: {exc}")
    return video_meta


def df_to_documents(
    df: pd.DataFrame,
    chunk_size: int = 1000,
    chunk_overlap: int = 150,
) -> list[Document]:
    """Split each transcript in ``df`` into chunked LangChain Documents.

    Args:
        df: DataFrame with the columns ``video_id``, ``url`` and ``transcript``.
        chunk_size: Maximum chunk size passed to RecursiveCharacterTextSplitter.
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.

    Returns:
        One Document per chunk, in row order then chunk order. Each Document's
        metadata holds ``video_id``, ``url``, ``title``, ``author``,
        ``description`` and ``chunk_index`` (0-based within its video).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    docs: list[Document] = []

    for _, row in df.iterrows():
        video_id = row["video_id"]
        url = row["url"]
        transcript = row["transcript"]

        # Looked up once per video and shared by all of its chunks.
        # NOTE(review): title/author/description are None when the lookup
        # fails; confirm the target vector store accepts None metadata values.
        video_meta = _fetch_video_metadata(url)

        for idx, chunk in enumerate(splitter.split_text(transcript)):
            docs.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "video_id": video_id,
                        "url": url,
                        **video_meta,
                        "chunk_index": idx,
                    },
                )
            )

    return docs


In [9]:
# Chunk every ingested transcript and report how many chunks came out.
documents = df_to_documents(df=df_videos)
print(f"Created {len(documents)} chunks from {len(df_videos)} videos.")

Created 9 chunks from 2 videos.


Build a LangChain VectorStore (Chroma) from Documents

In [10]:
import os
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings  # requires OPENAI_API_KEY

# Or keep using SentenceTransformer embeddings if you prefer local:
# from langchain_community.embeddings import HuggingFaceEmbeddings

def build_vectorstore_from_documents(
    docs: list[Document],
    collection_name: str = "youtube_rag",
    persist_directory: str | None = None,
):
    """
    Build a Chroma vector store from LangChain Documents.
    Uses OpenAI embeddings (``text-embedding-3-small``).

    Args:
        docs: Documents to embed and index; must not be empty.
        collection_name: Name of the Chroma collection.
        persist_directory: Directory to persist the store in, or ``None`` for
            an in-memory store.

    Returns:
        The populated Chroma vector store.

    Raises:
        ValueError: If ``docs`` is empty.
    """
    import hashlib

    if not docs:
        raise ValueError("Cannot build a vector store from an empty list of documents.")

    # OpenAI embedding model (reads OPENAI_API_KEY from the environment).
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    # Deterministic ids: without them every run gets fresh random ids, so
    # re-running against the same persist_directory stores each chunk again.
    ids: list[str] = []
    occurrences: dict[str, int] = {}
    for doc in docs:
        meta = doc.metadata or {}
        video_id = meta.get("video_id")
        chunk_index = meta.get("chunk_index")
        if video_id is not None and chunk_index is not None:
            base_id = f"{video_id}:{chunk_index}"
        else:
            # No positional metadata available: fall back to a content hash.
            base_id = hashlib.sha256(doc.page_content.encode("utf-8")).hexdigest()
        # Ids must be unique within one batch; suffix any repeats.
        count = occurrences.get(base_id, 0)
        occurrences[base_id] = count + 1
        ids.append(base_id if count == 0 else f"{base_id}#{count}")

    vectorstore = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        ids=ids,
        collection_name=collection_name,
        persist_directory=persist_directory,  # can be None for in-memory
    )
    return vectorstore

In [12]:
import os

# Prefer the Colab secret store; fall back to an already-exported environment
# variable so the notebook also runs outside Colab.
try:
    from google.colab import userdata  # this is what reads Colab secrets
except ImportError:
    userdata = None

openai_key = None
key_source = "Colab secrets"

if userdata is not None:
    try:
        openai_key = userdata.get("OPENAI_API_KEY")
    except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
        # Secret missing or notebook access not granted: try the fallback below.
        openai_key = None

if not openai_key:
    openai_key = os.environ.get("OPENAI_API_KEY")
    key_source = "the environment"

# `not` also rejects an empty string, which would only fail later at request time.
if not openai_key:
    raise ValueError(
        "OPENAI_API_KEY not found in Colab secrets or the environment. Check the name."
    )

# Set as environment variable so LangChain & others can use it
os.environ["OPENAI_API_KEY"] = openai_key

# Print only a boolean, never the key itself.
print(f"Loaded OPENAI_API_KEY from {key_source}:", bool(os.environ.get("OPENAI_API_KEY")))

Loaded OPENAI_API_KEY from Colab secrets: True


In [13]:
# Chunk the transcripts again and index them into a Chroma store persisted on disk.
documents = df_to_documents(df_videos)
vectorstore = build_vectorstore_from_documents(
    docs=documents,
    collection_name="youtube_rag",
    persist_directory="./chroma_youtube_rag",
)