# YouTube RAG Pipeline

0. Installation & Imports
1. Ingest YouTube Videos → DataFrame
2. Convert Transcripts → LangChain Documents
3. Build Vector Store (Chroma + OpenAI Embeddings)
4. (Next Steps – Implemented in Later Cells)

## 0. Installation & Imports

In [None]:
!pip install youtube-transcript-api chromadb pytube
!pip install -q -U langchain langchain-openai langchain-core langchain-community langsmith
!pip install -q openai



In [None]:
import os
import glob
import pandas as pd
from urllib.parse import urlparse, parse_qs

# Colab secrets
from google.colab import userdata

# LangChain imports (modern API)
from langchain_openai import ChatOpenAI
from langchain_core.tools import Tool, tool
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage, SystemMessage
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import BaseMessage
from langsmith import Client
from langsmith.evaluation import evaluate
from langchain_core.runnables import RunnableLambda

# Text splitting + Documents
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# VectorDB
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# YouTube transcript
from youtube_transcript_api import (
    YouTubeTranscriptApi,
    TranscriptsDisabled,
    NoTranscriptFound,
)

# Metadata enrichment
from pytube import YouTube

#Speech recognition
from google.colab import files
from openai import OpenAI
import uuid

#Tools
from collections import defaultdict
from typing import Dict, List

#deployment
import gradio as gr

#evaluation
import numpy as np
from rouge_score import rouge_scorer
import sacrebleu

In [None]:
from openai import OpenAI

# Read both secrets from the Colab secret store first, then validate them.
openai_key, langchain_key = (
    userdata.get(secret_name)
    for secret_name in ("OPENAI_API_KEY", "LANGCHAIN_API_KEY")
)

for secret_name, secret_value in (
    ("OPENAI_API_KEY", openai_key),
    ("LANGCHAIN_API_KEY", langchain_key),
):
    if secret_value is None:
        raise ValueError(f"{secret_name} not found in Colab secrets.")

# Export the keys and the LangSmith tracing configuration via the environment.
os.environ.update(
    {
        "OPENAI_API_KEY": openai_key,
        "LANGCHAIN_API_KEY": langchain_key,
        "LANGCHAIN_TRACING_V2": "true",    # required for LangSmith
        "LANGCHAIN_PROJECT": "youtube-qa-bot",
    }
)

# Shared OpenAI client, used further down for transcription and text-to-speech.
openai_client = OpenAI()

print("Keys loaded")
print("LangSmith enabled — project:", os.environ["LANGCHAIN_PROJECT"])

Keys loaded
LangSmith enabled — project: youtube-qa-bot


In [None]:
# Re-creates the shared OpenAI client; the key-loading cell above already
# builds one under the same name, so this only matters if that cell was skipped.
openai_client = OpenAI()
print("OpenAI client initialized!")

OpenAI client initialized!


YouTube ingestion
- URL -> video_id
- video_id -> transcript (list)
- transcript -> plain text

In [None]:
#Extract the YouTube video ID from URL formats
def extract_video_id(url: str) -> str:
    """Return the video ID contained in a YouTube URL.

    Supported URL shapes:
      * ``https://youtu.be/<id>``
      * ``https://www.youtube.com/watch?v=<id>`` (also the ``m.`` and ``music.`` hosts)
      * ``https://www.youtube.com/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``, ``/v/<id>``

    Args:
        url: The video URL; surrounding whitespace is ignored.

    Returns:
        The video ID as a non-empty string.

    Raises:
        ValueError: If no video ID can be found in ``url``.
    """
    parsed = urlparse(url.strip())
    # `hostname` is lower-cased and has any port removed, unlike `netloc`.
    host = (parsed.hostname or "").lower()
    # Non-empty path segments; this also discards leading/trailing slashes.
    segments = [segment for segment in parsed.path.split("/") if segment]

    # Short youtu.be links: the ID is the first path segment.
    if host in ("youtu.be", "www.youtu.be"):
        if segments:
            return segments[0]

    # Regular youtube.com links
    elif host in ("www.youtube.com", "youtube.com", "m.youtube.com", "music.youtube.com"):
        vid = parse_qs(parsed.query).get("v", [None])[0]
        if vid:
            return vid
        # Path-style links such as /shorts/<id> or /embed/<id>.
        if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
            return segments[1]

    raise ValueError(f"Could not extract video_id from URL: {url}")

#Convert a transcript (list of {text, start, duration}) to a single text string
def transcript_to_text(transcript, include_timestamps: bool = False) -> str:
    """Join the transcript entries into one space-separated string.

    Each entry is read through its "text" key. When ``include_timestamps`` is
    true, every piece is prefixed with "[<start>s]", where <start> is the
    entry's "start" value formatted with one decimal place.
    """
    if include_timestamps:
        pieces = (f"[{item['start']:.1f}s] {item['text']}" for item in transcript)
    else:
        pieces = (item["text"] for item in transcript)
    return " ".join(pieces)


#Fetch transcript for a single video_id and turn it into plain text.
def fetch_transcript_text(video_id: str, languages=None) -> str:
    """Fetch the transcript of one video and return it as plain text.

    Args:
        video_id: The YouTube video ID.
        languages: Optional value forwarded as ``languages=`` to
            ``YouTubeTranscriptApi.fetch``; when ``None`` the call is made
            without a language argument.

    Returns:
        The snippet texts joined by single spaces (no timestamps).

    Raises:
        RuntimeError: If transcripts are disabled, none is found, or any other
            error occurs while fetching. The original exception is attached as
            ``__cause__`` so the root cause stays visible in tracebacks.
    """
    try:
        ytt_api = YouTubeTranscriptApi()

        # If you don't care about language, you can call ytt_api.fetch(video_id) without languages
        if languages is None:
            fetched = ytt_api.fetch(video_id)
        else:
            fetched = ytt_api.fetch(video_id, languages=languages)

        # `fetched` is a FetchedTranscript object with `.snippets`
        # Convert to the same structure transcript_to_text() expects
        transcript = [
            {"text": s.text, "start": s.start, "duration": s.duration}
            for s in fetched.snippets
        ]

        return transcript_to_text(transcript, include_timestamps=False)

    except TranscriptsDisabled as e:
        raise RuntimeError(f"Transcripts are disabled for video_id={video_id}") from e
    except NoTranscriptFound as e:
        raise RuntimeError(
            f"No transcript found for video_id={video_id} in languages={languages}"
        ) from e
    except Exception as e:
        # Catch-all boundary: callers only need to handle RuntimeError.
        raise RuntimeError(f"Error fetching transcript for {video_id}: {e}") from e


Ingest YouTube videos into a DataFrame

In [None]:
def ingest_youtube_videos(urls, languages=None, save_dir="transcripts"):
    """
    Fetches YouTube transcripts and automatically saves each one as a .txt file.
    If a transcript file already exists locally, it will be loaded instead of calling YouTube again.

    Args:
        urls: Iterable of YouTube video URLs.
        languages: Passed through to fetch_transcript_text().
        save_dir: Directory used as the transcript cache; created if missing.

    Returns:
        A DataFrame with the columns "video_id", "url" and "transcript", one
        row per URL. A video whose transcript could not be fetched still gets
        a row, with an empty transcript. The columns are present even when
        `urls` is empty, so downstream code can rely on the schema.

    Raises:
        ValueError: Propagated from extract_video_id() for an unrecognised URL.
    """
    os.makedirs(save_dir, exist_ok=True)
    rows = []
    for url in urls:
        video_id = extract_video_id(url)
        txt_path = os.path.join(save_dir, f"{video_id}.txt")
        # Load local transcript if available
        if os.path.exists(txt_path):
            print(f":starkes_häkchen: Loading saved transcript for {video_id}")
            with open(txt_path, "r", encoding="utf-8") as f:
                transcript = f.read()
        else:
            # Fetch from YouTube
            try:
                print(f"↻ Fetching transcript from YouTube for {video_id}...")
                transcript = fetch_transcript_text(video_id, languages=languages)
                # Save to .txt
                with open(txt_path, "w", encoding="utf-8") as f:
                    f.write(transcript)
                print(f":diskette: Transcript saved at {txt_path}")
            except Exception as e:
                # Best effort: report the failure and keep going with the next URL.
                print(f":warnung: Skipping {url}: {e}")
                transcript = ""
        rows.append({
            "video_id": video_id,
            "url": url,
            "transcript": transcript,
        })
    # Explicit columns keep the schema stable for an empty `rows` list.
    return pd.DataFrame(rows, columns=["video_id", "url", "transcript"])

In [None]:
def load_local_transcripts(urls, save_dir="transcripts"):
    """
    Build the videos DataFrame purely from transcripts already saved on disk.

    Args:
        urls: Iterable of YouTube video URLs.
        save_dir: Directory holding one "<video_id>.txt" file per video.

    Returns:
        A DataFrame with the columns "video_id", "url" and "transcript", one
        row per URL. A video without a local file gets an empty transcript.
        The columns are present even when `urls` is empty.

    Raises:
        ValueError: Propagated from extract_video_id() for an unrecognised URL.
    """
    rows = []
    for url in urls:
        video_id = extract_video_id(url)
        txt_path = os.path.join(save_dir, f"{video_id}.txt")
        if not os.path.exists(txt_path):
            print(f":warnung: No local transcript found for {video_id}")
            transcript = ""
        else:
            print(f":starkes_häkchen: Loading local transcript for {video_id}")
            with open(txt_path, "r", encoding="utf-8") as f:
                transcript = f.read()
        rows.append({
            "video_id": video_id,
            "url": url,
            "transcript": transcript,
        })
    # Explicit columns keep the schema stable for an empty `rows` list.
    return pd.DataFrame(rows, columns=["video_id", "url", "transcript"])

In [None]:
#Ingest multiple videos ----
video_urls = [
    "https://www.youtube.com/watch?v=a43Je1KQY3s", #
    "https://www.youtube.com/watch?v=ck5nw7R1uEs", # FAANG Interview
    "https://www.youtube.com/watch?v=jXXOI01IuPs", # What is happening in big tech interviews 2025
    "https://www.youtube.com/watch?v=YUL8ayPe1r8", # Outdated resume experience
    "https://www.youtube.com/watch?v=pjqi_M3SPwY", # Resume mistakes to avoid
    "https://www.youtube.com/watch?v=mmQcX6HpCGs", # Interview
    "https://www.youtube.com/watch?v=dG3TdJn7JP4", # Interview
    "https://www.youtube.com/watch?v=WdyiUe7_3cA", # Interview
    "https://www.youtube.com/watch?v=m-pjMa43tho", # Interview
    "https://www.youtube.com/watch?v=8OOvvJ0hd3M"
]

# Fetch (or load cached) transcripts, trying English first and then German.
df_videos = ingest_youtube_videos(
    video_urls,
    languages=["en", "de"],
    save_dir="transcripts",
)

# Show a preview, or say so when nothing came back.
print("No videos ingested ..." if df_videos.empty else df_videos.head())

↻ Fetching transcript from YouTube for a43Je1KQY3s...
:warnung: Skipping https://www.youtube.com/watch?v=a43Je1KQY3s: Error fetching transcript for a43Je1KQY3s: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=a43Je1KQY3s! This is most likely caused by:

YouTube is blocking requests from your IP. This usually is due to one of the following reasons:
- You have done too many requests and your IP has been blocked by YouTube
- You are doing requests from an IP belonging to a cloud provider (like AWS, Google Cloud Platform, Azure, etc.). Unfortunately, most IPs from cloud providers are blocked by YouTube.

There are two things you can do to work around this:
1. Use proxies to hide your IP address, as explained in the "Working around IP bans" section of the README (https://github.com/jdepoix/youtube-transcript-api?tab=readme-ov-file#working-around-ip-bans-requestblocked-or-ipblocked-exception).
2. (NOT RECOMMENDED) If you authenticate your requests using cookies

Loading saved transcripts from .txt files

In [None]:
from google.colab import files

# Open Colab's file-upload dialog; all transcript .txt files can be picked at once.
uploaded = files.upload()

# Each uploaded payload is bytes: decode it as UTF-8 and key it by its filename.
texts = {
    uploaded_name: payload.decode('utf-8')
    for uploaded_name, payload in uploaded.items()
}

# Report which files were loaded.
print("Loaded files:")
for loaded_name in texts:
    print(" -", loaded_name)


Saving pjqi_M3SPwY.txt to pjqi_M3SPwY (1).txt
Saving mmQcX6HpCGs.txt to mmQcX6HpCGs (1).txt
Saving m-pjMa43tho.txt to m-pjMa43tho (1).txt
Saving jXXOI01IuPs.txt to jXXOI01IuPs (1).txt
Saving dG3TdJn7JP4.txt to dG3TdJn7JP4 (1).txt
Saving ck5nw7R1uEs.txt to ck5nw7R1uEs (1).txt
Saving a43Je1KQY3s.txt to a43Je1KQY3s (1).txt
Saving YUL8ayPe1r8.txt to YUL8ayPe1r8 (1).txt
Saving WdyiUe7_3cA.txt to WdyiUe7_3cA (1).txt
Saving 8OOvvJ0hd3M.txt to 8OOvvJ0hd3M (1).txt
Loaded files:
 - pjqi_M3SPwY (1).txt
 - mmQcX6HpCGs (1).txt
 - m-pjMa43tho (1).txt
 - jXXOI01IuPs (1).txt
 - dG3TdJn7JP4 (1).txt
 - ck5nw7R1uEs (1).txt
 - a43Je1KQY3s (1).txt
 - YUL8ayPe1r8 (1).txt
 - WdyiUe7_3cA (1).txt
 - 8OOvvJ0hd3M (1).txt


In [None]:
import re

# Map video_id -> transcript text. The id is the uploaded filename without its
# ".txt" extension and without a trailing copy suffix such as " (1)".
id_to_text = {}

for filename, content in texts.items():
    stem = filename[:-4] if filename.endswith(".txt") else filename
    id_to_text[re.sub(r" \(\d+\)$", "", stem)] = content

print("Cleaned video_ids from filenames:")
for cleaned_id in id_to_text:
    print(" -", cleaned_id)


Cleaned video_ids from filenames:
 - pjqi_M3SPwY
 - mmQcX6HpCGs
 - m-pjMa43tho
 - jXXOI01IuPs
 - dG3TdJn7JP4
 - ck5nw7R1uEs
 - a43Je1KQY3s
 - YUL8ayPe1r8
 - WdyiUe7_3cA
 - 8OOvvJ0hd3M


In [None]:
# Assemble one row per configured URL, pairing it with its uploaded transcript.
rows = []

for url in video_urls:
    video_id = extract_video_id(url)
    # Videos without an uploaded file fall back to an empty transcript.
    transcript = id_to_text.get(video_id, "")
    if not transcript:
        print(f"⚠️ Warning: no transcript found for video_id={video_id}")
    rows.append(dict(video_id=video_id, url=url, transcript=transcript))

df_videos = pd.DataFrame(rows)
df_videos.head()


Unnamed: 0,video_id,url,transcript
0,a43Je1KQY3s,https://www.youtube.com/watch?v=a43Je1KQY3s,hey does your resume feel outdated or invisibl...
1,ck5nw7R1uEs,https://www.youtube.com/watch?v=ck5nw7R1uEs,3 million that is the number of rums Google re...
2,jXXOI01IuPs,https://www.youtube.com/watch?v=jXXOI01IuPs,Something strange is happening with tech inter...
3,YUL8ayPe1r8,https://www.youtube.com/watch?v=YUL8ayPe1r8,Your resume isn't working because you're follo...
4,pjqi_M3SPwY,https://www.youtube.com/watch?v=pjqi_M3SPwY,all right so when I was applying to my first f...


Add Chunking + LangChain Documents on Top of Your DataFrame

In [None]:
#Convert each row in df_videos (video_id, url, transcript) into multiple LangChain Documents with metadata.
def df_to_documents(
    df: pd.DataFrame,
    chunk_size: int = 1000,
    chunk_overlap: int = 150,
) -> list[Document]:
    """Split every transcript in `df` into chunked LangChain Documents.

    Args:
        df: DataFrame with the columns "video_id", "url" and "transcript".
        chunk_size: Forwarded to RecursiveCharacterTextSplitter.
        chunk_overlap: Forwarded to RecursiveCharacterTextSplitter.

    Returns:
        One Document per chunk. Its metadata holds "video_id", "url", "title",
        "author", "description" and "chunk_index"; title/author/description
        are None when the pytube metadata lookup fails.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    docs: list[Document] = []

    for _, row in df.iterrows():
        video_id = row["video_id"]
        url = row["url"]
        transcript = row["transcript"]

        # A missing or blank transcript produces no chunks, so skip the row
        # before paying for the metadata lookup below.
        if not isinstance(transcript, str) or not transcript.strip():
            continue

        # Try to fetch some metadata from YouTube (best effort).
        title = author = description = None
        try:
            yt = YouTube(url)
            title = yt.title
            author = yt.author
            description = yt.description
        except Exception as exc:
            # Keep going without metadata, but make the failure visible
            # instead of swallowing it silently.
            print(f"Could not fetch metadata for {video_id}: {exc}")

        # Split transcript into chunks
        chunks = splitter.split_text(transcript)

        docs.extend(
            Document(
                page_content=chunk,
                metadata={
                    "video_id": video_id,
                    "url": url,
                    "title": title,
                    "author": author,
                    "description": description,
                    "chunk_index": idx,
                },
            )
            for idx, chunk in enumerate(chunks)
        )

    return docs


In [None]:
# Chunk every transcript with the default chunk size/overlap and report the totals.
documents = df_to_documents(df_videos)
print(f"Created {len(documents)} chunks from {len(df_videos)} videos.")

Created 206 chunks from 10 videos.


Build a LangChain VectorStore (Chroma) from Documents

In [None]:
# from langchain_community.embeddings import HuggingFaceEmbeddings

def build_vectorstore_from_documents(
    docs: list[Document],
    collection_name: str = "youtube_rag",
    persist_directory: str | None = None,
):
    """
    Build a Chroma vector store from LangChain Documents.
    Uses OpenAI embeddings by default.

    Args:
        docs: Documents to embed and index.
        collection_name: Name of the Chroma collection.
        persist_directory: Directory to persist the collection in, or None
            for an in-memory store.

    Returns:
        The Chroma vector store.

    When every document carries "video_id" and "chunk_index" metadata, the
    ids "<video_id>:<chunk_index>" are used. Re-running against the same
    persisted collection then overwrites the existing chunks instead of adding
    duplicates under fresh random ids. If the metadata is missing or the ids
    would collide, Chroma generates ids as before.
    """
    # OpenAI embedding model
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    ids = None
    candidate_ids = []
    for doc in docs:
        meta = doc.metadata or {}
        video_id = meta.get("video_id")
        chunk_index = meta.get("chunk_index")
        if video_id is None or chunk_index is None:
            break
        candidate_ids.append(f"{video_id}:{chunk_index}")
    # Only use the derived ids when they cover every document and are unique.
    if docs and len(candidate_ids) == len(docs) and len(set(candidate_ids)) == len(docs):
        ids = candidate_ids

    vectorstore = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        ids=ids,
        collection_name=collection_name,
        persist_directory=persist_directory,  # can be None for in-memory
    )
    return vectorstore

In [None]:
# Embed all chunks and persist the collection under ./chroma_youtube_rag.
vectorstore = build_vectorstore_from_documents(
    documents,
    collection_name="youtube_rag",
    persist_directory="./chroma_youtube_rag",
)

LLM + Retriever + Memory

In [None]:
# Chat model that writes the answers.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.2)

# Retriever over the Chroma store; returns the 5 best-matching chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# Message log that youtube_rag_query() appends each question/answer pair to.
memory = ChatMessageHistory()

# Prompt: fixed system instructions plus a human turn carrying context and question.
_RAG_SYSTEM_TEXT = (
    "You are a helpful assistant answering questions about the content "
    "of YouTube videos indexed in a vector database. "
    "Use the retrieved context to answer accurately."
)
_RAG_HUMAN_TEXT = "Context from videos:\n{context}\n\nQuestion: {question}"

rag_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _RAG_SYSTEM_TEXT),
        ("human", _RAG_HUMAN_TEXT),
    ]
)

Build the RAG pipeline manually

In [None]:
def youtube_rag_query(question: str):
    """
    Full RAG pipeline:
    1. Retrieve relevant video chunks
    2. Add them into the prompt
    3. Call the LLM
    4. Store chat history (memory)

    Args:
        question: The user question.

    Returns:
        A tuple (answer_text, context), where `context` is the formatted text
        of the retrieved chunks that was placed into the prompt.
    """

    def _meta_value(meta: dict, key: str, fallback):
        # The metadata keys can be present with a None value, in which case
        # dict.get(key, fallback) would return None rather than the fallback.
        # Compare against None explicitly so chunk_index 0 is not replaced.
        value = meta.get(key)
        return fallback if value is None else value

    # ---- Retrieval ----
    docs = retriever.invoke(question)

    if not docs:
        context = "No relevant content found."
    else:
        context_parts = []
        for i, d in enumerate(docs):
            meta = d.metadata or {}
            context_parts.append(
                f"[{i+1}] Title: {_meta_value(meta, 'title', 'Unknown')}\n"
                f"Channel: {_meta_value(meta, 'author', 'Unknown')}\n"
                f"Description: {_meta_value(meta, 'description', 'No description')}\n"
                f"Chunk {_meta_value(meta, 'chunk_index', '?')}:\n{d.page_content}"
            )
        context = "\n\n---\n".join(context_parts)

    # ---- Build prompt ----
    prompt_msg = rag_prompt.format_messages(
        context=context,
        question=question,
    )

    # ---- LLM call ----
    response = llm.invoke(prompt_msg)

    # ---- Memory update ----
    memory.add_message(HumanMessage(content=question))
    memory.add_message(response)

    return response.content, context


In [None]:
# 🔧 Tools + Agent with Personality

# 1) Personality configuration
# Tone instructions that build_system_prompt() appends to the base system
# prompt; an empty value disables the personality section.
PERSONALITY = (
    "Friendly, encouraging, and concise. "
    "Explain things clearly and avoid jargon. "
    "If the user sounds stressed or insecure, be extra supportive."
)

# 2) Tools
# The docstrings below are the tool descriptions exposed to the model.

@tool
def youtube_rag_qa(question: str) -> str:
    """Answer questions about the YouTube videos that have been ingested into the vector store."""
    # youtube_rag_query() returns (answer, context); only the answer is handed back.
    return youtube_rag_query(question)[0]

@tool
def transcribe_audio_file(audio_path: str) -> str:
    """Transcribe an audio file at a local path to text."""
    transcribed_text = transcribe_audio_to_text(audio_path)
    return transcribed_text

# Registry of tools, plus a by-name lookup used when executing tool calls.
tools = [youtube_rag_qa, transcribe_audio_file]
tool_map = dict((registered.name, registered) for registered in tools)

# LLM that can call tools
tool_llm = llm.bind_tools(tools)

# 3) Conversation memory (per session)
# Maps a session id to its list of past messages; a new session starts empty.
session_history: Dict[str, List[BaseMessage]] = defaultdict(list)

# Base instructions for the agent: when to use each of the two tools.
BASE_SYSTEM_PROMPT = (
    "You are an assistant that answers questions about a set of YouTube videos "
    "that have already been ingested into a vector database.\n"
    "If the user asks about the content of the videos, you should use the "
    "`youtube_rag_qa` tool.\n"
    "If the user gives or refers to an audio file path, you can use the "
    "`transcribe_audio_file` tool to turn it into text and then use "
    "`youtube_rag_qa` with the transcribed question.\n"
    "Always give concise, helpful answers."
)

def build_system_prompt() -> str:
    """Return the base instructions, followed by the personality section when one is set."""
    personality_suffix = (
        f"\n\nPersonality & tone: {PERSONALITY}" if PERSONALITY else ""
    )
    return BASE_SYSTEM_PROMPT + personality_suffix

def agent_chat(user_input: str, session_id: str = "default") -> str:
    """
    Simple agent:
    - Uses OpenAI tool-calling to decide whether to call youtube_rag_qa / transcribe_audio_file
    - Keeps per-session memory of previous turns
    - Applies the configured personality on top of the base system prompt

    The model may call tools over several rounds (for example transcribe first,
    then query the videos with the transcribed text), up to a fixed limit. A
    tool that raises is reported back to the model as an error message instead
    of aborting the whole turn.

    Args:
        user_input: The new user message.
        session_id: Key selecting which conversation history to use.

    Returns:
        The text of the assistant's final answer.
    """
    max_tool_rounds = 4  # upper bound on tool-calling rounds per user turn

    history = session_history[session_id]
    user_msg = HumanMessage(content=user_input)

    # 1) Build the message list: system + history + new user message
    messages: List[BaseMessage] = [
        SystemMessage(content=build_system_prompt()),
        *history,
        user_msg,
    ]

    # 2) First model call: the model may decide to call tools
    ai_msg = tool_llm.invoke(messages)

    # 3) Run tools for as long as the model keeps requesting them
    for _ in range(max_tool_rounds):
        tool_calls = getattr(ai_msg, "tool_calls", None)
        if not tool_calls:
            break

        # The assistant message holding the tool calls must precede their results.
        messages.append(ai_msg)
        for call in tool_calls:
            tool_name = call["name"]
            selected_tool = tool_map.get(tool_name)
            if selected_tool is None:
                tool_result = f"Error: tool '{tool_name}' not found."
            else:
                try:
                    # invoke() expects a dict of arguments for the tool
                    tool_result = selected_tool.invoke(call["args"])
                except Exception as exc:
                    tool_result = f"Error while running tool '{tool_name}': {exc}"
            messages.append(
                ToolMessage(
                    content=str(tool_result),
                    tool_call_id=call["id"],
                )
            )

        # Let the model read the results; it may answer or request another tool.
        ai_msg = tool_llm.invoke(messages)
    else:
        # Round budget used up. If the model still asks for tools, get a plain
        # answer from the model without tools bound; its pending tool-call
        # message is deliberately not appended to `messages`.
        if getattr(ai_msg, "tool_calls", None):
            ai_msg = llm.invoke(messages)

    # 4) Update history with just the human input and final answer
    history.extend([user_msg, ai_msg])

    return ai_msg.content


Test the RAG pipeline

In [None]:
# Smoke test: ask one broad question and show both the answer and the
# retrieved context that was put into the prompt.
answer, used_context = youtube_rag_query("Give me an overview of the videos you indexed.")

print("=== ANSWER ===\n")
print(answer)

print("\n=== CONTEXT USED ===\n")
print(used_context)

=== ANSWER ===

The indexed videos appear to focus on job interview preparation and resume optimization. 

1. **Interview Questions**: One video emphasizes the importance of having questions prepared to ask at the end of an interview. It suggests that candidates inquire about the company, the role, and the team. The creator has also produced additional content on this topic and offers a free interview preparation checklist in the description.

2. **Engagement and Feedback**: Another video encourages viewers to engage with the content by commenting, liking, and subscribing. The creator expresses appreciation for feedback, which helps tailor future video topics to the audience's interests.

3. **Resume Optimization**: A third video discusses transforming a resume into a "highlight reel" rather than just a list of jobs. It advises viewers to focus on relevant skills and experiences that align with the job they are applying for, emphasizing the importance of using keywords and showcasing a

Chat loop



In [None]:
def chat_with_youtube_bot(session_id: str = "default"):
    """
    Simple text-based chat loop in Colab, powered by the tool-calling agent + memory.

    Reads questions from standard input until the user types 'exit' or 'quit'.
    The loop also ends cleanly on end-of-input or an interrupt, and an error
    while answering one question is reported without ending the chat.

    Args:
        session_id: Conversation id passed to agent_chat().
    """
    print("YouTube QA ChatBot (agent powered)")
    print("Ask me anything about the videos I have indexed.")
    print("Type 'exit' or 'quit' to end the chat.\n")

    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted cell: leave without a traceback.
            print("\nBot: Bye!")
            break

        if user_input.lower() in {"exit", "quit"}:
            print("Bot: Bye!")
            break

        if not user_input:
            continue

        # ✅ Use the agent (tools + memory), not the bare RAG function
        try:
            answer = agent_chat(user_input, session_id=session_id)
        except Exception as exc:
            # Top-level boundary of the interactive loop: report and continue.
            print(f"Bot: Sorry, something went wrong: {exc}\n")
            continue
        print(f"Bot: {answer}\n")


# Run this to start chatting:
chat_with_youtube_bot()


YouTube QA ChatBot (agent powered)
Ask me anything about the videos I have indexed.
Type 'exit' or 'quit' to end the chat.

You: exit
Bot: Bye!


Speech recognition

In [None]:
def transcribe_audio_to_text(audio_path: str, model: str = "gpt-4o-mini-transcribe") -> str:
    """
    Send the audio file at `audio_path` to OpenAI's transcription endpoint and
    return the resulting text. `model` selects the transcription model.
    """
    with open(audio_path, "rb") as audio_handle:
        result = openai_client.audio.transcriptions.create(
            file=audio_handle,
            model=model,
            response_format="text",
        )
    # Cast defensively: the client may hand back either a string or an object.
    return str(result)


def tts_from_text(text: str, output_dir: str = "tts_outputs") -> str:
    """
    Synthesize `text` with OpenAI TTS, write it as a uniquely named .mp3 file
    inside `output_dir` (created if missing), and return that file's path.
    """
    os.makedirs(output_dir, exist_ok=True)
    # A random hex name keeps successive answers from overwriting each other.
    out_path = os.path.join(output_dir, f"answer_{uuid.uuid4().hex}.mp3")

    speech = openai_client.audio.speech.create(
        input=text,
        voice="alloy",
        model="gpt-4o-mini-tts",  # or another TTS-capable model
    )

    with open(out_path, "wb") as audio_out:
        audio_out.write(speech.read())

    return out_path


In [None]:
def answer_from_audio_file(audio_path: str, session_id: str = "voice-session"):
    """
    Transcribe the audio file at `audio_path`, pass the text to the agent under
    `session_id`, and return the pair (transcript, answer).
    """
    question_text = transcribe_audio_to_text(audio_path)
    return question_text, agent_chat(question_text, session_id=session_id)


Change for Gradio later

In [None]:
def gradio_audio_qa(audio_path, tts_enabled: bool):
    """
    Gradio callback for a spoken question.

    Returns (transcript, answer, answer_audio_path). The audio path is None
    unless `tts_enabled` is true and speech synthesis succeeds. With no audio,
    a placeholder message is returned instead.
    """
    if audio_path is None:
        return "", "No audio received.", None

    # answer_from_audio_file() transcribes the audio and runs it through the agent.
    transcript, answer = answer_from_audio_file(audio_path)

    if not tts_enabled:
        return transcript, answer, None

    try:
        return transcript, answer, tts_from_text(answer)
    except Exception as err:
        # TTS is optional: report the problem and still return the text answer.
        print("TTS error:", err)
        return transcript, answer, None


# Voice-only Gradio UI: record a question, then show the transcript, the text
# answer and (optionally) a spoken answer.
with gr.Blocks() as demo:
    gr.Markdown("# 🎬 YouTube QA Bot with Voice Input")

    # Input row: microphone recorder next to the text-to-speech toggle.
    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone"],   # 👈 mic only
            type="filepath",          # pass a file path to backend
            label="Hold to record your question",
        )
        tts_toggle = gr.Checkbox(
            label="Read answer aloud",
            value=False,
        )

    # Output widgets, filled from the three values gradio_audio_qa returns.
    transcript_output = gr.Textbox(
        label="Transcribed question",
        lines=2,
    )
    answer_output = gr.Textbox(
        label="Bot answer",
        lines=4,
    )
    answer_audio_output = gr.Audio(
        label="Spoken answer",
        type="filepath",
    )

    # Run the backend whenever the audio component's value changes.
    audio_input.change(
        fn=gradio_audio_qa,
        inputs=[audio_input, tts_toggle],
        outputs=[transcript_output, answer_output, answer_audio_output],
    )

demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://6556af2ff22a15e023.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# ------------ Gradio backend wrappers ------------

def gr_text_chat(message, history):
    """
    Backend for the text chat tab.

    message: latest user message (string)
    history: list of (user, bot) tuples for the Chatbot component; None is
             treated as an empty conversation

    Returns (updated_history, "") where the empty string clears the input box.
    An empty or whitespace-only message leaves the history unchanged and does
    not call the agent.
    """
    # Work on a copy so the list passed in by the caller is never mutated.
    history = list(history) if history else []

    if not message or not message.strip():
        return history, ""  # nothing to do

    # Use your agent with tools + memory
    answer = agent_chat(message, session_id="gradio-text-session")

    # Append to visible chat history and clear the input box
    return history + [(message, answer)], ""


def gradio_audio_qa(audio_path, tts_enabled: bool):
    """
    Gradio callback for the audio tab.

    Returns (transcript, answer, answer_audio_path). The audio path is None
    unless `tts_enabled` is true and speech synthesis succeeds. With no audio,
    a placeholder message is returned instead.
    """
    if audio_path is None:
        return "", "No audio received.", None

    # Transcribe and answer within the audio tab's own conversation session.
    transcript, answer = answer_from_audio_file(
        audio_path, session_id="gradio-audio-session"
    )

    if not tts_enabled:
        return transcript, answer, None

    try:
        return transcript, answer, tts_from_text(answer)
    except Exception as err:
        # TTS is optional: report the problem and still return the text answer.
        print("TTS error:", err)
        return transcript, answer, None


# ------------ Gradio UI (text + audio tabs) ------------

# Two-tab Gradio app: a text chat tab and a voice question tab. Both tabs call
# the agent, each under its own fixed session id (set in the backend wrappers).
with gr.Blocks() as demo:
    gr.Markdown("# 🎥 YouTube Job Coach Bot\nAsk me anything about the indexed career videos.")

    # ===== TAB 1: TEXT CHAT =====
    with gr.Tab("💬 Text Chat"):
        # NOTE(review): the cell output shows a warning pointing at this
        # gr.Chatbot(...) call — presumably about the tuple message format
        # that gr_text_chat produces; confirm against the installed Gradio version.
        chatbot = gr.Chatbot(
            label="Conversation",
            height=400,
        )
        msg = gr.Textbox(
            label="Your question",
            placeholder="Ask me about CVs, interviews, job search, etc.",
        )
        send_btn = gr.Button("Send")

        # User presses Enter in the textbox
        msg.submit(
            fn=gr_text_chat,
            inputs=[msg, chatbot],
            outputs=[chatbot, msg],
        )

        # User clicks the button
        send_btn.click(
            fn=gr_text_chat,
            inputs=[msg, chatbot],
            outputs=[chatbot, msg],
        )

    # ===== TAB 2: AUDIO QUESTION =====
    with gr.Tab("🎙️ Audio Question"):
        gr.Markdown(
            "Record or upload a spoken question. "
            "The bot will transcribe it, answer using the YouTube RAG pipeline, "
            "and optionally speak the answer."
        )

        # Input row: microphone recorder next to the text-to-speech toggle.
        with gr.Row():
            audio_input = gr.Audio(
                sources=["microphone"],   # mic only (you can add 'upload' if you like)
                type="filepath",          # pass a file path to backend
                label="Hold to record your question",
            )
            tts_toggle = gr.Checkbox(
                label="Read answer aloud",
                value=False,
            )

        # Output widgets, filled from the three values gradio_audio_qa returns.
        transcript_output = gr.Textbox(
            label="Transcribed question",
            lines=2,
        )
        answer_output = gr.Textbox(
            label="Bot answer",
            lines=4,
        )
        answer_audio_output = gr.Audio(
            label="Spoken answer",
            type="filepath",
        )

        # Trigger on new/changed audio
        audio_input.change(
            fn=gradio_audio_qa,
            inputs=[audio_input, tts_toggle],
            outputs=[transcript_output, answer_output, answer_audio_output],
        )

# Launch the app
demo.launch(share=True)


  chatbot = gr.Chatbot(
  chatbot = gr.Chatbot(


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8fa335814db544b41f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# (Run this once in Colab to install metric libraries)
!pip install -q rouge-score sacrebleu

In [None]:
# ✅ 1) Tiny evaluation dataset of (question, expected answer) pairs

# Each example is a dict with a "question" (sent to the RAG pipeline) and the
# hand-written "expected" reference answer the model output is compared against.
eval_examples = [
    {
        "question": "What is the main topic of the FAANG interview video?",
        "expected": "The video explains how FAANG interviews typically work and gives tips on how to prepare for them."
    },
    {
        "question": "According to the videos, what is one key recommendation for writing your CV or resume?",
        "expected": "You should tailor your CV or resume to each job and keep it clear and concise."
    },
    {
        "question": "What do the videos suggest you should do after being rejected from a job application?",
        "expected": "They recommend treating rejection as normal, asking for feedback when possible, and using it to improve your preparation."
    },
    {
        "question": "How do the videos describe the benefit of preparing behavioral stories in advance?",
        "expected": "Preparing behavioral stories in advance helps you answer interview questions more confidently and clearly, often using the STAR method."
    },
    {
        "question": "What is one tip mentioned about researching a company before an interview?",
        "expected": "You should research the company's products and culture so you can tailor your answers and show genuine interest."
    },
    # You can add or edit examples here (aim for 5–10 total).
]


# ✅ 2) Semantic similarity evaluation with embeddings

# Reuse the same embedding model type as your vectorstore
eval_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

def cosine_sim(a, b):
    """Return the cosine similarity of vectors `a` and `b` as a Python float.

    If either vector has zero length the similarity is undefined; 0.0 is
    returned in that case instead of dividing by zero (which would yield NaN
    and a runtime warning).
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0.0:
        return 0.0
    return float(np.dot(a, b) / denom)


def evaluate_example_semantic(example):
    """
    Answer the example's question with the RAG pipeline and score the answer
    against the expected text by cosine similarity of their embeddings.

    Returns a dict with "question", "expected", "answer" and "cosine_similarity".
    """
    question, expected = example["question"], example["expected"]

    # Only the answer text is scored; the retrieved context is discarded.
    answer, _ = youtube_rag_query(question)

    similarity = cosine_sim(
        eval_embeddings.embed_query(expected),
        eval_embeddings.embed_query(answer),
    )
    return dict(
        question=question,
        expected=expected,
        answer=answer,
        cosine_similarity=similarity,
    )


# Score every evaluation example, print each result, then print the average.
semantic_results = list(map(evaluate_example_semantic, eval_examples))

print("=== Semantic similarity results ===")
separator = "-" * 80
for result in semantic_results:
    print("Q:", result["question"])
    print("Expected:", result["expected"])
    print("Answer:", result["answer"])
    print("Cosine similarity:", round(result["cosine_similarity"], 3))
    print(separator)

similarity_total = sum(item["cosine_similarity"] for item in semantic_results)
avg_sim = similarity_total / len(semantic_results)
print("✅ Average semantic similarity:", round(avg_sim, 3))


# ✅ 3) BLEU & ROUGE-L metrics

rouge_scorer_obj = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

def eval_bleu_rouge(expected: str, answer: str):
    """
    Score a model answer against a reference answer with BLEU and ROUGE-L.

    Returns a ``(bleu, rouge_l)`` tuple: ``bleu`` is the ``.score`` of
    sacrebleu's corpus BLEU computed on this single pair (0–100 scale), and
    ``rouge_l`` is the ROUGE-L F-measure (0–1) from ``rouge_scorer_obj``.
    """
    # sacrebleu takes a list of hypotheses and a list of reference streams;
    # here there is one hypothesis and one stream holding one reference.
    hypotheses = [answer]
    reference_streams = [[expected]]
    bleu_result = sacrebleu.corpus_bleu(hypotheses, reference_streams)

    # The reference is passed first and the model answer second.
    rouge_l_result = rouge_scorer_obj.score(expected, answer)["rougeL"]

    return bleu_result.score, rouge_l_result.fmeasure


# ✅ 4) Evaluate all examples with BLEU + ROUGE-L

def evaluate_example_with_metrics(example, answer=None):
    """
    Compute BLEU and ROUGE-L for one eval example.

    Args:
        example: Dict with "question" and "expected" keys.
        answer: Optional pre-generated answer to score. When None (the
            default) the answer is produced by calling ``youtube_rag_query``
            on the question.

    Returns:
        Dict with the keys "question", "expected", "answer", "bleu" and
        "rougeL".
    """
    question = example["question"]
    expected = example["expected"]

    if answer is None:
        answer, _ctx = youtube_rag_query(question)

    bleu, rouge_l = eval_bleu_rouge(expected, answer)

    return {
        "question": question,
        "expected": expected,
        "answer": answer,
        "bleu": bleu,
        "rougeL": rouge_l,
    }

# Score the answers already generated for the semantic evaluation instead of
# querying the RAG pipeline a second time per example. This halves the number
# of pipeline calls and keeps cosine similarity, BLEU and ROUGE-L on the same
# generated text (a second call may return a different answer).
metric_results = [
    evaluate_example_with_metrics(r, answer=r["answer"])
    for r in semantic_results
]

print("\n=== BLEU & ROUGE-L results ===")
for r in metric_results:
    print("Q:", r["question"])
    print("BLEU:", round(r["bleu"], 2), "ROUGE-L:", round(r["rougeL"], 3))
    print("-" * 80)

avg_bleu = sum(r["bleu"] for r in metric_results) / len(metric_results)
avg_rougeL = sum(r["rougeL"] for r in metric_results) / len(metric_results)

print("✅ Average BLEU:", round(avg_bleu, 2))
print("✅ Average ROUGE-L:", round(avg_rougeL, 3))

=== Semantic similarity results ===
Q: What is the main topic of the FAANG interview video?
Expected: The video explains how FAANG interviews typically work and gives tips on how to prepare for them.
Answer: The main topic of the FAANG interview video revolves around the evolving landscape of tech interviews, particularly within major tech companies like Google, Amazon, and Meta. The video discusses how these companies are adapting their interview processes in response to changes in technology and candidate behavior, including the use of AI and advanced surveillance techniques during interviews. It highlights three major trends reshaping hiring practices, the shift from traditional in-person interviews to video interviews, and the challenges companies face in maintaining the integrity of their interview processes.
Cosine similarity: 0.717
--------------------------------------------------------------------------------
Q: According to the videos, what is one key recommendation for writi

In [None]:
# ✅ Wrap youtube_rag_query in a Runnable for LangSmith

def rag_run(inputs: dict) -> str:
    """
    Adapter exposing youtube_rag_query through a single-dict interface.

    Reads the question from ``inputs["question"]`` and returns only the
    answer, discarding the second value ``youtube_rag_query`` returns.
    """
    reply, _ = youtube_rag_query(inputs["question"])
    return reply


# Runnable form of rag_run; this is the target handed to the LangSmith evaluation.
rag_runnable = RunnableLambda(rag_run)

In [None]:
# ✅ LangSmith evaluation: make sure the dataset exists AND has examples, then run evaluation

client = Client()
dataset_name = "youtube-qa-eval"

# 1) Ensure dataset exists (create if not)
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
    print(f"Dataset '{dataset_name}' already exists.")
else:
    print(f"Dataset '{dataset_name}' not found. Creating and populating it...")
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Evaluation dataset for YouTube QA bot.",
    )

# 2) Populate the dataset whenever it has no examples.
# An existing-but-empty dataset (e.g. left behind by an interrupted earlier
# run) would otherwise make evaluate() iterate over zero examples and report
# success without evaluating anything.
# any() stops at the first example, so this does not walk the whole dataset.
dataset_has_examples = any(
    True for _ in client.list_examples(dataset_id=dataset.id)
)
if not dataset_has_examples:
    # Add our eval_examples as dataset entries
    for ex in eval_examples:
        client.create_example(
            inputs={"question": ex["question"]},
            outputs={"expected": ex["expected"]},
            dataset_id=dataset.id,
        )
    print(f"Added {len(eval_examples)} examples to dataset '{dataset_name}'.")

# 3) Run evaluation: no automatic evaluators, just log results to LangSmith
eval_results = evaluate(
    rag_runnable,
    data=dataset_name,            # dataset name
    experiment_prefix="youtube-qa",
)

print("✅ LangSmith evaluation complete. Check LangSmith (Datasets → Evaluations) for the run outputs and traces.")


Dataset 'youtube-qa-eval' already exists.
View the evaluation results for experiment: 'youtube-qa-183a331c' at:
https://smith.langchain.com/o/08e1da5a-9808-4fe7-b2aa-01d1a94c2d6b/datasets/46474ce6-a236-4c77-b7a9-a0e7bf2e9acd/compare?selectedSessions=f13c01dc-4720-45af-b211-9127ac524c67




0it [00:00, ?it/s]

✅ LangSmith evaluation complete. Check LangSmith (Datasets → Evaluations) for the run outputs and traces.
