<a href="https://colab.research.google.com/github/kr5red/Project-4-Business-Case-Multimodal-AI-ChatBot-for-YouTube-Video-QA/blob/main/main_version7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# YouTube RAG Pipeline

0. Installments & Imports
1. Ingest YouTube Videos → DataFrame
2. Convert Transcripts → LangChain Documents
3. Build Vector Store (Chroma + OpenAI Embeddings)
4. (Next Steps – Implemented in Later Cells)

## 0. Installments & Imports

In [2]:
!pip install youtube-transcript-api chromadb pytube
!pip install -q -U langchain langchain-openai langchain-core langchain-community langsmith
!pip install -q openai



In [3]:
import os
import glob
import pandas as pd
from urllib.parse import urlparse, parse_qs

# Colab secrets
from google.colab import userdata

# LangChain imports (modern API)
from langchain_openai import ChatOpenAI
from langchain_core.tools import Tool, tool
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage, SystemMessage
from langchain_community.chat_message_histories import ChatMessageHistory


# Text splitting + Documents
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# VectorDB
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# YouTube transcript
from youtube_transcript_api import (
    YouTubeTranscriptApi,
    TranscriptsDisabled,
    NoTranscriptFound,
)

# Metadata enrichment
from pytube import YouTube

#Speech recognition
from google.colab import files
from openai import OpenAI
import uuid

#Tools
from collections import defaultdict
from typing import Dict, List
from langchain_core.messages import BaseMessage

#deployment
import gradio as gr

In [6]:
from openai import OpenAI

openai_key = userdata.get("OPENAI_API_KEY")
langchain_key = userdata.get("LANGCHAIN_API_KEY")

if openai_key is None:
    raise ValueError("OPENAI_API_KEY not found in Colab secrets.")
if langchain_key is None:
    raise ValueError("LANGCHAIN_API_KEY not found in Colab secrets.")

os.environ["OPENAI_API_KEY"] = openai_key
os.environ["LANGCHAIN_API_KEY"] = langchain_key
os.environ["LANGCHAIN_TRACING_V2"] = "true"    # required for LangSmith
os.environ["LANGCHAIN_PROJECT"] = "youtube-qa-bot"

# Reusable OpenAI client for audio transcription etc.
openai_client = OpenAI()

print("Keys loaded")
print("LangSmith enabled — project:", os.environ["LANGCHAIN_PROJECT"])

Keys loaded
LangSmith enabled — project: youtube-qa-bot


YouTube ingestion
- URL -> video_id
- video_id -> transcript (list)
- transcript -> plain text

In [7]:
#Extract the YouTube video ID from URL formats
def extract_video_id(url: str) -> str:
    parsed = urlparse(url)

    # Short youtu.be links
    if parsed.netloc in ("youtu.be", "www.youtu.be"):
        return parsed.path.lstrip("/")

    # Regular youtube.com links
    if parsed.netloc in ("www.youtube.com", "youtube.com", "m.youtube.com"):
        qs = parse_qs(parsed.query)
        vid = qs.get("v", [None])[0]
        if vid:
            return vid

    raise ValueError(f"Could not extract video_id from URL: {url}")

#Convert a transcript (list of {text, start, duration}) to a single text string
def transcript_to_text(transcript, include_timestamps: bool = False) -> str:
    lines = []
    for entry in transcript:
        if include_timestamps:
            start = entry["start"]
            lines.append(f"[{start:.1f}s] {entry['text']}")
        else:
            lines.append(entry["text"])
    return " ".join(lines)


#Fetch transcript for a single video_id and turn it into plain text.
def fetch_transcript_text(video_id: str, languages=None) -> str:
    try:
        ytt_api = YouTubeTranscriptApi()

        # If you don't care about language, you can call ytt_api.fetch(video_id) without languages
        if languages is None:
            fetched = ytt_api.fetch(video_id)
        else:
            fetched = ytt_api.fetch(video_id, languages=languages)

        # `fetched` is a FetchedTranscript object with `.snippets`
        # Convert to the same structure transcript_to_text() expects
        transcript = [
            {"text": s.text, "start": s.start, "duration": s.duration}
            for s in fetched.snippets
        ]

        return transcript_to_text(transcript, include_timestamps=False)

    except TranscriptsDisabled:
        raise RuntimeError(f"Transcripts are disabled for video_id={video_id}")
    except NoTranscriptFound:
        raise RuntimeError(f"No transcript found for video_id={video_id} in languages={languages}")
    except Exception as e:
        raise RuntimeError(f"Error fetching transcript for {video_id}: {e}")


Ingest YouTube videos into a DataFrame

In [8]:
def ingest_youtube_videos(urls, languages=None, save_dir="transcripts"):
    """
    Fetches YouTube transcripts and automatically saves each one as a .txt file.
    If a transcript file already exists locally, it will be loaded instead of calling YouTube again.
    """
    os.makedirs(save_dir, exist_ok=True)
    rows = []
    for url in urls:
        video_id = extract_video_id(url)
        txt_path = os.path.join(save_dir, f"{video_id}.txt")
        # Load local transcript if available
        if os.path.exists(txt_path):
            print(f":starkes_häkchen: Loading saved transcript for {video_id}")
            with open(txt_path, "r", encoding="utf-8") as f:
                transcript = f.read()
        else:
            # Fetch from YouTube
            try:
                print(f"↻ Fetching transcript from YouTube for {video_id}...")
                transcript = fetch_transcript_text(video_id, languages=languages)
                # Save to .txt
                with open(txt_path, "w", encoding="utf-8") as f:
                    f.write(transcript)
                print(f":diskette: Transcript saved at {txt_path}")
            except Exception as e:
                print(f":warnung: Skipping {url}: {e}")
                transcript = ""
        rows.append({
            "video_id": video_id,
            "url": url,
            "transcript": transcript,
        })
    return pd.DataFrame(rows)

In [9]:
def load_local_transcripts(urls, save_dir="transcripts"):
    rows = []
    for url in urls:
        video_id = extract_video_id(url)
        txt_path = os.path.join(save_dir, f"{video_id}.txt")
        if not os.path.exists(txt_path):
            print(f":warnung: No local transcript found for {video_id}")
            transcript = ""
        else:
            print(f":starkes_häkchen: Loading local transcript for {video_id}")
            with open(txt_path, "r", encoding="utf-8") as f:
                transcript = f.read()
        rows.append({
            "video_id": video_id,
            "url": url,
            "transcript": transcript,
        })
    return pd.DataFrame(rows)

In [10]:
#Ingest multiple videos ----
video_urls = [
    "https://www.youtube.com/watch?v=a43Je1KQY3s", #
    "https://www.youtube.com/watch?v=ck5nw7R1uEs", # FAANG Interview
    "https://www.youtube.com/watch?v=jXXOI01IuPs", # What is happening in big tech interviews 2025
    "https://www.youtube.com/watch?v=YUL8ayPe1r8", # Outdated resume experience
    "https://www.youtube.com/watch?v=pjqi_M3SPwY", # Resume mistakes to avoid
    "https://www.youtube.com/watch?v=mmQcX6HpCGs", # Interview
    "https://www.youtube.com/watch?v=dG3TdJn7JP4", # Interview
    "https://www.youtube.com/watch?v=WdyiUe7_3cA", # Interview
    "https://www.youtube.com/watch?v=m-pjMa43tho", # Interview
    "https://www.youtube.com/watch?v=8OOvvJ0hd3M"
]

df_videos = ingest_youtube_videos(video_urls, languages=["en", "de"], save_dir="transcripts"
)

if df_videos.empty:
    print("No videos ingested ...")
else:
    print(df_videos.head())

↻ Fetching transcript from YouTube for a43Je1KQY3s...
:warnung: Skipping https://www.youtube.com/watch?v=a43Je1KQY3s: Error fetching transcript for a43Je1KQY3s: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=a43Je1KQY3s! This is most likely caused by:

YouTube is blocking requests from your IP. This usually is due to one of the following reasons:
- You have done too many requests and your IP has been blocked by YouTube
- You are doing requests from an IP belonging to a cloud provider (like AWS, Google Cloud Platform, Azure, etc.). Unfortunately, most IPs from cloud providers are blocked by YouTube.

There are two things you can do to work around this:
1. Use proxies to hide your IP address, as explained in the "Working around IP bans" section of the README (https://github.com/jdepoix/youtube-transcript-api?tab=readme-ov-file#working-around-ip-bans-requestblocked-or-ipblocked-exception).
2. (NOT RECOMMENDED) If you authenticate your requests using cookies

Loading saved transcription as txt. files

In [11]:
from google.colab import files

# 1. Open a file upload dialog (you can select all 10 .txt files at once)
uploaded = files.upload()

# 2. Read each uploaded file and store its contents in a dictionary
texts = {}

for filename, raw_data in uploaded.items():
    # raw_data is binary → decode to UTF-8 text
    texts[filename] = raw_data.decode('utf-8')

# 3. Print a list of successfully loaded files
print("Loaded files:")
for name in texts:
    print(" -", name)


Saving pjqi_M3SPwY.txt to pjqi_M3SPwY.txt
Saving mmQcX6HpCGs.txt to mmQcX6HpCGs.txt
Saving m-pjMa43tho.txt to m-pjMa43tho.txt
Saving jXXOI01IuPs.txt to jXXOI01IuPs.txt
Saving dG3TdJn7JP4.txt to dG3TdJn7JP4.txt
Saving ck5nw7R1uEs.txt to ck5nw7R1uEs.txt
Saving a43Je1KQY3s.txt to a43Je1KQY3s.txt
Saving YUL8ayPe1r8.txt to YUL8ayPe1r8.txt
Saving WdyiUe7_3cA.txt to WdyiUe7_3cA.txt
Saving 8OOvvJ0hd3M.txt to 8OOvvJ0hd3M.txt
Loaded files:
 - pjqi_M3SPwY.txt
 - mmQcX6HpCGs.txt
 - m-pjMa43tho.txt
 - jXXOI01IuPs.txt
 - dG3TdJn7JP4.txt
 - ck5nw7R1uEs.txt
 - a43Je1KQY3s.txt
 - YUL8ayPe1r8.txt
 - WdyiUe7_3cA.txt
 - 8OOvvJ0hd3M.txt


In [12]:
import re

# 1. Map: video_id -> transcript text (clean filename: remove ".txt" and " (1)" etc.)
id_to_text = {}

for filename, content in texts.items():
    # remove .txt
    base = filename[:-4] if filename.endswith(".txt") else filename
    # remove a suffix like " (1)", " (2)", etc.
    base = re.sub(r" \(\d+\)$", "", base)
    video_id = base
    id_to_text[video_id] = content

print("Cleaned video_ids from filenames:")
for vid in id_to_text.keys():
    print(" -", vid)


Cleaned video_ids from filenames:
 - pjqi_M3SPwY
 - mmQcX6HpCGs
 - m-pjMa43tho
 - jXXOI01IuPs
 - dG3TdJn7JP4
 - ck5nw7R1uEs
 - a43Je1KQY3s
 - YUL8ayPe1r8
 - WdyiUe7_3cA
 - 8OOvvJ0hd3M


In [13]:
rows = []

for url in video_urls:
    video_id = extract_video_id(url)
    transcript = id_to_text.get(video_id, "")
    if not transcript:
        print(f"⚠️ Warning: no transcript found for video_id={video_id}")
    rows.append({
        "video_id": video_id,
        "url": url,
        "transcript": transcript,
    })

df_videos = pd.DataFrame(rows)
df_videos.head()


Unnamed: 0,video_id,url,transcript
0,a43Je1KQY3s,https://www.youtube.com/watch?v=a43Je1KQY3s,hey does your resume feel outdated or invisibl...
1,ck5nw7R1uEs,https://www.youtube.com/watch?v=ck5nw7R1uEs,3 million that is the number of rums Google re...
2,jXXOI01IuPs,https://www.youtube.com/watch?v=jXXOI01IuPs,Something strange is happening with tech inter...
3,YUL8ayPe1r8,https://www.youtube.com/watch?v=YUL8ayPe1r8,Your resume isn't working because you're follo...
4,pjqi_M3SPwY,https://www.youtube.com/watch?v=pjqi_M3SPwY,all right so when I was applying to my first f...


Add Chunking + LangChain Documents on Top the Your DataFrame

In [14]:
#Convert each row in df_videos (video_id, url, transcript) into multiple LangChain Documents with metadata.
def df_to_documents(
    df: pd.DataFrame,
    chunk_size: int = 1000,
    chunk_overlap: int = 150,
) -> list[Document]:
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    docs: list[Document] = []

    for _, row in df.iterrows():
        video_id = row["video_id"]
        url = row["url"]
        transcript = row["transcript"]

        # Try to fetch some metadata from YouTube
        title = author = description = None
        try:
            yt = YouTube(url)
            title = yt.title
            author = yt.author
            description = yt.description
        except Exception:
            pass

        # Split transcript into chunks
        chunks = splitter.split_text(transcript)

        for idx, chunk in enumerate(chunks):
            doc = Document(
                page_content=chunk,
                metadata={
                    "video_id": video_id,
                    "url": url,
                    "title": title,
                    "author": author,
                    "description": description,
                    "chunk_index": idx,
                },
            )
            docs.append(doc)

    return docs


In [15]:
documents = df_to_documents(df_videos)
print(f"Created {len(documents)} chunks from {len(df_videos)} videos.")

Created 206 chunks from 10 videos.


Build a LangChain VectorStore (Chroma) from Documents

In [16]:
# from langchain_community.embeddings import HuggingFaceEmbeddings

def build_vectorstore_from_documents(
    docs: list[Document],
    collection_name: str = "youtube_rag",
    persist_directory: str | None = None,
):
    """
    Build a Chroma vector store from LangChain Documents.
    Uses OpenAI embeddings by default.
    """
    # OpenAI embedding model
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    vectorstore = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        collection_name=collection_name,
        persist_directory=persist_directory,  # can be None for in-memory
    )
    return vectorstore

In [17]:
vectorstore = build_vectorstore_from_documents(
    documents,
    collection_name="youtube_rag",
    persist_directory="./chroma_youtube_rag",
)

LLM + Retriever + Memory

In [19]:
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.2)

retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

memory = ChatMessageHistory()

rag_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant answering questions about the content "
            "of YouTube videos indexed in a vector database. "
            "Use the retrieved context to answer accurately."
        ),
        ("human", "Context from videos:\n{context}\n\nQuestion: {question}")
    ]
)

Build the RAG pipeline manually

In [22]:
def youtube_rag_query(question: str):
    """
    Full RAG pipeline:
    1. Retrieve relevant video chunks
    2. Add them into the prompt
    3. Call the LLM
    4. Store chat history (memory)
    """
    # ---- Retrieval ----
    docs = retriever.invoke(question)

    if not docs:
        context = "No relevant content found."
    else:
        context_parts = []
        for i, d in enumerate(docs):
            meta = d.metadata or {}
            context_parts.append(
                f"[{i+1}] Title: {meta.get('title', 'Unknown')}\n"
                f"Channel: {meta.get('author', 'Unknown')}\n"
                f"Description: {meta.get('description', 'No description')}\n"
                f"Chunk {meta.get('chunk_index', '?')}:\n{d.page_content}"
            )
        context = "\n\n---\n".join(context_parts)

    # ---- Build prompt ----
    prompt_msg = rag_prompt.format_messages(
        context=context,
        question=question,
    )

    # ---- LLM call ----
    response = llm.invoke(prompt_msg)

    # ---- Memory update ----
    memory.add_message(HumanMessage(content=question))
    memory.add_message(response)

    return response.content, context


Personality

In [23]:
# Define LLM
llm = ChatOpenAI(
    model="gpt-4.1-mini",
    temperature=0.6,
)

# System prompt
system_prompt = """
You are a cheerful and helpful assistant for questions about the content of provided youtube videos.

Your personality
- You reply about the video material provided to you in a kind manner.
- You explain efficiently.
- You are honest about actual contexts in the videos.

Important:
- Always respond with the same language the user is using to ask questions.
- Only use the information provided in the context.
- If something is not in the context, please mention so and don't make things up.

"""

# Prompt template
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        (
            "human",
            "question of user:\n{question}\n\n"
            "relevant context from youtube videos:\n{context}"
        ),
    ]
)

# Retriever from vector store
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

# RAG-Chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
)

# Helper function
def ask(question: str):
    """
    Ask the chatbot. The question can be in any language.
    The chatbot will answer in the language the user is using.
    """
    result = qa_chain.invoke({"query": question})
    print(result["result"])
    return result


NameError: name 'RetrievalQA' is not defined

Job Agent and Tools

In [None]:
#Tool functions

def rag_search(query: str) -> str:
    """Searches the internal knowledge base for relevant information."""
    docs = retriever.get_relevant_documents(query)
    if not docs:
        return "No relevant information found."

    results = []
    for i, doc in enumerate(docs[:3], 1):
        source = doc.metadata.get('source', 'Unknown')
        content = doc.page_content[:300]
        results.append(f"[Source {i}: {source}]\n{content}...")

    return "\n\n".join(results)


def generate_interview_questions(job_role: str) -> str:
    """Generates common interview questions for a specific job role."""
    common_questions = [
        "Tell me about yourself.",
        "Why are you interested in this position?",
        "What are your greatest strengths?",
        "What is your biggest weakness?",
        "Where do you see yourself in 5 years?"
    ]

    role_specific = {
        "software engineer": [
            "Explain the difference between SQL and NoSQL databases.",
            "How do you approach debugging complex code?",
            "Describe a challenging technical problem you solved.",
            "What's your experience with version control systems?"
        ],
        "data scientist": [
            "Explain the bias-variance tradeoff.",
            "How do you handle missing data in datasets?",
            "Describe a machine learning project you worked on.",
            "What's your experience with A/B testing?"
        ],
        "product manager": [
            "How do you prioritize features for a product?",
            "Describe a time you handled conflicting stakeholder requirements.",
            "How do you measure product success?",
            "Walk me through your product launch process."
        ],
        "marketing": [
            "How do you measure campaign success?",
            "Describe your experience with digital marketing tools.",
            "How do you identify target audiences?",
            "What's your approach to content strategy?"
        ]
    }

    role_lower = job_role.lower()
    specific_questions = []

    for key, questions in role_specific.items():
        if key in role_lower:
            specific_questions = questions
            break

    if not specific_questions:
        specific_questions = [
            "What relevant experience do you have for this role?",
            "How do you stay updated in your field?",
            "Describe a challenge you faced and how you overcame it."
        ]

    result = f"Interview Questions for {job_role}:\n\n"
    result += "Common Questions:\n"
    for i, q in enumerate(common_questions, 1):
        result += f"{i}. {q}\n"

    result += f"\n{job_role}-Specific Questions:\n"
    for i, q in enumerate(specific_questions, 1):
        result += f"{i}. {q}\n"

    return result

def evaluate_interview_answer(question_and_answer: str) -> str:
    """Evaluates an interview answer using the STAR method."""
    parts = question_and_answer.split("|")
    if len(parts) != 2:
        return "Please format as: 'Question: your question | Answer: your answer'"

    question = parts[0].replace("Question:", "").strip()
    answer = parts[1].replace("Answer:", "").strip()

    feedback = []
    answer_lower = answer.lower()

    has_situation = any(word in answer_lower for word in ["when", "situation", "time", "once"])
    has_task = any(word in answer_lower for word in ["needed", "had to", "task", "goal"])
    has_action = any(word in answer_lower for word in ["i did", "i implemented", "i created"])
    has_result = any(word in answer_lower for word in ["result", "outcome", "achieved"])

    feedback.append("STAR Method Analysis:")
    feedback.append(f"Situation: {'Present' if has_situation else 'Missing'}")
    feedback.append(f"Task: {'Present' if has_task else 'Missing'}")
    feedback.append(f"Action: {'Present' if has_action else 'Missing'}")
    feedback.append(f"Result: {'Present' if has_result else 'Missing'}")

    word_count = len(answer.split())
    if word_count < 50:
        feedback.append("\nAnswer is too brief. Aim for 150-250 words.")
    elif word_count > 300:
        feedback.append("\nAnswer might be too long. Keep it concise.")
    else:
        feedback.append("\nGood length.")

    has_numbers = any(char.isdigit() for char in answer)
    if has_numbers:
        feedback.append("Includes specific numbers/metrics.")
    else:
        feedback.append("Add quantifiable results.")

    return "\n".join(feedback)

def analyze_job_description(job_description: str) -> str:
    """Analyzes a job description and extracts key requirements."""
    jd_lower = job_description.lower()

    common_skills = [
        "python", "java", "javascript", "sql", "aws", "communication",
        "leadership", "project management", "agile"
    ]

    found_skills = [skill for skill in common_skills if skill in jd_lower]

    analysis = f"""
Job Description Analysis:

KEY SKILLS REQUIRED:
{chr(10).join(f"- {skill.title()}" for skill in found_skills) if found_skills else "- No specific skills clearly listed"}

ACTION ITEMS:
1. Tailor your resume to highlight these skills: {', '.join(found_skills[:5])}
2. Prepare STAR method examples for these competencies
3. Research the company culture and values
4. Prepare questions about role specifics
"""
    return analysis

#Wrapping function as tools

tools = [
    Tool(
        name="VideoTranscriptSearch",
        func=rag_search,
        description="Searches transcribed videos about job applications, interviews, and career advice. Use this to find expert advice from video content."
    ),
    Tool(
        name="ResumeAnalyzer",
        func=analyze_resume,
        description="Analyzes a resume and provides improvement suggestions. Input should be the resume text."
    ),
    Tool(
        name="InterviewQuestionGenerator",
        func=generate_interview_questions,
        description="Generates interview questions for a job role. Input should be the job title (e.g., 'Software Engineer')."
    ),
    Tool(
        name="MockInterviewEvaluator",
        func=evaluate_interview_answer,
        description="Evaluates interview answers using STAR method. Format: 'Question: [question] | Answer: [answer]'."
    ),
    Tool(
        name="JobDescriptionAnalyzer",
        func=analyze_job_description,
        description="Analyzes job descriptions and extracts key requirements. Input should be the job description text."
    ),
]

# LLM

llm = ChatOpenAI(
    model="gpt-4-turbo-preview",
    temperature=0.7,
)

# Agent prompt

prompt = ChatPromptTemplate.from_messages([
    ("system", """You are an expert Job Application and Interview Coach.

Available tools:
- VideoTranscriptSearch: Find expert advice from transcribed videos
- ResumeAnalyzer: Analyze and improve resumes
- InterviewQuestionGenerator: Generate relevant interview questions
- MockInterviewEvaluator: Evaluate practice interview answers
- JobDescriptionAnalyzer: Analyze job postings

Be supportive, specific, and actionable in your advice."""),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
    MessagesPlaceholder(variable_name="agent_scratchpad"),
])

# Create memory

memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

# Create agent with tools

agent = create_openai_functions_agent(
    llm=llm,
    tools=tools,
    prompt=prompt
)

# Create agent executor with tools

agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    memory=memory,
    verbose=True,
    handle_parsing_errors=True,
    max_iterations=5
)

# Usage of agent

def ask(question: str):
    """Ask a question to the job coach agent."""
    result = agent_executor.invoke({"input": question})
    answer = result["output"]

    print(f"\n{'='*60}")
    print(f"Question: {question}")
    print(f"{'='*60}")
    print(f"Answer:\n{answer}")
    print(f"{'='*60}\n")

    return answer

In [None]:
#Tools

@tool
def youtube_rag_qa(question: str) -> str:
    """Answer questions about the YouTube videos that have been ingested into the vector store."""
    answer, _ctx = youtube_rag_query(question)
    return answer

@tool
def transcribe_audio_file(audio_path: str) -> str:
    """Transcribe an audio file at a local path to text."""
    return transcribe_audio_to_text(audio_path)

tools = [youtube_rag_qa, transcribe_audio_file]
tool_map = {t.name: t for t in tools}

# LLM that can call tools
tool_llm = llm.bind_tools(tools)

# ---- Conversation memory (per session) ----

session_history: Dict[str, List[BaseMessage]] = defaultdict(list)

SYSTEM_PROMPT = (
    "You are an assistant that answers questions about a set of YouTube videos "
    "that have already been ingested into a vector database.\n"
    "If the user asks about the content of the videos, you should use the "
    "`youtube_rag_qa` tool.\n"
    "If the user gives or refers to an audio file path, you can use the "
    "`transcribe_audio_file` tool to turn it into text and then use "
    "`youtube_rag_qa` with the transcribed question.\n"
    "Always give concise, helpful answers."
)

def agent_chat(user_input: str, session_id: str = "default") -> str:
    """
    Simple agent:
    - Uses OpenAI tool-calling to decide whether to call youtube_rag_qa / transcribe_audio_file
    - Keeps per-session memory of previous turns
    """
    history = session_history[session_id]

    # 1) Build the message list: system + history + new user message
    messages: List[BaseMessage] = [
        SystemMessage(content=SYSTEM_PROMPT),
        *history,
        HumanMessage(content=user_input),
    ]

    # 2) First model call: the model may decide to call tools
    ai_msg = tool_llm.invoke(messages)
    messages.append(ai_msg)

    # If the model already answered without tools, just return that
    if not getattr(ai_msg, "tool_calls", None):
        history.extend([HumanMessage(content=user_input), ai_msg])
        return ai_msg.content

    # 3) If there are tool calls, execute them
    tool_messages: List[ToolMessage] = []
    for call in ai_msg.tool_calls:
        tool_name = call["name"]
        tool_args = call["args"]
        tool_id = call["id"]

        tool = tool_map.get(tool_name)
        if tool is None:
            tool_result = f"Error: tool '{tool_name}' not found."
        else:
            # invoke() expects a dict of arguments for the tool
            tool_result = tool.invoke(tool_args)

        tool_messages.append(
            ToolMessage(
                content=str(tool_result),
                tool_call_id=tool_id,
            )
        )

    # 4) Second model call: model sees tool results and produces final answer
    messages.extend(tool_messages)
    final_ai = llm.invoke(messages)

    # 5) Update history with just the human input and final answer
    history.extend([
        HumanMessage(content=user_input),
        final_ai,
    ])

    return final_ai.content


Test the RAG pipeline

In [None]:
answer, used_context = youtube_rag_query("Give me an overview of the videos you indexed.")

print("=== ANSWER ===\n")
print(answer)

print("\n=== CONTEXT USED ===\n")
print(used_context)

=== ANSWER ===

The indexed videos focus on job interview preparation and strategies. Here's an overview of the content:

1. **Interview Preparation**: The videos emphasize the importance of being well-prepared for interviews, including having a list of thoughtful questions to ask the employer. Suggested questions include inquiries about the company culture, goals, and performance evaluation.

2. **Positive Framing**: Candidates are advised to avoid speaking negatively about previous employers. Instead, they should highlight what they learned from past experiences and how those experiences can benefit the new role.

3. **Body Language and Etiquette**: The videos stress the significance of body language during interviews. Candidates should maintain good posture, make eye contact, and be aware of their movements to convey confidence and presence.

4. **Answering Common Questions**: Tips are provided on how to answer common interview questions effectively. Candidates are encouraged to be 

Chat loop



In [None]:
def chat_with_youtube_bot(session_id: str = "default"):
    """
    Simple text-based chat loop in Colab, powered by the tool-calling agent + memory.
    """
    print("YouTube QA ChatBot (agent powered)")
    print("Ask me anything about the videos I have indexed.")
    print("Type 'exit' or 'quit' to end the chat.\n")

    while True:
        user_input = input("You: ").strip()

        if user_input.lower() in {"exit", "quit"}:
            print("Bot: Bye!")
            break

        if not user_input:
            continue

        # ✅ Use the agent (tools + memory), not the bare RAG function
        answer = agent_chat(user_input, session_id=session_id)
        print(f"Bot: {answer}\n")


# Run this to start chatting:
chat_with_youtube_bot()


YouTube QA ChatBot (agent powered)
Ask me anything about the videos I have indexed.
Type 'exit' or 'quit' to end the chat.

You: what should I not do?
Bot: In an interview, you should avoid speaking negatively about previous employers and instead focus on what you've learned. Be mindful of your body language; avoid distracting movements like tapping your fingers. Also, prepare a few questions for your employer to show your interest and preparation.

You: exit
Bot: Bye!


Speech recognition

In [None]:
def transcribe_audio_to_text(audio_path: str, model: str = "gpt-4o-mini-transcribe") -> str:
    """
    Transcribe an audio file to text using OpenAI's speech recognition.
    Adjust model name if needed depending on what your account supports.
    """
    with open(audio_path, "rb") as audio_file:
        transcription = openai_client.audio.transcriptions.create(
            model=model,
            file=audio_file,
            response_format="text",
        )
    # For newer clients this is already a string; if it's an object, cast to str
    return str(transcription)

def ask_bot_with_audio(session_id: str = "voice-session"):
    """
    Colab helper to:
    1. Upload an audio file with a spoken question
    2. Use the agent (via answer_from_audio_file) to get transcript + answer
    3. Print both transcript and answer
    """
    print("🎙️ Upload an audio file (e.g., .wav, .mp3) with your question...")
    uploaded = files.upload()

    if not uploaded:
        print("No file uploaded.")
        return

    audio_filename = next(iter(uploaded.keys()))
    print(f"📁 Uploaded file: {audio_filename}")

    try:
        # This already uses agent_chat under the hood
        transcript_text, answer = answer_from_audio_file(audio_filename, session_id=session_id)
    except Exception as e:
        print("❌ Error while processing the audio:", e)
        return

    print("\n📝 Transcribed question:")
    print(transcript_text)

    print("\n🤖 Bot answer:")
    print(answer)

    print("\n Transcribed question:")
    print(transcript_text)

    # 2) Ask the RAG bot
    try:
        answer, _context = youtube_rag_query(transcript_text, session_id=session_id)
    except TypeError:
        # Fallback if youtube_rag_query only expects (question)
        answer, _context = youtube_rag_query(transcript_text)

    print("\n🤖 Bot answer:")
    print(answer)

def tts_from_text(text: str, output_dir: str = "tts_outputs") -> str:
    """
    Convert answer text to speech using OpenAI TTS and return the audio file path.
    """
    os.makedirs(output_dir, exist_ok=True)
    filename = f"answer_{uuid.uuid4().hex}.mp3"
    out_path = os.path.join(output_dir, filename)

    # Adjust 'model' if needed to whatever TTS model you have access to
    speech = openai_client.audio.speech.create(
        model="gpt-4o-mini-tts",  # or another TTS-capable model from your OpenAI account
        voice="alloy",
        input=text,
    )

    with open(out_path, "wb") as f:
        f.write(speech.read())

    return out_path


In [None]:
ask_bot_with_audio()

🎙️ Upload an audio file (e.g., .wav, .mp3) with your question...


KeyboardInterrupt: 

In [None]:
def answer_from_audio_file(audio_path: str, session_id: str = "voice-session"):
    """
    Core backend logic:
    1. Transcribe audio file at `audio_path` to text
    2. Run the text through the agent (which internally uses RAG + tools + memory)
    3. Return (transcript, answer)
    """
    transcript_text = transcribe_audio_to_text(audio_path)
    answer = agent_chat(transcript_text, session_id=session_id)
    return transcript_text, answer


Change for Gradio later

In [None]:
def gradio_audio_qa(audio_path, tts_enabled: bool):
    """
    Gradio backend:
    - Takes an audio question (mic or upload)
    - Uses the agent-backed pipeline to get (transcript, answer)
    - Optionally generates spoken answer audio if tts_enabled is True
    """
    if audio_path is None:
        return "", "No audio received.", None

    # This already calls agent_chat under the hood
    transcript, answer = answer_from_audio_file(audio_path)

    answer_audio_path = None
    if tts_enabled:
        try:
            answer_audio_path = tts_from_text(answer)
        except Exception as e:
            print("TTS error:", e)

    return transcript, answer, answer_audio_path


with gr.Blocks() as demo:
    gr.Markdown("# 🎬 YouTube QA Bot with Voice Input")

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone"],   # 👈 mic only
            type="filepath",          # pass a file path to backend
            label="Hold to record your question",
        )
        tts_toggle = gr.Checkbox(
            label="Read answer aloud",
            value=False,
        )

    transcript_output = gr.Textbox(
        label="Transcribed question",
        lines=2,
    )
    answer_output = gr.Textbox(
        label="Bot answer",
        lines=4,
    )
    answer_audio_output = gr.Audio(
        label="Spoken answer",
        type="filepath",
    )

    audio_input.change(
        fn=gradio_audio_qa,
        inputs=[audio_input, tts_toggle],
        outputs=[transcript_output, answer_output, answer_audio_output],
    )

demo.launch()