In [2]:
# ===============================
# STEP 0: Install Libraries
# ===============================
!pip install -q youtube-transcript-api langchain langchain-community langchain-openai faiss-cpu tiktoken python-dotenv


In [6]:
# ===============================
# STEP 0.1: Import & API Key
# ===============================
import os

os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY"  # üî¥ yaha apni key daalo


In [1]:
!pip uninstall -y youtube-transcript-api
!pip install --upgrade youtube-transcript-api


Found existing installation: youtube-transcript-api 1.2.3
Uninstalling youtube-transcript-api-1.2.3:
  Successfully uninstalled youtube-transcript-api-1.2.3
Collecting youtube-transcript-api
  Using cached youtube_transcript_api-1.2.3-py3-none-any.whl.metadata (24 kB)
Using cached youtube_transcript_api-1.2.3-py3-none-any.whl (485 kB)
Installing collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-1.2.3


In [3]:
!pip install pytube



Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl.metadata (5.0 kB)
Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/57.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m57.6/57.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytube
Successfully installed pytube-15.0.0


In [4]:
from pytube import YouTube

yt = YouTube("https://www.youtube.com/watch?v=ILrYwPd1Dc")

caption = yt.captions.get_by_language_code('en')
if caption:
    text = caption.generate_srt_captions()
    print(text[:1000])
else:
    print("‚ùå English captions available nahi hain")


RegexMatchError: regex_search: could not find match for (?:v=|\/)([0-9A-Za-z_-]{11}).*

In [None]:
# ===============================
# STEP 2: Split Text into Chunks
# ===============================
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

documents = text_splitter.create_documents([transcript_text])

print(f"Total Chunks: {len(documents)}")
print(documents[0])


In [None]:
# ===============================
# STEP 3: Create Embeddings & Store in FAISS
# ===============================
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small"
)

vector_store = FAISS.from_documents(documents, embeddings)


In [None]:
# ===============================
# STEP 4: Create Retriever
# ===============================
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4}
)

retriever


In [None]:
# ===============================
# STEP 5: Prompt Template
# ===============================
from langchain_core.prompts import PromptTemplate

prompt = PromptTemplate(
    template="""
You are a helpful assistant.
Answer ONLY from the provided transcript context.
If the context is insufficient, say "I don't know".

Context:
{context}

Question:
{question}
""",
    input_variables=["context", "question"]
)


In [None]:
# ===============================
# STEP 6: LLM
# ===============================
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.2
)


In [None]:
# ===============================
# STEP 7: RAG Chain
# ===============================
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    RunnableParallel({
        "context": retriever | RunnableLambda(format_docs),
        "question": RunnablePassthrough()
    })
    | prompt
    | llm
    | StrOutputParser()
)


In [None]:
# ===============================
# STEP 8: Ask Questions
# ===============================
rag_chain.invoke("Who is Demis Hassabis?")


In [None]:
rag_chain.invoke("Is nuclear fusion discussed in this video?")


In [None]:
rag_chain.invoke("Can you summarize the video?")


In [None]:
YouTube Video
   ‚Üì
Transcript
   ‚Üì
Chunking
   ‚Üì
Embeddings
   ‚Üì
FAISS Vector DB
   ‚Üì
Retriever
   ‚Üì
Prompt Augmentation
   ‚Üì
LLM
   ‚Üì
Answer
