In [None]:
# Mount Google Drive at /content/drive; later cells read and write files under that path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Pinecone (vector db, latest official package)
!pip install --upgrade pinecone

In [None]:
!pip install --upgrade "langchain>=0.2.0" langchain-openai langchain-pinecone


In [None]:
!pip install --upgrade langchain-experimental


In [None]:
!pip install git+https://github.com/openai/whisper.git


In [None]:
!pip install --upgrade yt-dlp


In [None]:
!pip install tqdm pydub


In [None]:
!pip install --upgrade numpy


In [7]:
from google.colab import userdata
import os

# Copy the API keys out of Colab's secret store into the process environment,
# which is where later cells read them from.
for _secret_name in ("OPENAI_API_KEY", "PINECONE_API_KEY"):
    os.environ[_secret_name] = userdata.get(_secret_name)

# The Pinecone environment is a fixed value, not a secret.
os.environ["PINECONE_ENVIRONMENT"] = "us-east1-aws"

In [None]:
import yt_dlp
import whisper
import os
from urllib.parse import urlparse, parse_qs

def get_youtube_video_id(url):
    """Extract the video ID from a YouTube URL.

    Supported forms:
      * ``https://(www.|m.|music.)youtube.com/watch?v=<id>`` (any path carrying ``v=``)
      * ``https://(www.|m.|music.)youtube.com/{embed,shorts,live,v}/<id>``
      * ``https://youtu.be/<id>`` (extra path segments and query strings are ignored)

    Args:
        url: The URL string to inspect.

    Returns:
        The video ID as a string.

    Raises:
        ValueError: If the host is not a recognised YouTube host, no ID can be
            found, or the ID contains characters other than letters, digits,
            ``_`` and ``-``.
    """
    import re

    parsed = urlparse(url)
    host = parsed.hostname or ""
    video_id = ""

    if host in ("www.youtube.com", "youtube.com", "m.youtube.com", "music.youtube.com"):
        # Prefer the ?v= query parameter; .get() avoids a KeyError when it is absent.
        video_id = parse_qs(parsed.query).get("v", [""])[0]
        if not video_id:
            # Fall back to path-style URLs such as /shorts/<id> or /embed/<id>.
            segments = [part for part in parsed.path.split("/") if part]
            if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
                video_id = segments[1]
    elif host == "youtu.be":
        # Only the first path segment is the ID; drop anything after it.
        video_id = parsed.path.lstrip("/").split("/", 1)[0]

    # The ID is later embedded in file names, so reject anything that is not a
    # plain token (this also covers the "nothing found" case).
    if not re.fullmatch(r"[A-Za-z0-9_-]+", video_id):
        raise ValueError("Invalid YouTube URL")
    return video_id

def download_and_transcribe_youtube(youtube_url, output_dir="/content/drive/MyDrive/github/youtube-rag/rag", model_name="base"):
    """Download a YouTube video's audio as MP3 and transcribe it with Whisper.

    Args:
        youtube_url: URL of the video; its ID is obtained via get_youtube_video_id.
        output_dir: Directory that receives ``audio_<id>.mp3`` and
            ``transcript_<id>.txt``. It is created if it does not exist.
        model_name: Name of the Whisper model to load. Defaults to "base".

    Returns:
        A ``(transcript, transcript_path)`` tuple: the transcribed text and the
        path of the text file it was written to.

    Raises:
        ValueError: Propagated from get_youtube_video_id for an unusable URL.
        FileNotFoundError: If the expected MP3 is missing after the download step.
    """
    video_id = get_youtube_video_id(youtube_url)

    # Make sure the target directory exists before anything is written into it.
    os.makedirs(output_dir, exist_ok=True)

    # "%(ext)s" is left for yt-dlp to fill in; the post-processor below converts to mp3.
    output_filename = os.path.join(output_dir, f'audio_{video_id}.%(ext)s')
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': output_filename,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'keepvideo': False,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])

    audio_path = os.path.join(output_dir, f"audio_{video_id}.mp3")
    if not os.path.isfile(audio_path):
        # Fail with a clear message rather than an opaque error from the transcription step.
        raise FileNotFoundError(f"Expected audio file was not created: {audio_path}")

    model = whisper.load_model(model_name)
    result = model.transcribe(audio_path)
    transcript = result['text']

    transcript_path = os.path.join(output_dir, f"transcript_{video_id}.txt")
    # Explicit encoding: transcripts may contain non-ASCII text and the platform default varies.
    with open(transcript_path, "w", encoding="utf-8") as f:
        f.write(transcript)
    return transcript, transcript_path


In [None]:
# Download and transcribe one video, then report where the transcript file was written.
youtube_url = "https://www.youtube.com/watch?v=HgcoFVqG0ms"
transcript, transcript_path = download_and_transcribe_youtube(youtube_url)
print(f"Transcript saved at: {transcript_path}")


In [8]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings


In [9]:
# Load the saved transcript for video HgcoFVqG0ms from Drive.
# The encoding is explicit so the read does not depend on the platform default.
with open("/content/drive/MyDrive/github/youtube-rag/rag/transcript_HgcoFVqG0ms.txt", encoding="utf-8") as f:
    text = f.read()


In [10]:
# Semantic chunker backed by OpenAI embeddings (uses your OpenAI key automatically).
splitter_settings = {
    # can also be 'standard_deviation', 'interquartile', or 'gradient'
    "breakpoint_threshold_type": "percentile",
    # default is 95, tune this percentile if you want more/fewer splits
    "breakpoint_threshold_amount": 95.0,
}
text_splitter = SemanticChunker(OpenAIEmbeddings(), **splitter_settings)


In [None]:
# Split the transcript into semantic chunks and report how many were produced.
docs = text_splitter.create_documents([text])
print(f"Total semantic chunks: {len(docs)}")

# Build one preview per chunk (first 300 chars), then print them in order.
chunk_previews = [
    f"\n--- Chunk {i+1} ---\n{d.page_content[:300]}"
    for i, d in enumerate(docs)
]
for preview in chunk_previews:
    print(preview)


In [12]:
import os
from pinecone import Pinecone, ServerlessSpec

# Name of the Pinecone index that holds the transcript chunks.
index_name = "youtube-rag-index"

# Client authenticated with the key placed in the environment earlier.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Create the index only when it is not already listed, so re-running this cell is harmless.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
    pc.create_index(
        name=index_name,
        spec=serverless_spec,
        metric='cosine',   # 'euclidean' is the alternative noted for this setting
        dimension=1536,    # 1536 for OpenAI Embeddings
    )


In [None]:
import hashlib

# PineconeVectorStore is the current class name; the old alias is kept so the
# name LangChainPinecone stays available to any other cell that uses it.
from langchain_pinecone import PineconeVectorStore as LangChainPinecone
from langchain_openai.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

# Content-derived IDs make the upload idempotent: re-running this cell overwrites
# the same vectors instead of adding duplicates under fresh random IDs.
chunk_ids = [
    hashlib.sha256(doc.page_content.encode("utf-8")).hexdigest()
    for doc in docs
]
vectorstore = LangChainPinecone.from_documents(
    docs,
    embeddings,
    index_name=index_name,
    ids=chunk_ids,
)
print("Semantic chunks embedded and uploaded to Pinecone vector index!")


In [14]:
# Expose the vector store as a retriever; no search options are passed, so the library defaults apply.
retriever = vectorstore.as_retriever()


In [15]:
from langchain_openai import ChatOpenAI

# Chat model used to answer questions in the RAG chain.
chat_model_name = "gpt-4.1"  # or "gpt-4o" etc.
llm = ChatOpenAI(
    model_name=chat_model_name,
    openai_api_key=os.environ["OPENAI_API_KEY"],
)


In [22]:
# Import from langchain_core directly instead of the legacy langchain.prompts /
# langchain.schema.* paths, which depend on the installed langchain version.
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
from langchain_core.runnables import RunnableMap, RunnablePassthrough

# Define prompt for the LLM. Doubled braces are literal braces in the template;
# {context} and {question} are the variables filled in by the chain.
prompt = ChatPromptTemplate.from_messages([
    ("system",
     """You are an expert assistant. Using ONLY the provided context, answer the user's question.
     ALWAYS respond in this JSON format:
     {{
       "answer": (string, best answer to the question from context),
       "citations": (array of strings, exact text of the context/chunks used)
     }}

     Context:
     {context}"""),
    ("human", "{question}")
])


def _format_context(retrieved_docs):
    """Join the text of the retrieved chunks, separated by blank lines.

    Without this step the prompt receives the string form of a list of
    Document objects (metadata and repr quoting included), not the chunk text.
    """
    return "\n\n".join(doc.page_content for doc in retrieved_docs)


# Build the chain
rag_chain = (
    RunnableMap({
        "context": retriever | _format_context,
        "question": RunnablePassthrough()
    })  # This builds {"context": <joined chunk text for query>, "question": query}
    | prompt
    | llm
    | JsonOutputParser()  # Parse the model's JSON reply into Python data
)


In [None]:
# Send a sample question through the RAG chain and print the parsed result.
query = "What is a LangGraph or an agent according to the video?"
answer = rag_chain.invoke(query)
print("Answer:", answer)
