In [1]:
#### Building a RAG application from scratch

In [3]:
# Let's start by loading the environment variables we need to use.

In [5]:
import os
from dotenv import load_dotenv

# Load secrets such as OPENAI_API_KEY from a local `.env` file (kept out of
# version control) instead of hardcoding them in the notebook, where they
# would leak as soon as the notebook is shared or committed.
load_dotenv()

# Fail early with a clear message rather than at the first API call.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to a .env file next to this "
        "notebook or export it in your shell before starting Jupyter."
    )

In [7]:
# Key for the OpenAI API, taken from the process environment (None if unset).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Source video whose transcript is used as the RAG context later on.
YOUTUBE_VIDEO = "https://www.youtube.com/watch?v=cdiD-9MMpb0"

In [9]:
# Setting up the model
# Let's define the LLM model that we'll use as part of the workflow.

In [11]:
from langchain_openai.chat_models import ChatOpenAI

# Chat model shared by every chain in this notebook.
model = ChatOpenAI(
    model="gpt-3.5-turbo",
    openai_api_key=OPENAI_API_KEY,
)

In [12]:
# Baseline without any retrieved context: ask the chat model directly.
# invoke() returns an AIMessage, as the output below shows.
model.invoke("Which cricket team won the 2024 t20 world cup?")

AIMessage(content='I am sorry, but as of now, it is not possible to predict the winner of the 2024 T20 World Cup as it is a future event.', response_metadata={'token_usage': {'completion_tokens': 33, 'prompt_tokens': 20, 'total_tokens': 53}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-417bb246-9a1a-45ca-9695-842125d9b052-0', usage_metadata={'input_tokens': 20, 'output_tokens': 33, 'total_tokens': 53})

In [13]:
# A second direct query; the answer comes back wrapped in an AIMessage
# together with token-usage metadata (see the output below).
model.invoke("list cricket teams that has the most world cups?")

AIMessage(content='1. Australia - 5 World Cups (1987, 1999, 2003, 2007, 2015)\n2. India - 2 World Cups (1983, 2011)\n3. West Indies - 2 World Cups (1975, 1979)\n4. Pakistan - 1 World Cup (1992)\n5. Sri Lanka - 1 World Cup (1996)\n6. England - 1 World Cup (2019)', response_metadata={'token_usage': {'completion_tokens': 98, 'prompt_tokens': 17, 'total_tokens': 115}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-868d5ec5-9225-4662-b1b0-0c22ee2cc141-0', usage_metadata={'input_tokens': 17, 'output_tokens': 98, 'total_tokens': 115})

In [16]:
# Here is what chaining the model with an output parser looks like:

In [17]:
from langchain_core.output_parsers import StrOutputParser

# The parser turns the model's AIMessage into a plain string
# (compare the output below with the AIMessage outputs above).
parser = StrOutputParser()

# `|` pipes the model's output into the parser.
chain = model | parser
chain.invoke("list cricket teams that has the most world cups?")

'1. Australia - 5 World Cups (1987, 1999, 2003, 2007, 2015)\n2. West Indies - 2 World Cups (1975, 1979)\n3. India - 2 World Cups (1983, 2011)\n4. Pakistan - 1 World Cup (1992)\n5. Sri Lanka - 1 World Cup (1996)\n6. England - 1 World Cup (2019)'

In [21]:
# Introducing prompt templates
# We want to provide the model with some context and the question. 
# Prompt templates are a simple way to define and reuse prompts.

In [23]:
# Import from langchain_core (already used elsewhere in this notebook): it is
# the canonical home of ChatPromptTemplate, whereas `langchain.prompts` is a
# legacy re-export path.
from langchain_core.prompts import ChatPromptTemplate

# Prompt with two placeholders, {context} and {question}. The instruction to
# reply "I don't know" keeps the model from answering outside the context.
template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# Render the template with sample values to inspect the final prompt text.
prompt.format(context="Smruthy's sister is Swetha", question="Who is Smruthy's sister?")

'Human: \nAnswer the question based on the context below. If you can\'t \nanswer the question, reply "I don\'t know".\n\nContext: Smruthy\'s sister is Swetha\n\nQuestion: Who is Smruthy\'s sister?\n'

In [25]:
# We can now chain the prompt with the model and the output parser.

In [27]:
# Full pipeline: fill the template -> call the model -> extract the text.
chain = prompt | model | parser

# The input dict must provide both template variables, "context" and "question".
chain.invoke({
    "context": "Smruthy's sister is Swetha",
    "question": "Who is Smruthy's sister?"
})

'Swetha'

In [28]:
# Combining chains
# We can combine different chains to create more complex workflows. 
# For example, let's create a second chain that translates the answer from the first chain into a different language.
# Let's start by creating a new prompt template for the translation chain:

In [31]:
# Second prompt: takes an {answer} and the target {language} to translate it into.
translation_prompt = ChatPromptTemplate.from_template(
    "Translate {answer} to {language}"
)

In [33]:
from operator import itemgetter

# The leading dict builds the two inputs of translation_prompt from the
# incoming dict: "answer" is produced by running the question-answering
# `chain` on it, and "language" is picked out of it with itemgetter.
translation_chain = (
    {"answer": chain, "language": itemgetter("language")} | translation_prompt | model | parser
)

# One input dict feeds both stages: "context"/"question" are consumed by
# `chain`, "language" by the translation prompt.
translation_chain.invoke(
    {
        "context": "Smruthy's sister is Swetha. She doesn't have any more siblings.",
        "question": "How many sisters does Smruthy have?",
        "language": "Spanish",
    }
)

'Smruthy tiene una hermana.'

In [35]:
# Okay! Let's get real - let's dive into the project.

In [37]:
# Transcribing the YouTube Video
# The context we want to send the model comes from a YouTube video. Let's download the video and transcribe it using OpenAI's Whisper.

In [39]:
# The code below did not work for me :( (but I would still suggest giving it a try).

In [41]:
# import tempfile
# import whisper
# from pytube import YouTube

# # Let's do this only if we haven't created the transcription file yet.
# if not os.path.exists("transcription.txt"):
#     youtube = YouTube(YOUTUBE_VIDEO)
#     audio = youtube.streams.filter(only_audio=True).first()

#     # Let's load the base model. This is not the most accurate
#     # model but it's fast.
#     whisper_model = whisper.load_model("base")

#     with tempfile.TemporaryDirectory() as tmpdir:
#         file = audio.download(output_path=tmpdir)
#         transcription = whisper_model.transcribe(file, fp16=False)["text"].strip()

#         with open("transcription.txt", "w") as file:
#             file.write(transcription)

In [43]:
# pip install yt-dlp

In [47]:
# Please run the command below: it is needed to download the audio file for transcription.
# ---- I have already run it, so I am not re-running it here ----

In [49]:
# !yt-dlp -f "bestvideo[height<=1080][ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best" "https://www.youtube.com/watch?v=cdiD-9MMpb0"

In [None]:
# Drop any stale `ffmpeg` environment variable; do nothing when it is absent.
os.environ.pop('ffmpeg', None)

In [None]:
# Set ffmpeg environment using os.environ

In [None]:
# Directory that contains the ffmpeg binary (a Homebrew install here);
# adjust this to wherever ffmpeg lives on your machine.
ffmpeg = r"/opt/homebrew/Cellar/ffmpeg/7.0.1/bin"

# Append the directory to PATH only once, so re-running this cell does not keep
# growing PATH, and tolerate an environment where PATH is not set at all.
current_path = os.environ.get("PATH", "")
if ffmpeg not in current_path.split(os.pathsep):
    os.environ["PATH"] = current_path + os.pathsep + ffmpeg if current_path else ffmpeg

In [None]:
# This worked for me instead of the pytube package

In [None]:
import os
import whisper

# Path to your downloaded audio file
audio_file = "/Users/hariev/Andrej Karpathy： Tesla AI, Self-Driving, Optimus, Aliens, and AGI ｜ Lex Fridman Podcast #333 [cdiD-9MMpb0].f140.m4a"

# Transcription is slow, so only run Whisper when the transcript has not been
# created yet. Delete transcription.txt to force a fresh transcription.
if not os.path.exists("transcription.txt"):
    # Use a dedicated name: rebinding `model` here would replace the ChatOpenAI
    # instance that the chains built later in this notebook rely on.
    # "base" is the same Whisper model the pytube variant above loads.
    whisper_model = whisper.load_model("base")

    # Transcribe the audio
    result = whisper_model.transcribe(audio_file)

    # Write the transcription to a file
    with open("transcription.txt", "w", encoding="utf-8") as file:
        file.write(result["text"])

    print("Transcription complete. Output saved to 'transcription.txt'")

In [51]:
# Read the transcript back with the encoding it was written with (UTF-8), so
# the result does not depend on the platform's default encoding.
with open("transcription.txt", encoding="utf-8") as file:
    transcription = file.read()

# Preview the first 100 characters.
transcription[:100]

" I think it's possible that physics has exploits and we should be trying to find them. arranging som"

In [53]:
# Using the entire transcription as context
# If we try to invoke the chain using the transcription as context, 
# the model will return an error because the context is too long.

# Large Language Models support limited context sizes.
# The video we are using is too long for the model to handle, so we need to find a different solution.

In [55]:
# The whole transcript is sent as context here. The request is expected to be
# rejected (see the error printed below), so catch the exception and show its
# message instead of letting the cell fail.
try:
    oversized_request = {
        "context": transcription,
        "question": "Is reading papers a good idea?",
    }
    chain.invoke(oversized_request)
except Exception as error:
    print(error)

Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, your messages resulted in 43788 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}


In [57]:
# Splitting the transcription
# Since we can't use the entire transcription as the context for the model, 
#    a potential solution is to split the transcription into smaller chunks. 
#    We can then invoke the model using only the relevant chunks to answer a particular question:

In [59]:
from langchain_community.document_loaders import TextLoader

# Load the transcript explicitly as UTF-8 (the encoding it was written with)
# instead of relying on the platform's default encoding.
loader = TextLoader("transcription.txt", encoding="utf-8")
text_documents = loader.load()
text_documents

[Document(metadata={'source': 'transcription.txt'}, page_content=" I think it's possible that physics has exploits and we should be trying to find them. arranging some kind of a crazy quantum mechanical system that somehow gives you buffer overflow, somehow gives you a rounding error in the floating point. Synthetic intelligences are kind of like the next stage of development. And I don't know where it leads to. Like at some point, I suspect the universe is some kind of a puzzle. These synthetic AIs will uncover that puzzle and solve it. The following is a conversation with Andre Kappathi, previously the director of AI at Tesla. And before that, at OpenAI and Stanford, he is one of the greatest scientist engineers and educators in the history of artificial intelligence. This is the Lex Friedman podcast to support it. Please check out our sponsors and now to your friends. Here's Andre Kappathi. What is a neural network and what does it seem to do such a surprisingly good job of learning

In [61]:
# There are many different ways to split a document. 
# For this example, we'll use a simple splitter that splits the document into chunks of a fixed size. 
# Check Text Splitters for more information about different approaches to splitting documents.

# For illustration purposes, let's split the transcription into chunks of 100 characters with an overlap of 20 characters and display the first few chunks:

In [63]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Deliberately tiny chunks (100 characters, 20 of them overlapping) so the
# effect of splitting is easy to see in the first five results.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=100,
)
text_splitter.split_documents(text_documents)[:5]

[Document(metadata={'source': 'transcription.txt'}, page_content="I think it's possible that physics has exploits and we should be trying to find them. arranging"),
 Document(metadata={'source': 'transcription.txt'}, page_content='them. arranging some kind of a crazy quantum mechanical system that somehow gives you buffer'),
 Document(metadata={'source': 'transcription.txt'}, page_content='gives you buffer overflow, somehow gives you a rounding error in the floating point. Synthetic'),
 Document(metadata={'source': 'transcription.txt'}, page_content="point. Synthetic intelligences are kind of like the next stage of development. And I don't know"),
 Document(metadata={'source': 'transcription.txt'}, page_content="And I don't know where it leads to. Like at some point, I suspect the universe is some kind of a")]

In [65]:
# Chunking used for the rest of the notebook: 1000-character chunks with a
# 20-character overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=1000,
)
documents = text_splitter.split_documents(text_documents)

In [67]:
# Finding the relevant chunks
# Given a particular question, we need to find the relevant chunks from the transcription to send to the model. Here is where the idea of embeddings comes into play.

# An embedding is a mathematical representation of the semantic meaning of a word, sentence, or document. It's a projection of a concept in a high-dimensional space. Embeddings have a simple characteristic: the projections of related concepts will be close to each other, while concepts with different meanings will lie far apart. You can use Cohere's Embed Playground to visualize embeddings in two dimensions.

# To provide the model with the most relevant chunks, we can use the embeddings of the question and of the transcription chunks to compute the similarity between them. We can then select the chunks with the highest similarity to the question and use them as the context for the model:

In [69]:
## Let's generate embeddings for an arbitrary query:

In [71]:
from langchain_openai.embeddings import OpenAIEmbeddings

# No key is passed explicitly here.
# NOTE(review): presumably the client reads OPENAI_API_KEY from the environment - confirm.
embeddings = OpenAIEmbeddings()

# embed_query returns the embedding as a list of floats (see the output below).
embedded_query = embeddings.embed_query("Who is Smruthy's sister?")

# Show the vector's dimensionality and its first ten components.
print(f"Embedding length: {len(embedded_query)}")
print(embedded_query[:10])

Embedding length: 1536
[0.009383711963891983, -0.006260295864194632, -0.003735983045771718, -0.0025916278827935457, -0.016357546672225, 0.03387291356921196, -0.013213936239480972, 0.006462240591645241, -0.004166799131780863, -0.024556515738368034]


In [73]:
# To illustrate how embeddings work, let's first generate the embeddings for two different sentences:

In [75]:
# sentence1 is on the same topic as the query embedded earlier
# ("Who is Smruthy's sister?"); sentence2 is about something else.
sentence1 = embeddings.embed_query("Smruthy's sister is Swetha")
sentence2 = embeddings.embed_query("Shriya's mother is a teacher")

In [77]:
# We can now compute the similarity between the query and each of the two sentences. 
# The closer the embeddings are, the more similar the sentences will be.

# We can use Cosine Similarity to calculate the similarity between the query and each of the sentences:

In [79]:
from sklearn.metrics.pairwise import cosine_similarity

# Each vector is wrapped in a list to form a one-row input, and [0][0]
# extracts the single similarity value from the returned matrix.
query_sentence1_similarity = cosine_similarity([embedded_query], [sentence1])[0][0]
query_sentence2_similarity = cosine_similarity([embedded_query], [sentence2])[0][0]

# A higher value means the sentence is closer in meaning to the query.
query_sentence1_similarity, query_sentence2_similarity

(0.9503050278590504, 0.8173095674282653)

In [81]:
# Setting up a Vector Store
# We need an efficient way to store document chunks, their embeddings, and perform similarity searches at scale. To do this, we'll use a vector store.

# A vector store is a database of embeddings that specializes in fast similarity searches.

In [83]:
# To understand how a vector store works, let's create one in memory and add a few embeddings to it:

In [85]:
# Note: I am using Anaconda Python, and `DocArrayInMemorySearch` did not work for me there.

In [87]:
# from langchain_community.vectorstores import DocArrayInMemorySearch

# vectorstore1 = DocArrayInMemorySearch.from_texts(
#     [
#         "Smruthy's sister is Swetha",
#         "John and Tommy are brothers",
#         "Patricia likes white cars",
#         "Shriya's mother is a teacher",
#         "Hari drives an Baleno",
#         "Mary has two siblings",
#     ],
#     embedding=embeddings,
# )

In [89]:
# FAISS worked for me instead.
# It requires: pip install faiss-cpu
# The previous code was throwing a tensorflow._api error (but do try the version above too).

In [91]:
from langchain_community.vectorstores import FAISS

# A handful of unrelated toy sentences to demonstrate similarity search.
sample_sentences = [
    "Smruthy's sister is Swetha",
    "John and Tommy are brothers",
    "Patricia likes white cars",
    "Shriya's mother is a teacher",
    "Hari drives an Baleno",
    "Smru has two siblings",
]

# Embed the sentences and index them in a FAISS vector store.
vectorstore1 = FAISS.from_texts(sample_sentences, embedding=embeddings)

In [92]:
# Return the 3 nearest sentences together with their scores. In the output
# below the best match has the smallest score, i.e. the score behaves like a
# distance (lower = more similar).
vectorstore1.similarity_search_with_score(query="Who is Smruthy's sister?", k=3)

[(Document(page_content="Smruthy's sister is Swetha"), 0.09938994),
 (Document(page_content='Smru has two siblings'), 0.29085883),
 (Document(page_content="Shriya's mother is a teacher"), 0.36538088)]

In [93]:
# Connecting the vector store to the chain
# We can use the vector store to find the most relevant chunks from the transcription to send to the model. Here is how we can connect the vector store to the chain:

In [94]:
# We need to configure a Retriever. 
# The retriever will run a similarity search in the vector store and return the most similar documents back to the next step in the chain.
# We can get a retriever directly from the vector store we created before:

In [99]:
# Wrap the vector store in a retriever so it can be used as a chain step.
retriever1 = vectorstore1.as_retriever()

# Invoking it with a question returns the closest documents
# (four of them with the settings used here, see the output below).
retriever1.invoke("Who is Smruthy's sister?")

[Document(page_content="Smruthy's sister is Swetha"),
 Document(page_content='Smru has two siblings'),
 Document(page_content="Shriya's mother is a teacher"),
 Document(page_content='John and Tommy are brothers')]

In [101]:
# Our prompt expects two parameters, "context" and "question." 
# We can use the retriever to find the chunks we'll use as the context to answer the question.

# We can create a map with the two inputs by using the RunnableParallel and RunnablePassthrough classes. This will allow us to pass the context and question to the prompt as a map with the keys "context" and "question."

In [103]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

# Build the prompt's input map from a single question string:
#   "context"  <- documents the retriever finds for the question
#   "question" <- the question itself, passed through unchanged
setup = RunnableParallel(context=retriever1, question=RunnablePassthrough())
setup.invoke("What color is Patricia's car?")

{'context': [Document(page_content='Patricia likes white cars'),
  Document(page_content='Hari drives an Baleno'),
  Document(page_content='John and Tommy are brothers'),
  Document(page_content="Shriya's mother is a teacher")],
 'question': "What color is Patricia's car?"}

In [105]:
# Let's now add the setup map to the chain and run it:

In [107]:
# Retrieval-augmented chain: look up context -> fill the prompt -> call the
# model -> extract the text. The chain now takes just the question string.
chain = setup | prompt | model | parser
chain.invoke("What color is Patricia's car?")

'White'

In [108]:
# Another question, answered from a different sentence in the toy vector store.
chain.invoke("What car does Hari drive?")

'Hari drives a Baleno.'

In [109]:
# Loading transcription into the vector store
# We initialized the vector store with a few random strings. Let's create a new vector store using the chunks from the video transcription.

In [115]:
# Index the 1000-character transcript chunks (`documents`) in a second FAISS
# store, embedding them with the same `embeddings` object as before.
vectorstore2 = FAISS.from_documents(documents, embeddings)

In [117]:
# Same pipeline as before, but the context now comes from the transcript
# vector store. The plain dict plays the role RunnableParallel played earlier.
chain = (
    {"context": vectorstore2.as_retriever(), "question": RunnablePassthrough()}
    | prompt
    | model
    | parser
)
chain.invoke("What is synthetic intelligence?")

'Synthetic intelligence is described as the next stage of development and is believed to uncover and solve the puzzle of the universe.'

In [None]:
# Setting up Pinecone
# So far we've used an in-memory vector store. In practice, we need a vector store that can handle large amounts of data and perform similarity searches at scale. For this example, we'll use Pinecone.

# The first step is to create a Pinecone account, set up an index, get an API key, and set it as an environment variable PINECONE_API_KEY.
# Then, we can load the transcription documents into Pinecone:

In [119]:
# Take PINECONE_API_KEY from the environment / local .env file instead of
# hardcoding it in the notebook, where it would leak once the notebook is shared.
# load_dotenv() does not override variables that are already set.
load_dotenv()

# Fail early with a clear message rather than at the first Pinecone call.
if not os.getenv("PINECONE_API_KEY"):
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Add it to a .env file next to this "
        "notebook or export it in your shell before starting Jupyter."
    )

In [121]:
from langchain_pinecone import PineconeVectorStore

# Name of the Pinecone index; it has to be created in the Pinecone account
# beforehand (see the setup notes above).
index_name = "youtube-rag-index"

# Load the transcript chunks into the Pinecone index, using the same
# `embeddings` object as for the FAISS stores.
# NOTE(review): re-running this cell presumably uploads the chunks again -
# confirm whether that creates duplicates in the index.
pinecone = PineconeVectorStore.from_documents(
    documents, embeddings, index_name=index_name
)

In [122]:
# Display the vector store object to confirm it was created.
pinecone

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x176aa6210>

In [125]:
# An off-topic query still returns the nearest chunks (see the output below),
# whether or not they are relevant. [:3] keeps only the first three results.
pinecone.similarity_search("explain me something about masala dosa?")[:3]

[Document(metadata={'source': 'transcription.txt'}, page_content="do you do take in this diet of like what the hell is happening in the world first. I am I do find it interesting to know about the world I don't know that it's useful or good but it is part of my routine right now so I do read through a bunch of news articles and I want to be informed and I'm suspicious of it I'm suspicious of the practice but currently that's where I am. Oh you mean suspicious about the positive effect of that practice on your productivity and your well-being is my well-being psychologically and also on your ability to deeply understand the world because there's a bunch of sources of information you're not really focused on deeply integrating that slowly distracting you're yeah in terms of a perfectly productive day for how long of a stretch of time in one session do you try to work and focus anything a couple hours is it one hours at 30 minutes is 10 minutes I can probably go like a small few hours and

In [127]:
# Same retrieval-augmented pipeline, now with Pinecone as the retriever
# that supplies the context.
chain = (
    {"context": pinecone.as_retriever(), "question": RunnablePassthrough()}
    | prompt
    | model
    | parser
)

chain.invoke("What is Hollywood going to start doing?")

'Hollywood is going to start making movies like Avatar for under a million dollars, potentially generated completely automatedly.'

In [129]:
# The Pinecone-backed chain was already built in the previous cell; reuse it
# instead of redefining an identical pipeline.
# This question is about something the transcript does not cover, so the
# prompt's fallback answer is expected (see the output below).
chain.invoke("How big is Anant Amabai wedding?")

"I don't know."

In [131]:
# So, if I load news and articles about the Anant Ambani wedding into Pinecone as embeddings, the chain will most likely be able to answer the question above.

In [133]:
# Cheers :)