In [1]:
#!pip install langchain
#!pip install -U langchain-community

#!pip install llama-index pypdf
#!pip install llama-index --upgrade

# Don't bother me with warnings
import warnings # optional, disabling warnings about versions and others
warnings.filterwarnings('ignore') # optional, disabling warnings about versions and others

In [2]:
# Loading a simple PDF with Langchain, straightforward

from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("docs/War-of-the-Worlds.pdf")
book = loader.load()
#Looking at a small extract, one page, and a few hundred characters in that page
page = book[3]
print(page.page_content[1660:2164])


   The Martians seem to have calculated their descent with amazing subtlety--their 
mathematical learning is evidently far in exce ss of ours--and to ha ve carried out their 
prepara- tions with a well-nigh perfect unanimity. Had our instru- ments permitted it, we might have seen the gathering trouble far back in the nineteenth century. Men like 
Schiaparelli watched the red planet--it is odd, by-the-bye, that for count- less centuries 
Mars has been the star of war--but failed to interpret the flu


In [3]:
# Loading a simple docuemnt with LlamaIndex, also straightforward

from llama_index.core import GPTVectorStoreIndex, Document
from pypdf import PdfReader

# Load a specific PDF file
pdf_path = "docs/War-of-the-Worlds.pdf"

# Use PyPDF to extract the text
reader = PdfReader(pdf_path)
pdf_text = ""
for page in reader.pages:
    pdf_text += page.extract_text()

# Create a LlamaIndex Document object from the extracted text
document = Document(text=pdf_text)

# Check the content of the document
print(document.get_text()[8444:8944])  # Print 500 characters to verify


The Martians seem to have calculated their descent with amazing subtlety--their 
mathematical learning is evidently far in exce ss of ours--and to ha ve carried out their 
prepara- tions with a well-nigh perfect unanimity. Had our instru- ments permitted it, we might have seen the gathering trouble far back in the nineteenth century. Men like 
Schiaparelli watched the red planet--it is odd, by-the-bye, that for count- less centuries 
Mars has been the star of war--but failed to interpret the flu


In [4]:
# loading a video file and saving the audio to a text file, with LangChain

import os
import whisper
from yt_dlp import YoutubeDL

# Step 1: Set up the download options
url = "https://www.youtube.com/watch?v=2vkJ7v0x-Fs"
save_dir = "docs/youtube/"
output_template = os.path.join(save_dir, '%(title)s.%(ext)s')

ydl_opts = {
    'format': 'bestaudio/best',
    'outtmpl': output_template,  # Save the file to the specified directory with a title-based name
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'm4a',  # You can change this to mp3 if you prefer
        'preferredquality': '192',
    }],
    'ffmpeg_location': '/opt/homebrew/bin/ffmpeg',  # Specify the location of ffmpeg
}

# Step 2: Download the audio from the YouTube video
with YoutubeDL(ydl_opts) as ydl:
    ydl.download([url])

# Step 3: Find the downloaded file
downloaded_file = [f for f in os.listdir(save_dir) if f.endswith('.m4a')][0]  # Assuming m4a, adjust if using mp3
downloaded_file_path = os.path.join(save_dir, downloaded_file)

# Step 4: Load the Whisper model and transcribe the audio file
model = whisper.load_model("base")  # You can choose 'tiny', 'base', 'small', 'medium', or 'large'
result = model.transcribe(downloaded_file_path)

# Step 5: Adding metadata to the transcript, and saving the transcript to a file so we can use it outside of this program.
class Document:
    def __init__(self, source, text, metadata=None):
        self.source = source
        self.page_content = text
        self.metadata = metadata or {}

# Step 6: Wrap the transcription result in the Document class with metadata
document = Document(
    source=downloaded_file_path,
    text=result['text'], 
    metadata={"source": "youtube", "file_path": downloaded_file_path}
)
#Step 7: Save the transcript to a text file
transcript_file_path = os.path.join(save_dir, 'transcript_w_Langchain.txt')
with open(transcript_file_path, 'w') as f:
    f.write(result['text'])

# Step 8: Print the first 1000 characters of the transcript
print(document.page_content[:1000])

[youtube] Extracting URL: https://www.youtube.com/watch?v=2vkJ7v0x-Fs
[youtube] 2vkJ7v0x-Fs: Downloading webpage
[youtube] 2vkJ7v0x-Fs: Downloading ios player API JSON
[youtube] 2vkJ7v0x-Fs: Downloading web creator player API JSON
[youtube] 2vkJ7v0x-Fs: Downloading player b0557ce3
[youtube] 2vkJ7v0x-Fs: Downloading m3u8 information
[info] 2vkJ7v0x-Fs: Downloading 1 format(s): 251
[download] Destination: docs/youtube/Big Data Architectures.webm
[download] 100% of   22.03MiB in 00:00:02 at 9.68MiB/s     
[ExtractAudio] Destination: docs/youtube/Big Data Architectures.m4a
Deleting original file docs/youtube/Big Data Architectures.webm (pass -k to keep)
 In lesson four, we will go deeper into architectures for big data, and we will take a closer look at some of the most popular big data management systems. First, we're going to look at how the big data management system framework looks, and explore the commonalities that pretty much all the big data systems have, as well as some of the key

In [5]:
# loading a video file and saving the audio to a text file, with LlamaIndex

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import whisper
from yt_dlp import YoutubeDL
from llama_index.core import GPTVectorStoreIndex, Document as LlamaDocument
from llama_index.core.base.embeddings.base import BaseEmbedding
from sentence_transformers import SentenceTransformer
from pydantic import Field

# Step 1: setup the option to download and transcribe YouTube video (same as before)
url = "https://www.youtube.com/watch?v=2vkJ7v0x-Fs"
save_dir = "docs/youtube/"
output_template = os.path.join(save_dir, '%(title)s.%(ext)s')

ydl_opts = {
    'format': 'bestaudio/best',
    'outtmpl': output_template,
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'm4a',
        'preferredquality': '192',
    }],
    'ffmpeg_location': '/opt/homebrew/bin/ffmpeg',
}

# Step 2: Download the audio
with YoutubeDL(ydl_opts) as ydl:
    ydl.download([url])

# Step 3: Find the downloaded file
downloaded_file = [f for f in os.listdir(save_dir) if f.endswith('.m4a')][0]
downloaded_file_path = os.path.join(save_dir, downloaded_file)

# Step 4: Transcribe the audio file using Whisper
model = whisper.load_model("base")
result = model.transcribe(downloaded_file_path)
transcribed_text = result['text']

# Step 5: Use Hugging Face SentenceTransformer for embedding
hf_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Custom embedding class using Hugging Face model
class HuggingFaceEmbedding(BaseEmbedding):
    hf_model: SentenceTransformer = Field(default=None)

    def __init__(self, hf_model):
        super().__init__()
        self.hf_model = hf_model

    def _get_text_embedding(self, text):
        # Return embedding for a single string of text
        return self.hf_model.encode(text, convert_to_numpy=True)

    def _get_text_embeddings(self, texts):
        # Return embeddings for a list of strings
        return self.hf_model.encode(texts, convert_to_numpy=True)
    
    def _get_query_embedding(self, query):
        # Return embedding for a query
        return self._get_text_embedding(query)
    
    async def _aget_query_embedding(self, query):
        # Async version of embedding for a query
        return self._get_text_embedding(query)

# Adding metadata to the transcript, and saving the transcript to a file
class Document:
    def __init__(self, source, text, metadata=None):
        self.source = source
        self.page_content = text
        self.metadata = metadata or {}

# Step 6: Wrap the transcription result in the Document class with metadata
document = Document(
    source=downloaded_file_path,
    text=transcribed_text, 
    metadata={"source": "youtube", "file_path": downloaded_file_path}
)

# Step 7: Save the transcript to a text file
transcript_file_path = os.path.join(save_dir, 'transcript_w_LlamaIndex.txt')
with open(transcript_file_path, 'w') as f:
    f.write(transcribed_text)

print(f"Transcript saved to {transcript_file_path}")

# Step 8: Use the LlamaIndex embedding model with the Hugging Face embedding model to retrieve and print the text
llama_document = LlamaDocument(text=transcribed_text)
embed_model = HuggingFaceEmbedding(hf_model)
index = GPTVectorStoreIndex([llama_document], embed_model=embed_model)

# Directly use the retriever (no LLM required)
retriever = index.as_retriever()

# Perform a query using the retriever
response = retriever.retrieve("What is the video about?") # this could be any question, as all we do below is retrieve the first 1000 characters of the transcript

# Print the first 1000 characters of the response text
if response:
    shortened_response = response[0].node.text[:1000]  # Get the text from the node and limit to 500 characters
    print(f"Shortened Response: {shortened_response}")
else:
    print("No response retrieved")



[youtube] Extracting URL: https://www.youtube.com/watch?v=2vkJ7v0x-Fs
[youtube] 2vkJ7v0x-Fs: Downloading webpage
[youtube] 2vkJ7v0x-Fs: Downloading ios player API JSON
[youtube] 2vkJ7v0x-Fs: Downloading web creator player API JSON
[youtube] 2vkJ7v0x-Fs: Downloading m3u8 information
[info] 2vkJ7v0x-Fs: Downloading 1 format(s): 251
[download] Destination: docs/youtube/Big Data Architectures.webm
[download] 100% of   22.03MiB in 00:00:02 at 8.91MiB/s     
[ExtractAudio] Destination: docs/youtube/Big Data Architectures.m4a
Deleting original file docs/youtube/Big Data Architectures.webm (pass -k to keep)
Transcript saved to docs/youtube/transcript_w_LlamaIndex.txt
Shortened Response:  In lesson four, we will go deeper into architectures for big data, and we will take a closer look at some of the most popular big data management systems. First, we're going to look at how the big data management system framework looks, and explore the commonalities that pretty much all the big data systems ha

## LlamaIndex vs LangChain
For now, both tools may look the same. One difference can easily be seen in LlamaIndex' ability to use a tree structure. The Tree Index allows you to build a hierarchical structure, where documents are segmented into chunks, and each chunk is stored as a node in a tree. This allows you to retrieve documents while keeping the context (the spot in the hierarchy where the data appeared). By contrast, LangChain operates on a flat structure.

In [6]:
# Example tree index structure with llamaindex

#!pip install llama-index-llms-ollama

from llama_index.core import TreeIndex, SimpleDirectoryReader
from llama_index.llms.ollama import Ollama

# Initialize the Ollama LLM with Llama3
llama_llm = Ollama(model="llama3", timeout=500)

# Load a specific PDF file using SimpleDirectoryReader
pdf_path = "docs/802.11ae-2012_2.pdf"
documents = SimpleDirectoryReader(input_files=[pdf_path]).load_data()

# Create a hierarchical Tree Index using Llama3 from Ollama
tree_index = TreeIndex.from_documents(documents, llm=llama_llm)

# Access the storage context from the tree index
storage_context = tree_index.storage_context

# Function to print the tree structure
def print_tree_structure(node_id, level=0):
    try:
        # Retrieve the node object using the node_id from the storage context
        node = storage_context.docstore.get_document(node_id)  # Use the exact UUID as node_id
        
        # Print the node name/text with indentation
        indent = "  " * level  # Adjust indentation
        print(f"{indent}Node ID: {node_id}")
        print(f"{indent}Text: {node.text[:100]}...")  # Print the first 100 characters of the node text
        
        # Recursively print child nodes if they exist
        if hasattr(node, "child_ids") and node.child_ids:
            for child_id in node.child_ids:
                print_tree_structure(child_id, level + 1)
    except ValueError:
        print(f"Node ID {node_id} not found in storage context")

# Access the actual UUID root node IDs from the Tree Index
root_node_ids = list(tree_index.index_struct.root_nodes.values())

# Print the list of root node IDs to confirm
print("Actual Root Node IDs:", root_node_ids)

# Start printing from each root node using the actual UUIDs
for root_node_id in root_node_ids:
    print_tree_structure(root_node_id)


Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)


Actual Root Node IDs: ['aeee403a-4759-4be9-a38b-abe68affbbe1', '91d226c0-cc04-498e-84dd-7c1fd9ec1580']
Node ID: aeee403a-4759-4be9-a38b-abe68affbbe1
Text: IEEE Std 802.11ae-2012 AMENDMENT 1: PRIORITIZATION OF MANAGEMENT FRAMES
28 Copyright © 2012 IEEE. Al...
Node ID: 91d226c0-cc04-498e-84dd-7c1fd9ec1580
Text: AMENDMENT 1: PRIORITIZATION OF MANAGEMENT FRAMES IEEE Std 802.11ae-2012
Copyright © 2012 IEEE. All r...


Could you do the same with LangChain? No, LangChain has a flat indexing structure. You 'could' use a workaround by sotring in the metadata elements such as the page number, and search the document text for elements that could look like a hierarchical element, like numbers for chapter.


In [7]:
from langchain.document_loaders import PyPDFLoader
import re

# Load the PDF
pdf_loader = PyPDFLoader("docs/802.11ae-2012_2.pdf")
documents = pdf_loader.load()

# Function to extract paragraph indices from the text
def extract_paragraph_indices(text):
    # Regular expression pattern to match paragraph numbers like 9.4.2.1, 9.4.2.2
    paragraph_pattern = re.findall(r'\b\d+(\.\d+)+\b', text)
    return paragraph_pattern if paragraph_pattern else None

# Loop through documents to extract metadata
for i, doc in enumerate(documents):
    text = doc.page_content
    paragraph_indices = extract_paragraph_indices(text)
    page_number = i + 1  # assuming pages are in order in 'documents'

    # Add paragraph indices and page metadata
    if paragraph_indices:
        doc.metadata['paragraph_indices'] = paragraph_indices
    doc.metadata['page_number'] = page_number

    # Print the metadata structure for this document
    print(f"Document {i + 1} metadata: {doc.metadata}")



Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)


Document 1 metadata: {'source': 'docs/802.11ae-2012_2.pdf', 'page': 0, 'paragraph_indices': ['.2', '.1', '.122', '.2'], 'page_number': 1}
Document 2 metadata: {'source': 'docs/802.11ae-2012_2.pdf', 'page': 1, 'paragraph_indices': ['.2', '.3', '.4', '.7', '.3', '.3'], 'page_number': 2}


## More on Splitters
In the previous lesson, we focused on recursive character splitter. Regardless of the framework you use, you need to spend some time understanding, and choosing the best splitter for your use case.

In [8]:
# Comparing 2 libraries 
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

chunk_size =20
chunk_overlap = 5

rsplit = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)
csplit = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

text1 = 'abcdefghijklmnopqrstuvwxyz1234567890'
text2 = 'a b c d e f g h i j k l m n o p q r s t u v w x y z 1 2 3 4 5 6 7 8 9 0'

In [9]:
rsplit.split_text(text1)

['abcdefghijklmnopqrst', 'pqrstuvwxyz123456789', '567890']

In [10]:
rsplit.split_text(text2)

['a b c d e f g h i j',
 'i j k l m n o p q r',
 'q r s t u v w x y z',
 'y z 1 2 3 4 5 6 7 8',
 '7 8 9 0']

In [11]:
# Character splitter does not do anything, because it considers by default the end of paragraph as the separator.
csplit.split_text(text1)

['abcdefghijklmnopqrstuvwxyz1234567890']

In [12]:
csplit.split_text(text2)

['a b c d e f g h i j k l m n o p q r s t u v w x y z 1 2 3 4 5 6 7 8 9 0']

In [13]:
# A longer text, and a more realistic split
chunk_size =700
chunk_overlap = 5

rsplit = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
)
csplit = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

In [14]:
text3 = """ The Martians seem to have calculated their descent with amazing subtlety--their mathematical learning is evidently far in excess of ours--and to have carried out their prepara- tions with a well-nigh perfect unanimity. Had our instru- ments 
permitted it, we might have seen the gathering trouble far back in the nineteenth century. Men like Schiaparelli watched the red planet--it is odd, by-the-bye, that for count- less centuries Mars has been the star of war--but failed to interpret 
the fluctuating appearances of the markings they mapped so well. All that time the Martians must have been getting ready.
During the opposition of 1894 a great light was seen on the illuminated part of the disk, first at the Lick Observatory, then by Perrotin of Nice, and then by other observers. English readers heard of it first in the issue of NATURE dated August 2. I am 
inclined to think that this blaze may have been the casting of the huge gun, in the vast pit sunk into their planet, from which their shots were fired at us. Peculiar markings, as yet unexplained, were seen near the site of that outbreak during the next 
two oppositions.
The storm burst upon us six years ago now. As Mars approached opposition, Lavelle of Java set the wires of the astronomical exchange palpitating with the amazing intelli- gence of a huge outbreak of incandescent gas upon the planet. It had occurred towards 
midnight of the twelfth; and the spectroscope, to which he had at once resorted, indicated a mass of flaming gas, chiefly hydrogen, moving with an enormous velocity towards this earth. This jet of fire had become invisible about a quarter past twelve. He 
compared it to a colossal puff of flame suddenly and violently squirted out of the planet, as flaming gases rushed out of a gun. """

In [15]:
rsplit.split_text(text3)

['The Martians seem to have calculated their descent with amazing subtlety--their mathematical learning is evidently far in excess of ours--and to have carried out their prepara- tions with a well-nigh perfect unanimity. Had our instru- ments \npermitted it, we might have seen the gathering trouble far back in the nineteenth century. Men like Schiaparelli watched the red planet--it is odd, by-the-bye, that for count- less centuries Mars has been the star of war--but failed to interpret \nthe fluctuating appearances of the markings they mapped so well. All that time the Martians must have been getting ready.',
 'During the opposition of 1894 a great light was seen on the illuminated part of the disk, first at the Lick Observatory, then by Perrotin of Nice, and then by other observers. English readers heard of it first in the issue of NATURE dated August 2. I am \ninclined to think that this blaze may have been the casting of the huge gun, in the vast pit sunk into their planet, from whi

In [16]:
chunks=rsplit.split_text(text3)
for i, _ in enumerate(chunks):
    print(f"chunk # {i}, size: {len(chunks[i])}")

chunk # 0, size: 610
chunk # 1, size: 526
chunk # 2, size: 642


In [17]:
print(chunks[0])

The Martians seem to have calculated their descent with amazing subtlety--their mathematical learning is evidently far in excess of ours--and to have carried out their prepara- tions with a well-nigh perfect unanimity. Had our instru- ments 
permitted it, we might have seen the gathering trouble far back in the nineteenth century. Men like Schiaparelli watched the red planet--it is odd, by-the-bye, that for count- less centuries Mars has been the star of war--but failed to interpret 
the fluctuating appearances of the markings they mapped so well. All that time the Martians must have been getting ready.


In [18]:
# What happens if the split is smaller:
chunk_size =300
chunk_overlap = 5
rsplit = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
)

In [19]:
chunks=rsplit.split_text(text3)
for i, _ in enumerate(chunks):
    print(f"chunk # {i}, size: {len(chunks[i])}")

chunk # 0, size: 240
chunk # 1, size: 245
chunk # 2, size: 121
chunk # 3, size: 253
chunk # 4, size: 271
chunk # 5, size: 256
chunk # 6, size: 254
chunk # 7, size: 128


In [20]:
print(chunks[0])

The Martians seem to have calculated their descent with amazing subtlety--their mathematical learning is evidently far in excess of ours--and to have carried out their prepara- tions with a well-nigh perfect unanimity. Had our instru- ments


In [21]:
# What happens if the split is smaller:
chunk_size =100
chunk_overlap = 5
rsplit = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
)

In [22]:
chunks=rsplit.split_text(text3)
for i, _ in enumerate(chunks):
    print(f"chunk # {i}, size: {len(chunks[i])}")

chunk # 0, size: 92
chunk # 1, size: 96
chunk # 2, size: 52
chunk # 3, size: 99
chunk # 4, size: 95
chunk # 5, size: 54
chunk # 6, size: 96
chunk # 7, size: 29
chunk # 8, size: 98
chunk # 9, size: 95
chunk # 10, size: 61
chunk # 11, size: 97
chunk # 12, size: 90
chunk # 13, size: 73
chunk # 14, size: 16
chunk # 15, size: 97
chunk # 16, size: 92
chunk # 17, size: 74
chunk # 18, size: 97
chunk # 19, size: 96
chunk # 20, size: 69
chunk # 21, size: 93
chunk # 22, size: 37


In [23]:
print(chunks[0])

The Martians seem to have calculated their descent with amazing subtlety--their mathematical


In [24]:
# LlamaIndex has different splitters, including sentence splitter, that makes splits based on sentences (period). 

from llama_index.core.node_parser import SentenceSplitter

from llama_index.core.node_parser import SentenceSplitter

# Initialize the SentenceSplitter with your chunk size and overlap
ssplit = SentenceSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

# Apply the splitter on your text
split_texts = ssplit.split_text(text1)

# Display the split chunks
for idx, chunk in enumerate(split_texts):
    print(f"Chunk {idx + 1}:")
    print(chunk)
    print("\n" + "-" * 50 + "\n")



Chunk 1:
abcdefghijklmnopqrstuvwxyz1234567890

--------------------------------------------------



Both frameworks have splitters with similar goals, including for LangChain (beyond CharacterSplitter and RecursiveCharacterSplitter) HTML spliiter (HTMLHeaderTextSplitter, to chop HTNL pages while following the page structure), Code splitter (RecursiveCharacterSplitter with the option e.g. language=language.PYTHON), recursive JSON splitter (RecursiveJSONSplitter), Semantic splitter (SemanticChunker, splits in sentences and uses LLM to try to find semantic structures to group chunks that form cohernet semantic ensembles) and tokens, and for LlamaIndex (beyond SentenceSplitter and TokenTExtSplitter), HTML (HTMLNodeParser), JSON (JSONNodeParser), Markdown (MarkdownNodeParser), Code (CodeSplitter with the option e.g. language="python"), Hierarchical splitting (HierarchicalNodeParser, to attmept the best hierarchy of splits based on semantic meaning) and semantic splitting (SemanticSplitterNodeParser).

## More on Similarity search

The goal of the retrieval phase is to select the most relevant documents. But 'relevant' may mean 'repeating the same most relevant segment', which is suboptimal. 

In [25]:
# deleting leftovers from previous instances, as I run this codebook often
#tempdb.delete_collection()
from langchain.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
# Set the environment variable to disable tokenizers parallelism and avoid warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Let's define a directory where we'll store the database beyond this notebook execution (and let's make sure it is emtpy, as I run this notebook often :))
persist_directory = 'docs/chroma/'
embeddings = OllamaEmbeddings(model="nomic-embed-text")
!rm -rf ./docs/chroma  # remove old database files if any


In [26]:
text4 = [
    """The alien spaceships looked like flying saucers.""",
    """The alien spaceships were round in shape.""",
    """The spaceships were destroying everything.""",
]

In [27]:
tempdb = Chroma.from_texts(text4, embedding=embeddings)

In [28]:
question = "What can you tell me about the alien spaceships?"

In [29]:
tempdb.similarity_search(question, k=2)

[Document(metadata={}, page_content='The alien spaceships were round in shape.'),
 Document(metadata={}, page_content='The alien spaceships looked like flying saucers.')]

Similarity search points to the documents that are closest semantically to the question, which may include a lot of redundant information, and miss some key points, for example that the alien spaceships were destroying everything. Max Marginal Relevance (MMR) search improves Similarity Search by picking the top k as Similarity Search does, but returning the vectors that are farthest from each other (in this top k list), so as to maximize the diversity of information returned.

In [30]:
tempdb.max_marginal_relevance_search(question,k=2, fetch_k=3)

[Document(metadata={}, page_content='The alien spaceships were round in shape.'),
 Document(metadata={}, page_content='The spaceships were destroying everything.')]

## LangChain Chains and Tools

LangChain offers a series of tools that can be called using chains, making the process of using multiple sources for the LLM input data very flexible.

In [31]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser

In [32]:
from langchain.prompts import ChatPromptTemplate
from langchain.llms import Ollama


# Define a prompt template, used to build a prompt from various elements
prompt = ChatPromptTemplate.from_template(
    "tell me a short useful fact about {topic}"
)

# Use Ollama with Llama3 model
model = Ollama(model="llama3")

# Define the output parser, which simply takes the LLM output and displays it as a string
output_parser = StrOutputParser()

In [33]:
# With LangChain you can define a chain of elements, here the prompt (output) we built is redirected into the model (as input), the model output is redirected to the parser, which uses it as input to output... a string of what the model sent 
chain = prompt | model | output_parser

In [34]:
# Let's call the chain
chain.invoke({"topic": "Paris"})

'Here\'s one: Did you know that the famous street performers and artists in Montmartre, a charming neighborhood in Paris, are protected by law? Since 1896, the City of Paris has had an official "Street Performers\' Charter" that guarantees their right to perform and sell their wares without interference. This has helped preserve the unique atmosphere and artistic spirit of this iconic district!'

In [35]:
# The same logic can be used in RAG. Suppose that we have two chunks.

#!pip install docarray
from langchain.embeddings import OllamaEmbeddings
from langchain.vectorstores import DocArrayInMemorySearch

# Initialize the Ollama embedding model
embedding = OllamaEmbeddings(model="nomic-embed-text")

# Create the vector store with two chunks of text (for simplicity. Feel free to load a full pdf or video transcript as we did above in in the previous lessons if you prefer).
vectorstore = DocArrayInMemorySearch.from_texts(
    ["The Martians landed in the UK", "The river that flows in Paris is La Seine"],
    embedding=embedding
)

# Use the vector store as a retriever
retriever = vectorstore.as_retriever()

In [36]:
# Let's see what the retriever does when we ask a simple question - it should retrieve the available chunks, here both chunks irrespective of the question, as we do not run similarity search, just memory search (what is the the memory)
retriever.get_relevant_documents("where did the Martians land?")

  retriever.get_relevant_documents("where did the Martians land?")


[Document(metadata={}, page_content='The Martians landed in the UK'),
 Document(metadata={}, page_content='The river that flows in Paris is La Seine')]

In [37]:
# Now let's create a prompt template as above, that tells the LLM to use the context (i.e. the best chunk) to answer the question
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

In [38]:
# We want to pass the context (the 2 sentences retrieved from the vector store) and the question to the LLM. We use RunnableMap to create a dictionary with 2 elements, the context and the question
from langchain.schema.runnable import RunnableMap
inputs = RunnableMap({
    "context": lambda x: retriever.get_relevant_documents(x["question"]), # the context is retrieved by sending the question to the retriever, as we did manually 2 blocks above; the context is the first part of the dictionary
    "question": lambda x: x["question"] # the second element of the dictionary is the question itself 
})

In [39]:
# Let's see what the dictionary looks like
inputs.invoke({"question": "where did the Martians land?"})

{'context': [Document(metadata={}, page_content='The Martians landed in the UK'),
  Document(metadata={}, page_content='The river that flows in Paris is La Seine')],
 'question': 'where did the Martians land?'}

In [40]:
# We then want to pass the content of the dictionary to the prompt template (making a prompt of it), that is sent to the model, which output is sent to the parser, that makes a string from the answer. 
# Let's define the dictionary again, this time piping it into the prompt, then to the model, then to the parser, and we call the whole chain 'chain'
chain = RunnableMap({
    "context": lambda x: retriever.get_relevant_documents(x["question"]), 
    "question": lambda x: x["question"]  
}) | prompt | model | output_parser 

In [41]:
# What happens when we call the chain?
chain.invoke({"question": "where did the Martians land?"})

'According to the document, the Martians landed in the UK.'