## Semantic Retrievers

In [1]:
import chromadb
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain_community.document_loaders import WikipediaLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma

In [2]:
chunk_size = 400
chunk_overlap = 100

# loading environment variables
#load_dotenv()

import os
with open('../../openai_api_key.txt') as f:
    api_key = f.read()
os.environ['OPENAI_API_KEY'] = api_key

# loading chat model
chat = ChatOpenAI()

# loading data
loader = WikipediaLoader(query="Steve Jobs", load_max_docs=5)
documents = loader.load()

# text splitting
text_splitter = RecursiveCharacterTextSplitter(chunk_size = chunk_size, chunk_overlap = chunk_overlap)
docs = text_splitter.split_documents(documents=documents)

# embedding function
embedding_function = HuggingFaceBgeEmbeddings(
    model_name = "BAAI/bge-large-en-v1.5",
    model_kwargs = {'device':'cpu'},
    encode_kwargs = {'normalize_embeddings':True}
)

# vector store
db = Chroma.from_documents(docs, embedding_function, persist_directory="output/steve_jobs.db")

  warn_deprecated(


  lis = BeautifulSoup(html).find_all('li')
  from .autonotebook import tqdm as notebook_tqdm


### MultiQuery Retriever

### Uses LLM to generate multiple queries for a given user input query.

In [3]:
from langchain.retrievers.multi_query import MultiQueryRetriever
mq_retriever = MultiQueryRetriever.from_llm(retriever = db.as_retriever(), llm = chat)

In [4]:
# Set logging for the queries
import logging
logging.basicConfig()
logging.getLogger('langchain.retrievers.multi_query').setLevel(logging.INFO)

In [5]:
query = "When was Steve Jobs fired from Apple?"
retrieved_docs = mq_retriever.get_relevant_documents(query=query)
retrieved_docs

INFO:langchain.retrievers.multi_query:Generated queries: ["1. What was the date of Steve Jobs' departure from Apple due to being fired?", '2. At what time was Steve Jobs terminated from his position at Apple?', '3. Can you provide information on the specific moment when Steve Jobs was ousted from Apple?']


[Document(page_content="In 1985, Jobs departed Apple after a long power struggle with the company's board and its then-CEO, John Sculley. That same year, Jobs took some Apple employees with him to found NeXT, a computer platform development company that specialized in computers for higher-education and business markets, serving as its CEO. In 1986, he helped develop the visual effects industry by funding the computer", metadata={'source': 'https://en.wikipedia.org/wiki/Steve_Jobs', 'summary': 'Steven Paul Jobs (February 24, 1955 – October 5, 2011) was an American businessman, inventor, and investor best known for co-founding the technology giant Apple Inc. Jobs was also the founder of NeXT and chairman and majority shareholder of Pixar. He was a pioneer of the personal computer revolution of the 1970s and 1980s, along with his early business partner and fellow Apple co-founder Steve Wozniak.\nJobs was born in San Francisco in 1955 and adopted shortly afterwards. He attended Reed Colleg

In [6]:
['1985' in doc.page_content for doc in retrieved_docs]

[True, True, False, False, False, False]

In [7]:
print(retrieved_docs[5].page_content)



### Contextual compression

#### Main idea is to compress the document based on the context of the query.
#### Retriver -> Documents -> Document Compressor -> Result

In [8]:
# Helper function for printing docs


def pretty_print_docs(docs):
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]
        )
    )

In [9]:
pretty_print_docs(docs)

Document 1:

Steven Paul Jobs (February 24, 1955 – October 5, 2011) was an American businessman, inventor, and investor best known for co-founding the technology giant Apple Inc. Jobs was also the founder of NeXT and chairman and majority shareholder of Pixar. He was a pioneer of the personal computer revolution of the 1970s and 1980s, along with his early business partner and fellow Apple co-founder Steve
----------------------------------------------------------------------------------------------------
Document 2:

of the 1970s and 1980s, along with his early business partner and fellow Apple co-founder Steve Wozniak.
----------------------------------------------------------------------------------------------------
Document 3:

Jobs was born in San Francisco in 1955 and adopted shortly afterwards. He attended Reed College in 1972 before withdrawing that same year. In 1974, he traveled through India, seeking enlightenment before later studying Zen Buddhism. He and Wozniak co-founde

In [10]:
query = "When was Steve Jobs fired from Apple?"
retriever = db.as_retriever()
sim_docs = retriever.get_relevant_documents(query=query)

pretty_print_docs(sim_docs)

Document 1:

In 1985, Jobs departed Apple after a long power struggle with the company's board and its then-CEO, John Sculley. That same year, Jobs took some Apple employees with him to found NeXT, a computer platform development company that specialized in computers for higher-education and business markets, serving as its CEO. In 1986, he helped develop the visual effects industry by funding the computer
----------------------------------------------------------------------------------------------------
Document 2:

In 1985, Jobs departed Apple after a long power struggle with the company's board and its then-CEO, John Sculley. That same year, Jobs took some Apple employees with him to found NeXT, a computer platform development company that specialized in computers for higher-education and business markets, serving as its CEO. In 1986, he helped develop the visual effects industry by funding the computer
-------------------------------------------------------------------------------

In [11]:
chat = ChatOpenAI(temperature=0)

#### LLMChainExtractor

In [12]:
# Document Compressor
# LLMChainExtractor - Iterates over the intially returned documents and extract from each, only the content relevant to the query.

from langchain.retrievers.document_compressors import LLMChainExtractor
compressor = LLMChainExtractor.from_llm(chat)

print(compressor.llm_chain.prompt.template)

Given the following question and context, extract any part of the context *AS IS* that is relevant to answer the question. If none of the context is relevant return NO_OUTPUT. 

Remember, *DO NOT* edit the extracted parts of the context.

> Question: {question}
> Context:
>>>
{context}
>>>
Extracted relevant parts:


In [13]:
# Compression Retriever
from langchain.retrievers import ContextualCompressionRetriever
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)
compression_retriever

ContextualCompressionRetriever(base_compressor=LLMChainExtractor(llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['context', 'question'], output_parser=NoOutputParser(), template='Given the following question and context, extract any part of the context *AS IS* that is relevant to answer the question. If none of the context is relevant return NO_OUTPUT. \n\nRemember, *DO NOT* edit the extracted parts of the context.\n\n> Question: {question}\n> Context:\n>>>\n{context}\n>>>\nExtracted relevant parts:'), llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x000002932C9C5270>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x000002932C9C5E40>, temperature=0.0, openai_api_key='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', openai_proxy='')), get_input=<function default_get_input at 0x000002932A1B1120>), base_retriever=VectorStoreRetriever(tags=['Chroma', 'HuggingFaceBgeEmbeddings'], vectorstore=<langchain_community.vec

In [14]:
compressed_docs = compression_retriever.get_relevant_documents(query = query)
print(compressed_docs)



[Document(page_content="In 1985, Jobs departed Apple after a long power struggle with the company's board and its then-CEO, John Sculley.", metadata={'source': 'https://en.wikipedia.org/wiki/Steve_Jobs', 'summary': 'Steven Paul Jobs (February 24, 1955 – October 5, 2011) was an American businessman, inventor, and investor best known for co-founding the technology giant Apple Inc. Jobs was also the founder of NeXT and chairman and majority shareholder of Pixar. He was a pioneer of the personal computer revolution of the 1970s and 1980s, along with his early business partner and fellow Apple co-founder Steve Wozniak.\nJobs was born in San Francisco in 1955 and adopted shortly afterwards. He attended Reed College in 1972 before withdrawing that same year. In 1974, he traveled through India, seeking enlightenment before later studying Zen Buddhism. He and Wozniak co-founded Apple in 1976 to further develop and sell Wozniak\'s Apple I personal computer. Together, the duo gained fame and weal

In [15]:
print(compressed_docs[0].page_content)

In 1985, Jobs departed Apple after a long power struggle with the company's board and its then-CEO, John Sculley.


#### LLMChainFilter

In [16]:
# Document Compressor
# LLMChainFilter - Uses LLM Chain to decide which of the initially retrieved documents to filter out and which ones to return

from langchain.retrievers.document_compressors import LLMChainFilter
compressor = LLMChainFilter.from_llm(chat)

print(compressor.llm_chain.prompt.template)

Given the following question and context, return YES if the context is relevant to the question and NO if it isn't.

> Question: {question}
> Context:
>>>
{context}
>>>
> Relevant (YES / NO):


In [17]:
from langchain.retrievers import ContextualCompressionRetriever
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)

compressed_docs = compression_retriever.get_relevant_documents(query = query)
pretty_print_docs(compressed_docs)



Document 1:

In 1985, Jobs departed Apple after a long power struggle with the company's board and its then-CEO, John Sculley. That same year, Jobs took some Apple employees with him to found NeXT, a computer platform development company that specialized in computers for higher-education and business markets, serving as its CEO. In 1986, he helped develop the visual effects industry by funding the computer
----------------------------------------------------------------------------------------------------
Document 2:

In 1985, Jobs departed Apple after a long power struggle with the company's board and its then-CEO, John Sculley. That same year, Jobs took some Apple employees with him to found NeXT, a computer platform development company that specialized in computers for higher-education and business markets, serving as its CEO. In 1986, he helped develop the visual effects industry by funding the computer
-------------------------------------------------------------------------------

In [18]:
print(compressed_docs[0].page_content)

In 1985, Jobs departed Apple after a long power struggle with the company's board and its then-CEO, John Sculley. That same year, Jobs took some Apple employees with him to found NeXT, a computer platform development company that specialized in computers for higher-education and business markets, serving as its CEO. In 1986, he helped develop the visual effects industry by funding the computer


#### EmbeddingsFilter

In [19]:
# It embeds the documents and query and only returns documents which have sufficiently similar embeddings to query. 
from langchain.retrievers.document_compressors import EmbeddingsFilter
embeddings_filter  = EmbeddingsFilter(embeddings=embedding_function, similarity_threshold=0.6)

In [20]:
compression_retriever = ContextualCompressionRetriever(base_compressor=embeddings_filter, base_retriever=retriever)

compressed_docs = compression_retriever.get_relevant_documents(query = query)
pretty_print_docs(compressed_docs)

Document 1:

In 1985, Jobs departed Apple after a long power struggle with the company's board and its then-CEO, John Sculley. That same year, Jobs took some Apple employees with him to found NeXT, a computer platform development company that specialized in computers for higher-education and business markets, serving as its CEO. In 1986, he helped develop the visual effects industry by funding the computer
----------------------------------------------------------------------------------------------------
Document 2:

In 1985, Jobs departed Apple after a long power struggle with the company's board and its then-CEO, John Sculley. That same year, Jobs took some Apple employees with him to found NeXT, a computer platform development company that specialized in computers for higher-education and business markets, serving as its CEO. In 1986, he helped develop the visual effects industry by funding the computer
-------------------------------------------------------------------------------

In [21]:
print(compressed_docs[0].page_content)

In 1985, Jobs departed Apple after a long power struggle with the company's board and its then-CEO, John Sculley. That same year, Jobs took some Apple employees with him to found NeXT, a computer platform development company that specialized in computers for higher-education and business markets, serving as its CEO. In 1986, he helped develop the visual effects industry by funding the computer


### Parent Document Retriever
#### Splits documents for retrieval -> fetches small chunks -> looks up to parent ids for those chunks -> returns larger documents
#### Parent Document -> Document that a small chunk originated from

In [22]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

In [23]:
parent_splitter = CharacterTextSplitter(separator="\n\n", chunk_size=1000, chunk_overlap=100)
child_splitter = CharacterTextSplitter(separator="\n", chunk_size=200, chunk_overlap=50)

store = InMemoryStore() # parent documents

In [24]:
par_doc_retriever = ParentDocumentRetriever(vectorstore=db, docstore=store, child_splitter=child_splitter, parent_splitter=parent_splitter)

In [25]:
par_doc_retriever.add_documents(docs)

In [26]:
par_doc_retriever.get_relevant_documents(query=query)

[Document(page_content="In 1985, Jobs departed Apple after a long power struggle with the company's board and its then-CEO, John Sculley. That same year, Jobs took some Apple employees with him to found NeXT, a computer platform development company that specialized in computers for higher-education and business markets, serving as its CEO. In 1986, he helped develop the visual effects industry by funding the computer", metadata={'title': 'Steve Jobs', 'summary': 'Steven Paul Jobs (February 24, 1955 – October 5, 2011) was an American businessman, inventor, and investor best known for co-founding the technology giant Apple Inc. Jobs was also the founder of NeXT and chairman and majority shareholder of Pixar. He was a pioneer of the personal computer revolution of the 1970s and 1980s, along with his early business partner and fellow Apple co-founder Steve Wozniak.\nJobs was born in San Francisco in 1955 and adopted shortly afterwards. He attended Reed College in 1972 before withdrawing th

### Time Weighted Vector Store Retriever

This retriever uses a combination of semantic similarity and a time decay.

The algorithm for scoring them is:

semantic_similarity + (1.0 - decay_rate) ^ hours_passed, hours passed = hours passed since object in the retriever was last accessed

In [28]:
import faiss
from langchain.vectorstores import FAISS
from langchain.docstore import InMemoryDocstore
from langchain.retrievers import TimeWeightedVectorStoreRetriever
from langchain_community.embeddings import FakeEmbeddings
from langchain_core.documents import Document


In [35]:

# low decay rate
embedding_function = FakeEmbeddings(size=300)
emb_size = 1024
index = faiss.IndexFlatL2(emb_size)
temp_db = FAISS(embedding_function, index, InMemoryDocstore({}),{})

tw_retriever = TimeWeightedVectorStoreRetriever(vectorstore = temp_db, decay_rate = 0.0000000000000000000000001, k=1)

In [36]:
from datetime import datetime, timedelta
yesterday = datetime.now() - timedelta(days=1)
retriever.add_documents(
    [Document(page_content="hello world")]
)
retriever.add_documents([Document(page_content="hello foo")])

['84ec0f65-fbb1-11ee-b0f4-40b0346aef46']

In [37]:
retriever.get_relevant_documents("hello world")

[Document(page_content='hello world'),
 Document(page_content='hello world'),
 Document(page_content='hello world'),
 Document(page_content='hello foo')]

In [41]:

# high decay rate
embedding_function = FakeEmbeddings(size=300)
emb_size = 1024
index = faiss.IndexFlatL2(emb_size)
temp_db = FAISS(embedding_function, index, InMemoryDocstore({}),{})

tw_retriever = TimeWeightedVectorStoreRetriever(vectorstore = temp_db, decay_rate = 0.999, k=1)

In [44]:
yesterday = datetime.now() - timedelta(days=1)
retriever.add_documents(
    [Document(page_content="hello world")]
)
retriever.add_documents([Document(page_content="hello foo")])

['2db53cd8-fbb2-11ee-ac1a-40b0346aef46']

In [45]:
retriever.get_relevant_documents("hello world")

[Document(page_content='hello world'),
 Document(page_content='hello world'),
 Document(page_content='hello world'),
 Document(page_content='hello world')]