<a href="https://colab.research.google.com/github/sunnysavita10/Generative-AI-Indepth-Basic-to-Advance/blob/main/basic_retrieval_and_contextual_compression_retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://github.com/langchain-ai/langchain/tree/master/libs/langchain/langchain/retrievers/document_compressors

https://blog.langchain.dev/improving-document-retrieval-with-contextual-compression/

In [None]:
!pip install langchain_community

In [None]:
!pip install langchain_openai

In [None]:
# FAISS = Facebook AI Similarity Search (CPU-only build)
!pip install faiss-cpu

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

In [None]:
# Load state_of_the_union.txt as a list of LangChain documents.
# NOTE: the absolute /content path assumes the file was uploaded to the Colab
# runtime beforehand — adjust the path when running elsewhere.
documents = TextLoader("/content/state_of_the_union.txt").load()

In [None]:
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)

In [None]:
texts = text_splitter.split_documents(documents)

In [None]:
texts

In [None]:
from google.colab import userdata
OPENAI_API_KEY=userdata.get('OPENAI_API_KEY')

In [None]:
import os
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
# Build a FAISS vector store from the chunks in `texts` using OpenAIEmbeddings
# and expose it as a retriever. This is the baseline (uncompressed) retriever
# that every ContextualCompressionRetriever below wraps as `base_retriever`.
retriever = FAISS.from_documents(texts, OpenAIEmbeddings()).as_retriever()

In [None]:
docs = retriever.invoke("What did the president say about Ketanji Brown Jackson")

In [None]:
# Display helper shared by all retrieval experiments in this notebook.

def pretty_print_docs(docs):
    """Print each document's text under a numbered heading.

    Args:
        docs: Iterable of objects exposing a string ``page_content`` attribute.

    Documents are numbered from 1 and separated by a line of 100 dashes.
    Nothing is returned; the formatted text goes to standard output.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(divider.join(sections))

In [None]:
pretty_print_docs(docs)

In [None]:
docs2 = retriever.invoke("What were the top three priorities outlined in the most recent State of the Union address?")

In [None]:
pretty_print_docs(docs2)

In [None]:
docs3 = retriever.invoke("How did the President propose to tackle the issue of climate change?")


In [None]:
pretty_print_docs(docs3)

In [None]:
from langchain_openai import OpenAI

In [None]:
llm=OpenAI(temperature=0)

In [None]:
from langchain.chains import RetrievalQA

In [None]:
chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

In [None]:
query="What were the top three priorities outlined in the most recent State of the Union address?"

In [None]:
chain.invoke(query)

In [None]:
# NOTE: this invokes the chain a second time (another LLM call) rather than
# reusing the previous cell's output; only the 'result' field is printed.
print(chain.invoke(query)['result'])

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_openai import OpenAI

In [None]:
compressor = LLMChainExtractor.from_llm(llm)

In [None]:
compression_retriever=ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)

In [None]:
# Ask the same question as the baseline retrieval earlier in the notebook
# ("Ketanji Brown Jackson", with the name in the correct order) so the compressed
# output can be compared like-for-like with the uncompressed documents.
compressed_docs = compression_retriever.invoke("What did the president say about Ketanji Brown Jackson")

In [None]:
compressed_docs

In [None]:
compressed_docs = compression_retriever.invoke("What were the top three priorities outlined in the most recent State of the Union address?")

In [None]:
compressed_docs

In [None]:
pretty_print_docs(compressed_docs)

In [None]:
compressed_docs2 = compression_retriever.invoke("How did the President propose to tackle the issue of climate change?")

In [None]:
pretty_print_docs(compressed_docs2)

In [None]:
from langchain.retrievers.document_compressors import LLMChainFilter

In [None]:
# NOTE(review): the name `filter` shadows Python's built-in filter() for the rest
# of the session; it is kept because the next cell refers to it by this name.
filter = LLMChainFilter.from_llm(llm)

In [None]:
compression_retriever2 = ContextualCompressionRetriever(base_compressor=filter, base_retriever=retriever)

In [None]:
compressed_docs3 = compression_retriever2.invoke("What were the top three priorities outlined in the most recent State of the Union address?")

In [None]:
pretty_print_docs(compressed_docs3)

In [None]:
# Character count of the uncompressed context for the "top three priorities"
# query (docs2), with documents joined by a blank line.
original_contexts_len = len("\n\n".join(d.page_content for d in docs2))

In [None]:
original_contexts_len

In [None]:
# Character count of the compressed context held in `compressed_docs`, joined
# the same way as the original context so the two lengths are comparable.
compressed_contexts_len = len("\n\n".join(d.page_content for d in compressed_docs))

In [None]:
compressed_contexts_len

In [None]:
print("Original context length:", original_contexts_len)

In [None]:
print("Compressed context length:", compressed_contexts_len)

In [None]:
# The 1e-5 added to the denominator avoids a ZeroDivisionError when the
# compressed context is empty (length 0); the ratio is shown to two decimals.
print("Compressed Ratio:", f"{original_contexts_len/(compressed_contexts_len + 1e-5):.2f}x")

In [None]:
from langchain.retrievers.document_compressors import EmbeddingsFilter

In [None]:
from langchain_openai import OpenAIEmbeddings

In [None]:
embeddings = OpenAIEmbeddings()

In [None]:
# Without a similarity threshold, EmbeddingsFilter only keeps the top-k most
# similar documents (library default k=20), so the few documents returned by
# the base retriever would all pass through unfiltered. Use the same 0.76
# cut-off as the pipeline's relevance filter further down so that documents
# below that similarity to the query are actually dropped.
embeddings_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.76)

In [None]:
compression_retriever3 = ContextualCompressionRetriever(base_compressor=embeddings_filter, base_retriever=retriever)

In [None]:
compressed_docs4 = compression_retriever3.invoke("What were the top three priorities outlined in the most recent State of the Union address?")

In [None]:
pretty_print_docs(compressed_docs4)

In [None]:
print("Original context length:", original_contexts_len)

In [None]:
# Measure the EmbeddingsFilter result (compressed_docs4) produced in this
# section. The earlier version re-measured `compressed_docs`, which only
# repeated the length already computed for the LLM-based extractor.
compressed_contexts_len = len("\n\n".join(d.page_content for d in compressed_docs4))

In [None]:
print("Compressed context length:", compressed_contexts_len)

In [None]:
# The 1e-5 added to the denominator avoids a ZeroDivisionError when the
# compressed context is empty (length 0); the ratio is shown to two decimals.
print("Compressed Ratio:", f"{original_contexts_len/(compressed_contexts_len + 1e-5):.2f}x")

In [None]:
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain_text_splitters import CharacterTextSplitter


In [None]:
splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=0, separator=". ")

In [None]:
redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)

In [None]:
relevant_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.76)

In [None]:
# Compose the three stages defined above, listed in the order they should run:
# the ". "-separated 300-character splitter, the embeddings-based redundancy
# filter, then the relevance filter with its 0.76 similarity threshold.
# (Stages are presumably applied in list order — confirm against the
# DocumentCompressorPipeline documentation.)
pipeline_compressor = DocumentCompressorPipeline(transformers=[splitter, redundant_filter, relevant_filter])

In [None]:
compression_retriever = ContextualCompressionRetriever(base_compressor=pipeline_compressor, base_retriever=retriever)

In [None]:
compressed_docs = compression_retriever.invoke("What were the top three priorities outlined in the most recent State of the Union address?")

In [None]:
pretty_print_docs(compressed_docs)

In [None]:
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(temperature=0)

In [None]:
from langchain.chains import RetrievalQA

In [None]:
chain = RetrievalQA.from_chain_type(llm=llm, retriever=compression_retriever)

In [None]:
query="What were the top three priorities outlined in the most recent State of the Union address?"

In [None]:
chain.invoke(query)

In [None]:
# NOTE: this invokes the chain a second time (another LLM call) rather than
# reusing the previous cell's output; only the 'result' field is printed.
print(chain.invoke(query)['result'])

The top three priorities outlined in the most recent State of the Union address were:

1. Beating the opioid epidemic by increasing funding for prevention, treatment, harm reduction, and recovery.
2. Strengthening infrastructure and innovation in America to improve transportation and create more jobs.
3. Promoting domestic production and reducing reliance on foreign supply chains to boost the economy and create more opportunities for Americans.