# Contextual Compression and Filtering in RAG

### Installing dependencies

In [1]:
!pip install -qU langchain huggingface_hub lancedb pypdf python-dotenv transformers sentence-transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.6/803.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m330.1/330.1 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.4/87.4 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m283.9/283.9 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m229.6/229.6 kB[0m [31m16.0 MB

### Importing libraries

In [2]:
from langchain.llms import HuggingFaceHub
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
from langchain.llms import OpenAI
import lancedb
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from getpass import getpass
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.document_transformers import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline

In [4]:
# `os` and `getpass` are already imported in the imports cell above, so they are
# not re-imported here.
# Prompt for the token without echoing it, so the secret never appears in the
# notebook source or output, and store it in the HUGGINGFACEHUB_API_TOKEN
# environment variable for the rest of the session.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Enter HuggingFace Hub Token:")

Enter HuggingFace Hub Token:··········


### Load the data

In [6]:
!wget https://www.centralvalleysd.org/Downloads/Science_Glossary.pdf

--2024-01-23 21:59:56--  https://www.centralvalleysd.org/Downloads/Science_Glossary.pdf
Resolving www.centralvalleysd.org (www.centralvalleysd.org)... 104.17.151.77, 104.16.209.8, 2606:4700::6811:974d, ...
Connecting to www.centralvalleysd.org (www.centralvalleysd.org)|104.17.151.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 40104 (39K) [application/pdf]
Saving to: ‘Science_Glossary.pdf’


2024-01-23 21:59:56 (40.9 MB/s) - ‘Science_Glossary.pdf’ saved [40104/40104]



In [7]:
# Load the downloaded glossary PDF into a list of LangChain documents.
# The recorded run below produced 7 documents.
loader = PyPDFLoader("Science_Glossary.pdf")
documents = loader.load()
# Sanity check: number of loaded documents, then the raw text of the first one.
print(len(documents))
print(documents[0].page_content)

7
SCIENCE GLOSSARY 
 
Abiotic:   A nonliving factor or element (e.g., light, w ater, heat, rock, energy, mineral). 
 
Acid deposition:  Precipitation w ith a pH less than 5.6 that form s in the atmosphere w hen certain pollutants mix 
with water vapor. 
 
Allele:   Any of a set of possible forms of a gene. 
 
Biochemical conversion:  The changing of organic matter into other chemical forms. 
 
Biological diversity:  The variety and complexity of species present a nd interacting in an ecosystem and the relative 
abundance of each. 
 
Biomass conversion: The changing of organic matter that has been produced by photosynthesis into useful liquid, gas 
or fuel. 
 
Biomedical technology: The application of health care theories  to develop methods, products and tools to maintain or 
improve homeostasis. 
 
Biomes:   A community of living organisms of a single major ecological region. 
 
Biotechnology:   The w ays that humans apply biological concepts  to produce products and provide services.

### Split texts

In [8]:
# Split the loaded documents into overlapping chunks for embedding.
# chunk_size=700 with chunk_overlap=70 (presumably measured in characters, the
# splitter's default length function — TODO confirm). The recorded run produced 22 chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=70)
final_doc = text_splitter.split_documents(documents)
# Sanity check: chunk count, then the first chunk (page_content plus metadata).
print(len(final_doc))
print(final_doc[0])

22
page_content='SCIENCE GLOSSARY \n \nAbiotic:   A nonliving factor or element (e.g., light, w ater, heat, rock, energy, mineral). \n \nAcid deposition:  Precipitation w ith a pH less than 5.6 that form s in the atmosphere w hen certain pollutants mix \nwith water vapor. \n \nAllele:   Any of a set of possible forms of a gene. \n \nBiochemical conversion:  The changing of organic matter into other chemical forms. \n \nBiological diversity:  The variety and complexity of species present a nd interacting in an ecosystem and the relative \nabundance of each. \n \nBiomass conversion: The changing of organic matter that has been produced by photosynthesis into useful liquid, gas \nor fuel.' metadata={'source': 'Science_Glossary.pdf', 'page': 0}


### Embeddings

In [9]:
# Embedding model shared by the vector store and by the embedding-based filters
# created later in this notebook.
# NOTE(review): the model name suggests it was tuned for insurance text while the
# corpus is a science glossary — confirm this choice is intentional.
embeddings = SentenceTransformerEmbeddings(
    model_name="llmware/industry-bert-insurance-v0.1"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


### Load the LLM

In [10]:
# LLM served through the Hugging Face Hub; it is used below by the LLMChainExtractor
# compressor and by the RetrievalQA chain.
# Presumably authenticates with the HUGGINGFACEHUB_API_TOKEN set earlier — verify.
repo_id = "llmware/bling-sheared-llama-1.3b-0.1"
# Generation settings passed to the hosted model: sampling temperature 0.3 and a
# max_length of 500.
llm = HuggingFaceHub(
    repo_id=repo_id, model_kwargs={"temperature": 0.3, "max_length": 500}
)



In [11]:
def pretty_print_docs(docs):
    """Print each document's text under a numbered heading.

    Every entry is rendered as ``Document<N>:`` (numbered from 1), a blank line,
    and the document's ``page_content``. Consecutive entries are separated by a
    rule of 100 dashes on its own line.

    Args:
        docs: Iterable of objects exposing a string ``page_content`` attribute.

    Returns:
        None. The formatted text is written to standard output.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for number, document in enumerate(docs, start=1):
        sections.append(f"Document{number}:\n\n" + document.page_content)
    print(divider.join(sections))

### Instantiate VectorStore (LanceDB)

In [12]:
# `lancedb` is already imported in the imports cell above, so it is not
# re-imported here.

# Connect to the LanceDB database stored in the local ./.lancedb directory.
context_data = lancedb.connect("./.lancedb")

# Create the "context" table from a single placeholder row whose vector is the
# embedding of "Hello World". mode="overwrite" is passed so that re-running the
# cell replaces a table left over from a previous run instead of failing.
# NOTE(review): the placeholder row presumably remains in the table and could be
# returned by a similarity search — confirm this is acceptable.
table = context_data.create_table(
    "context",
    data=[
        {
            "vector": embeddings.embed_query("Hello World"),
            "text": "Hello World",
            "id": "1",
        }
    ],
    mode="overwrite",
)

### Retriever

In [14]:
# Build the LanceDB vector store: embed the split chunks in `final_doc` with
# `embeddings` and add them through the `table` created above. The retriever itself
# is derived from `database` in the next cell.
# NOTE(review): this import sits mid-notebook rather than in the imports cell —
# consider moving it there.
from langchain_community.vectorstores import LanceDB

database = LanceDB.from_documents(final_doc, embeddings, connection=table)

In [15]:
# Base retriever over the vector store; search_kwargs k=3 requests the 3 best
# matches per query. Every compression retriever below wraps this object.
retriever_d = database.as_retriever(search_kwargs={"k": 3})

In [16]:
# Baseline: query the plain retriever (no compression or filtering) and print the
# full text of every chunk it returns.
docs = retriever_d.get_relevant_documents(query="What is Wetlands?")
pretty_print_docs(docs)

Document1:

body of w ater; also called a drainage basin. 
  
Wetlands: Lands w here w ater saturation is the domi nant factor determining the nature of the soil 
development and the plant and animal communities (e.g., sloughs, estuaries, marshes). 
 
 
 7
----------------------------------------------------------------------------------------------------
Document2:

developed state. 
 
Endangered species:   A species that is in danger of extinction throughout all or a significant portion of its range. 
 
Engineering:  The application of scientific, physical, mechan ical and mathematical principles to design 
processes, products and structures that improve the quality of life. 
 
Environment:  The total of the surroundings (air, w ater, soil, vege tation, people, w ildlife) influencing each living 
being’s existence, including physical, biological a nd all other factors; the surroundings of a plant 
or animals including other plants or animals, climate and location. 
 2
---------------

### Compressor

In [17]:
# Create the LLM-based compressor: an LLMChainExtractor built from `llm`.
compressor = LLMChainExtractor.from_llm(llm=llm)

# Compression retriever = base retriever + compressor: documents fetched by
# `retriever_d` are passed through `compressor` before being returned.
# NOTE(review): `compression_retriever` is not queried in the cells shown here; the
# later cells use the embeddings-filter and pipeline retrievers instead.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever_d, base_compressor=compressor
)

In [18]:
# Show the prompt template the extractor's LLM chain fills in with {question} and
# {context} for each retrieved document.
print(compressor.llm_chain.prompt.template)

Given the following question and context, extract any part of the context *AS IS* that is relevant to answer the question. If none of the context is relevant return NO_OUTPUT. 

Remember, *DO NOT* edit the extracted parts of the context.

> Question: {question}
> Context:
>>>
{context}
>>>
Extracted relevant parts:


In [19]:
# Store the key under the exact name "OPENAI_API_KEY". The previous key had a
# trailing space, which created a different, unused environment variable.
# Nothing in this cell reads the key: the filter below uses the SentenceTransformer
# `embeddings`, so the key only matters if OpenAI models are swapped in.
os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API Key:")

# Embedding-based compressor: scores retrieved documents against the query using
# `embeddings` instead of calling the LLM.
# NOTE(review): no similarity_threshold is passed, so with only 3 documents coming
# from `retriever_d` the filter presumably keeps all of them — confirm.
# (The variable name keeps its original spelling in case other cells reference it.)
embdeddings_filter = EmbeddingsFilter(embeddings=embeddings)
compression_retriever_filter = ContextualCompressionRetriever(
    base_retriever=retriever_d, base_compressor=embdeddings_filter
)

# Retrieve through the filtering retriever and print what survives.
compressed_docs = compression_retriever_filter.get_relevant_documents(
    query="What is the Environment?"
)
pretty_print_docs(compressed_docs)

··········
Document1:

developed state. 
 
Endangered species:   A species that is in danger of extinction throughout all or a significant portion of its range. 
 
Engineering:  The application of scientific, physical, mechan ical and mathematical principles to design 
processes, products and structures that improve the quality of life. 
 
Environment:  The total of the surroundings (air, w ater, soil, vege tation, people, w ildlife) influencing each living 
being’s existence, including physical, biological a nd all other factors; the surroundings of a plant 
or animals including other plants or animals, climate and location. 
 2
----------------------------------------------------------------------------------------------------
Document2:

Niche (ecological) : The role played by an organism in an ecosystem; its food preferences, requirements for shelter, 
special behaviors and the timing of its activities  (e.g., nocturnal, diurnal), interaction w ith other 
organisms and its habitat.

### Retrieve answer from Compressed Data

In [20]:
from langchain.chains import RetrievalQA

# Question-answering chain: documents come from the embeddings-filter retriever and
# are passed to `llm` in a single prompt (chain_type="stuff"). verbose=True logs
# when the chain starts and finishes.
qa = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=compression_retriever_filter, verbose=True
)
# Ask the question via `invoke`: calling the chain object directly (`qa(...)`) is
# deprecated and emits a deprecation warning. The result is still a dict holding
# the "query" and the generated "result".
qa.invoke({"query": "What is Environment?"})



[1m> Entering new RetrievalQA chain...[0m


  warn_deprecated(



[1m> Finished chain.[0m


{'query': 'What is Environment?', 'result': ' Environment'}

### Pipeline

In [21]:
# Two embedding-based stages built on the same `embeddings` model:
#   - redundant_filter: drops near-duplicate documents (the repr printed below
#     shows cosine similarity with similarity_threshold=0.95).
#   - relevant_filter: keeps up to k=5 documents judged relevant to the query.
# NOTE(review): `retriever_d` returns at most 3 documents (k=3), so k=5 presumably
# never trims anything here — confirm the intended values.
redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
relevant_filter = EmbeddingsFilter(embeddings=embeddings, k=5)

# Chain the stages into one compressor; they are listed in the order
# redundant_filter, then relevant_filter.
compressed_pipeline = DocumentCompressorPipeline(
    transformers=[redundant_filter, relevant_filter]
)

# Compression retriever that post-processes the base retriever's results with the pipeline.
comp_pipe_retrieve = ContextualCompressionRetriever(
    base_retriever=retriever_d, base_compressor=compressed_pipeline
)

# Print the retriever's repr (its full configuration) — this is not a prompt; the
# pipeline above contains no LLM stage.
print(comp_pipe_retrieve)

# Retrieve through the pipeline and print the surviving documents.
compressed_docs = comp_pipe_retrieve.get_relevant_documents(
    query="What is Environment?"
)
pretty_print_docs(compressed_docs)

base_compressor=DocumentCompressorPipeline(transformers=[EmbeddingsRedundantFilter(embeddings=HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
), model_name='llmware/industry-bert-insurance-v0.1', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False), similarity_fn=<function cosine_similarity at 0x7d6af0ae2b00>, similarity_threshold=0.95), EmbeddingsFilter(embeddings=HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'poolin

In [23]:
# Repeats the pipeline query from the previous cell (same retriever, same question),
# so only the retrieved documents are shown without the retriever repr.
compressed_docs = comp_pipe_retrieve.get_relevant_documents(
    query="What is Environment?"
)
pretty_print_docs(compressed_docs)

Document1:

developed state. 
 
Endangered species:   A species that is in danger of extinction throughout all or a significant portion of its range. 
 
Engineering:  The application of scientific, physical, mechan ical and mathematical principles to design 
processes, products and structures that improve the quality of life. 
 
Environment:  The total of the surroundings (air, w ater, soil, vege tation, people, w ildlife) influencing each living 
being’s existence, including physical, biological a nd all other factors; the surroundings of a plant 
or animals including other plants or animals, climate and location. 
 2
----------------------------------------------------------------------------------------------------
Document2:

Niche (ecological) : The role played by an organism in an ecosystem; its food preferences, requirements for shelter, 
special behaviors and the timing of its activities  (e.g., nocturnal, diurnal), interaction w ith other 
organisms and its habitat. 
 
Nonpoin

In [24]:
# Run the pipeline retriever on a second question and print the returned documents.
# Note that `compressed_docs` is reassigned, replacing the previous query's results.
compressed_docs = comp_pipe_retrieve.get_relevant_documents(
    query="What is Hazardous waste?"
)
pretty_print_docs(compressed_docs)

Document1:

and age relationships of rock units and the occu rrences of structural features, mineral deposits 
and fossil localities).  
 
Groundw ater:   Water that infiltrates the soil and is located in underground reservoirs called aquifers. 
 
Hazardous w aste: A solid that, because of its quantity or concentr ation or its physical, chemical or infectious 
characteristics, may cause or pose a substantial present or potential haz ard to human health or 
the environment w hen improperly treated, stored , transported or disposed of, or otherw ise 
managed. 
 
Homeostasis:   The tendency for a system to remain in a state of equilibrium by resisting change. 
 
 3
----------------------------------------------------------------------------------------------------
Document2:

tunnels, containers of various types). 
 
Radioactive isotope:  An atom that gives off nuclear radiation and has the same number of protons (atomic number) as 
another atom but a different number of neutrons. 
 
Recy