In [94]:
doc_path = "/content/Retrieval-Augmented-Generation-for-NLP.pdf"

In [95]:
!pip install pypdf



In [96]:
!pip install langchain_community



In [97]:
from langchain_community.document_loaders import PyPDFLoader

In [98]:
loader = PyPDFLoader(doc_path)

In [99]:
docs = loader.load()



In [100]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [101]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)

In [102]:
chunks = splitter.split_documents(docs)

In [103]:
chunks

[Document(metadata={'source': '/content/Retrieval-Augmented-Generation-for-NLP.pdf', 'page': 0}, page_content='Retrieval-Augmented Generation for\nKnowledge-Intensive NLP Tasks\nPatrick Lewis†‡, Ethan Perez?,\nAleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†,\nMike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†\n†Facebook AI Research;‡University College London;?New York University;\nplewis@fb.com\nAbstract\nLarge pre-trained language models have been shown to store factual knowledge\nin their parameters, and achieve state-of-the-art results when ﬁne-tuned on down-\nstream NLP tasks. However, their ability to access and precisely manipulate\nknowledge is still limited, and hence on knowledge-intensive tasks, their perfor-\nmance lags behind task-speciﬁc architectures. Additionally, providing provenance\nfor their decisions and updating their world knowledge remain open research prob-\nlems. Pre-trained models with a diff

In [104]:
!pip install --upgrade --quiet  langchain-google-genai

In [105]:
GOOGLE_API_KEY = "AIzaSyBBQguUqfwXIH2GXeOx7ziJVVEw8DGj6II"

In [106]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(google_api_key=GOOGLE_API_KEY,model="models/embedding-001")

In [107]:
!pip install chromadb



In [108]:
from langchain.vectorstores import Chroma

In [109]:
vectorstore = Chroma.from_documents(chunks, embeddings)

In [118]:
vectorstore_retreiver = vectorstore.as_retriever(search_kwargs={"k":3})

In [119]:
vectorstore_retreiver

VectorStoreRetriever(tags=['Chroma', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x7b5b4c53d2a0>, search_kwargs={'k': 3})

In [120]:
!pip install rank_bm25



In [121]:
from langchain.retrievers import BM25Retriever , EnsembleRetriever

In [122]:
keyword_retriever = BM25Retriever.from_documents(chunks)

In [123]:
keyword_retriever.k =  3

Mixing vector search and keyword search for Hybrid search
# hybrid_score = (1 — alpha) * sparse_score + alpha * dense_score

In [124]:
ensemble_retriever = EnsembleRetriever(retrievers=[vectorstore_retreiver,keyword_retriever],weights=[0.3, 0.7])

In [125]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.3


In [126]:
!pip install accelerate



In [127]:
import torch
from transformers import ( AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline )
from langchain import HuggingFacePipeline

In [128]:
# function for loading 4-bit quantized model
def load_quantized_model(model_name: str):
    """
    model_name: Name or path of the model to be loaded.
    return: Loaded quantized model.
    """
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        quantization_config=bnb_config,
    )
    return model

In [129]:
# initializing tokenizer
def initialize_tokenizer(model_name: str):
    """
    model_name: Name or path of the model for tokenizer initialization.
    return: Initialized tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, return_token_type_ids=False)
    tokenizer.bos_token_id = 1  # Set beginning of sentence token id
    return tokenizer

In [134]:
model_name = "HuggingFaceH4/zephyr-7b-beta"

In [133]:
tokenizer = initialize_tokenizer(model_name)

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [135]:
model = load_quantized_model(model_name)

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [136]:
pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=2048,
    do_sample=True,
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)


In [137]:
llm = HuggingFacePipeline(pipeline=pipeline)

  llm = HuggingFacePipeline(pipeline=pipeline)


In [138]:
from langchain.chains import RetrievalQA

In [139]:
normal_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=vectorstore_retreiver
)

In [140]:
hybrid_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=ensemble_retriever
)

In [148]:
response1 = normal_chain.invoke("What is RAG ?")

In [149]:
response1

{'query': 'What is RAG ?',
 'result': "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nbest practice and use pairwise comparative evaluation [ 34]. Evaluators are shown an answer and two\ngenerated questions, one from BART and one from RAG. They are then asked to pick one of four\noptions—quuestion A is better, question B is better, both are good, or neither is good.\n3.4 Fact Veriﬁcation\nFEVER [ 56] requires classifying whether a natural language claim is supported or refuted by\nWikipedia, or whether there is not enough information to decide. The task requires retrieving\nevidence from Wikipedia relating to the claim and then reasoning over this evidence to classify\nwhether the claim is true, false, or unveriﬁable from Wikipedia alone. FEVER is a retrieval problem\ncoupled with an challenging entailment reasoning task. It also provides an appropriate testbed for\nex

In [150]:
print(response1.get("result"))

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

best practice and use pairwise comparative evaluation [ 34]. Evaluators are shown an answer and two
generated questions, one from BART and one from RAG. They are then asked to pick one of four
options—quuestion A is better, question B is better, both are good, or neither is good.
3.4 Fact Veriﬁcation
FEVER [ 56] requires classifying whether a natural language claim is supported or refuted by
Wikipedia, or whether there is not enough information to decide. The task requires retrieving
evidence from Wikipedia relating to the claim and then reasoning over this evidence to classify
whether the claim is true, false, or unveriﬁable from Wikipedia alone. FEVER is a retrieval problem
coupled with an challenging entailment reasoning task. It also provides an appropriate testbed for
exploring the RAG models’ ability to handle classiﬁc

In [151]:
response2 = hybrid_chain.invoke("What is RAG ?")

In [152]:
response2

{'query': 'What is RAG ?',
 'result': "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nwith both models outperforming BART on Q-BLEU-1. Table 4 shows human evaluation results, over\n452 pairs of generations from BART and RAG-Token. Evaluators indicated that BART was more\nfactual than RAG in only 7.1% of cases, while RAG was more factual in 42.7% of cases, and both\nRAG and BART were factual in a further 17% of cases, clearly demonstrating the effectiveness of\nRAG on the task over a state-of-the-art generation model. Evaluators also ﬁnd RAG generations to\nbe more speciﬁc by a large margin. Table 3 shows typical generations from each model.\nJeopardy questions often contain two separate pieces of information, and RAG-Token may perform\nbest because it can generate responses that combine content from several documents. Figure 2 shows\nan example. When generating “Sun”, 

In [153]:
print(response2.get("result"))

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

with both models outperforming BART on Q-BLEU-1. Table 4 shows human evaluation results, over
452 pairs of generations from BART and RAG-Token. Evaluators indicated that BART was more
factual than RAG in only 7.1% of cases, while RAG was more factual in 42.7% of cases, and both
RAG and BART were factual in a further 17% of cases, clearly demonstrating the effectiveness of
RAG on the task over a state-of-the-art generation model. Evaluators also ﬁnd RAG generations to
be more speciﬁc by a large margin. Table 3 shows typical generations from each model.
Jeopardy questions often contain two separate pieces of information, and RAG-Token may perform
best because it can generate responses that combine content from several documents. Figure 2 shows
an example. When generating “Sun”, the posterior is high for document 2 which mentio

In [163]:
response1 = normal_chain.invoke("What is conclusion of this research ?")

In [164]:
response1

{'query': 'What is conclusion of this research ?',
 'result': "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nused beam search with a beam size of 4and length penalty α= 0.6[31]. These hyperparameters\nwere chosen after experimentation on the development set. We set the maximum output length during\ninference to input length + 50, but terminate early when possible [31].\nTable 2 summarizes our results and compares our translation quality and training costs to other model\narchitectures from the literature. We estimate the number of ﬂoating point operations used to train a\nmodel by multiplying the training time, the number of GPUs used, and an estimate of the sustained\nsingle-precision ﬂoating-point capacity of each GPU5.\n6.2 Model Variations\nTo evaluate the importance of different components of the Transformer, we varied our base model\nin different ways, measuring

In [165]:
print(response1.get("result"))

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

used beam search with a beam size of 4and length penalty α= 0.6[31]. These hyperparameters
were chosen after experimentation on the development set. We set the maximum output length during
inference to input length + 50, but terminate early when possible [31].
Table 2 summarizes our results and compares our translation quality and training costs to other model
architectures from the literature. We estimate the number of ﬂoating point operations used to train a
model by multiplying the training time, the number of GPUs used, and an estimate of the sustained
single-precision ﬂoating-point capacity of each GPU5.
6.2 Model Variations
To evaluate the importance of different components of the Transformer, we varied our base model
in different ways, measuring the change in performance on English-to-German translation on the
develop

In [160]:
response2 = hybrid_chain.invoke("What is conclusion of this research ?")

In [161]:
response2

{'query': 'What is conclusion of this research ?',
 'result': "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nrather than related training pairs. This said, RAG techniques may work well in these settings, and\ncould represent promising future work.\n6 Discussion\nIn this work, we presented hybrid generation models with access to parametric and non-parametric\nmemory. We showed that our RAG models obtain state of the art results on open-domain QA. We\nfound that people prefer RAG’s generation over purely parametric BART, ﬁnding RAG more factual\nand speciﬁc. We conducted an thorough investigation of the learned retrieval component, validating\nits effectiveness, and we illustrated how the retrieval index can be hot-swapped to update the model\nwithout requiring any retraining. In future work, it may be fruitful to investigate if the two components\ncan be jointly pre-tr

In [162]:
print(response2.get("result"))

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

rather than related training pairs. This said, RAG techniques may work well in these settings, and
could represent promising future work.
6 Discussion
In this work, we presented hybrid generation models with access to parametric and non-parametric
memory. We showed that our RAG models obtain state of the art results on open-domain QA. We
found that people prefer RAG’s generation over purely parametric BART, ﬁnding RAG more factual
and speciﬁc. We conducted an thorough investigation of the learned retrieval component, validating
its effectiveness, and we illustrated how the retrieval index can be hot-swapped to update the model
without requiring any retraining. In future work, it may be fruitful to investigate if the two components
can be jointly pre-trained from scratch, either with a denoising objective similar to BART or s