<a href="https://colab.research.google.com/github/hissain/mlworks/blob/main/codes/RAG_haystack_library_DPR_NQ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

For demonstration purposes, let us test the **Dense Passage Retrieval (DPR)** pipeline on a larger dataset. We can use one of the pre-built datasets that are commonly used for Question Answering (QA) tasks; one of the most popular is **Natural Questions**. Such datasets are quite large and provide both questions and passages for testing retrieval and reading models.

**Natural Questions (NQ) Dataset:**

The **Natural Questions (NQ)** dataset contains real user queries along with corresponding passages retrieved from Wikipedia. It's a great dataset for testing retrieval models.

To demonstrate the DPR pipeline with a larger dataset, we'll use a portion of the **Natural Questions (NQ)** dataset, which is publicly available via Hugging Face's `datasets` library.

In [None]:
# Step 1: Install Haystack (the `farm-haystack` package) with its "inference" extra.
# The leading "!" hands the line to the notebook's shell; --quiet trims pip's output.
!pip install farm-haystack[inference] --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.2/152.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.7/48.7 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.7/224.7 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m763.9/763.9 kB[0m [31m12.6 MB/s[0m eta [36

In [None]:
# Step 1 (continued): Install Haystack's "faiss" extra (needed by the FAISS
# document store) and the Hugging Face `datasets` library used to load NQ.
!pip install farm-haystack[faiss] datasets --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/527.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [None]:
# Step 2: Import Required Libraries
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import DensePassageRetriever, FARMReader
from haystack.pipelines import ExtractiveQAPipeline
from haystack.utils import print_answers
from datasets import load_dataset


In [None]:
# Step 3: Set Up the FAISS Document Store
# embedding_dim=768 is the embedding dimension used by DPR.
# sql_url="sqlite://" keeps the companion SQL database in memory rather than
# in the default on-disk SQLite file. A file left behind by an earlier run no
# longer matches a freshly created FAISS index, which is why re-running this
# cell used to raise and required deleting the runtime first.
document_store = FAISSDocumentStore(embedding_dim=768, sql_url="sqlite://")

In [None]:
# Step 4: Load NQ Dataset

# Number of Natural Questions records to index.
NUM_DOCUMENTS = 100


def _document_text(item):
    """Return the plain text of one Natural Questions record.

    NQ records have no ``context`` field. The Wikipedia page lives under
    ``item["document"]["tokens"]``, where every token carries an ``is_html``
    flag. The text is rebuilt by joining the tokens that are not HTML markup.

    Both layouts are accepted: a dict of parallel lists (how `datasets`
    exposes a sequence of structs) and a list of per-token dicts.
    """
    tokens = item["document"]["tokens"]
    if isinstance(tokens, dict):
        pairs = zip(tokens["token"], tokens["is_html"])
    else:
        pairs = ((tok["token"], tok["is_html"]) for tok in tokens)
    return " ".join(word for word, is_html in pairs if not is_html)


# Stream the 'train' split instead of slicing it with "train[:100]": a sliced
# split still downloads every data shard before the slice is applied, whereas
# streaming only fetches what is needed for the first NUM_DOCUMENTS records.
nq_dataset = load_dataset("natural_questions", split="train", streaming=True)

# Convert the records into the dict format Haystack accepts.
# NOTE(review): each document is a whole Wikipedia page; consider splitting
# pages into shorter passages before indexing if retrieval quality matters.
documents = [
    {"content": _document_text(item)}
    for item in nq_dataset.take(NUM_DOCUMENTS)
]

print(f"Loaded {len(documents)} documents.")

# Write the documents to the document store
document_store.write_documents(documents)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/13.7k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/287 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/287 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/287 [00:00<?, ?files/s]

In [None]:
# Step 5: Initialize the Dense Passage Retriever (DPR)
#
# DPR retrieves documents by comparing dense embeddings and needs two models:
#   * query embedding model   - embeds the query into a vector space
#   * passage embedding model - embeds the documents into the same vector
#                               space so they can be compared for similarity
retriever = DensePassageRetriever(
    use_gpu=False,  # switch to True to run the encoders on a GPU
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    document_store=document_store,
)

# Compute and store an embedding for every document already in the store.
document_store.update_embeddings(retriever)


In [None]:
# Step 6: Initialize the Reader
# Load the `deepset/roberta-base-squad2` checkpoint as the reader, without GPU.
reader = FARMReader(
    use_gpu=False,
    model_name_or_path="deepset/roberta-base-squad2",
)

In [None]:
# Step 7: Build the Pipeline
# Combine the retriever and the reader into a single extractive QA pipeline.
pipeline = ExtractiveQAPipeline(
    retriever=retriever,
    reader=reader,
)


In [None]:
# Step 8: Ask a Question and Get an Answer

query = "Where is the capital of France?"

# Run the pipeline with top_k=1 for both the retriever and the reader.
prediction = pipeline.run(
    query=query,
    params={
        "Retriever": {"top_k": 1},
        "Reader": {"top_k": 1},
    },
)

# Show the result at the "minimum" detail level.
print_answers(prediction, details="minimum")


In [None]:
# Step 9: Test with Another Query

query_2 = "Who developed the theory of relativity?"

# Same settings as before: top_k=1 for both the retriever and the reader.
prediction_2 = pipeline.run(
    query=query_2,
    params={
        "Retriever": {"top_k": 1},
        "Reader": {"top_k": 1},
    },
)

# Show the result at the "minimum" detail level.
print_answers(prediction_2, details="minimum")
