In [9]:
from haystack.nodes import WebRetriever
from haystack.schema import Document
from typing import List 
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import AnswerParser, PromptNode, PromptTemplate
from haystack import Pipeline
from haystack.nodes import  DensePassageRetriever
import os
from dotenv import load_dotenv

# Load environment variables (if any)
load_dotenv("../.env")
serp = os.getenv("SERP_API_KEY")
openai_key = os.getenv("OPENAI_API_KEY")

In [10]:
def initialize_documents(serp_key, nl_query):
    """
    Initialize documents retrieved from the SERP API.

    Args:
        serp_key (str): API key for the SERP API.
        nl_query (str): Natural language query to retrieve documents for.
        
    """
     # Initialize WebRetriever
    retriever = WebRetriever(api_key=serp_key, 
                            mode="preprocessed_documents",
                            top_k=100)

    # Retrieve documents based a natural language query
    documents : List[Document] = retriever.retrieve(query=nl_query)

    return documents

def initialize_faiss_document_store(documents):
    """
    Initialize a FAISS document store and retriever.

    Args:
        documents (List[Document]): List of documents to be stored in the document store.

    Returns:
        document_store (FAISSDocumentStore): FAISS document store.
        retriever (DensePassageRetriever): Dense passage retriever.
    """
    
    # Initialize document store
    document_store = FAISSDocumentStore(faiss_index_factory_str="Flat", return_embedding=True)

    retriever = DensePassageRetriever(
                document_store=document_store,
                query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
                use_gpu=True,
                embed_title=True,
    )

    # Delete existing documents in document store
    document_store.delete_documents()
    document_store.write_documents(documents)

    # Add documents embeddings to index
    document_store.update_embeddings(retriever=retriever)

    return document_store, retriever

def initialize_rag_pipeline(document_store, retriever, openai_key):
    """
    Initialize a pipeline for RAG-based question answering.

    Args:
        document_store (FAISSDocumentStore): FAISS document store.
        retriever (DensePassageRetriever): Dense passage retriever.
        openai_key (str): API key for OpenAI.

    Returns:
        query_pipeline (Pipeline): Pipeline for RAG-based question answering.
    """
    prompt_template = PromptTemplate(prompt = """"Answer the following query based on the provided context. If the context does
                                                not include an answer, reply with 'The data does not contain information related to the question'.\n
                                                Query: {query}\n
                                                Documents: {join(documents)}
                                                Answer: 
                                            """,
                                            output_parser=AnswerParser())
    prompt_node = PromptNode(model_name_or_path = "gpt-4",
                            api_key = openai_key,
                            default_prompt_template = prompt_template)

    query_pipeline = Pipeline()
    query_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
    query_pipeline.add_node(component=prompt_node, name="PromptNode", inputs=["Retriever"])

    return query_pipeline

In [11]:
# Initialize documents
documents = initialize_documents(serp_key=serp, nl_query="IMDB movie reviews for the Barbie movie (2023)")

# Initialize document store and retriever
document_store, retriever = initialize_faiss_document_store(documents=documents)

# Initialize pipeline
query_pipeline = initialize_rag_pipeline(document_store=document_store, retriever=retriever, openai_key=openai_key)

Writing Documents: 10000it [00:00, 44002.81it/s]         
Documents Processed: 10000 docs [00:20, 497.08 docs/s]        


In [12]:
%%time
query_pipeline.run(query = "What is the Barbie movie about", params={"Retriever" : {"top_k": 1}})

1 out of the 1 completions have been truncated before reaching a natural stopping point. Increase the max_tokens parameter to allow for longer completions.


CPU times: user 1.3 s, sys: 950 ms, total: 2.25 s
Wall time: 6.69 s


{'answers': [<Answer {'answer': 'The Barbie movie is a fantasy comedy set in Barbie Land, a place where Barbie, played by Margot Robbie, is having the best day every day, in the company of other Barbies and Kens, played by Ryan Gosling. The movie presents a world where Barbie can be anything - a President, a Physicist, a Journalist, a Lawyer, or a Mermaid. The plot is set in motion when Barbie begins having thoughts about death and, normally suspended in the state of standing', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_ids': ['d50b5a310e8a57379d2ae6556fe6b8f'], 'meta': {'prompt': "Answer the following query based on the provided context. If the context does\n                                                not include an answer, reply with 'The data does not contain information related to the question'.\n\n                                                Query: What is the Barbie movie about\n\n               