In [50]:
import os 
import yaml
def read_config(path):
    """
    Reads API key from a configuration file.

    This function opens a configuration file named "apikeys.yml", reads the API key for OpenAI

    Returns:
    api_key (str): The API key for the Amadeus Flights API.
    """
    
    # Get the directory of the current script
    script_dir = path

    # Construct the full path to the configuration file
    file_path = os.path.join(script_dir, "apikeys.yml")

    with open(file_path, 'r') as stream:
        configs = yaml.safe_load(stream)
        API_KEY = configs['openai']['api_key']
            
    return API_KEY
path = r"C:\Users\johna\OneDrive\Documents\api_keys"  # Change to the location of your apikeys.yml
API_KEY = read_config(path)

In [1]:
from haystack.nodes import PreProcessor
from haystack.utils import convert_files_to_docs

# pre-process docs 
def preprocess_docs(doc_dir):
    all_docs = convert_files_to_docs(dir_path=doc_dir)
    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=False,
        split_by="word",
        split_respect_sentence_boundary=True,
        split_overlap=30, 
        split_length=100
    )
    docs = preprocessor.process(all_docs)
    print(f"n_files_input: {len(all_docs)}\nn_docs_output: {len(docs)}")
    return docs

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
doc_dir = r"C:\Users\johna\anaconda3\envs\lfqa_env\haystack-lfqa\documents"
docs = preprocess_docs(doc_dir)

Preprocessing:   0%|                                                                           | 0/2 [00:00<?, ?docs/s]We found one or more sentences whose word count is higher than the split length.
Preprocessing: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  3.49docs/s]

n_files_input: 2
n_docs_output: 2149





In [3]:
# from sqlalchemy import create_engine
# engine = create_engine('sqlite:///faiss_document_store.db')  # Use the correct path to your SQLite DB file
# engine.execute("DROP TABLE document")  # Be careful with this, it will delete all your documents!


In [4]:
from haystack.document_stores import FAISSDocumentStore

# create FAISS in memory
def vector_stores(docs):
    document_store = FAISSDocumentStore(sql_url="sqlite:///:memory:", faiss_index_factory_str="Flat", embedding_dim=768)
    document_store.write_documents(docs)
    return document_store

document_store = vector_stores(docs)

Writing Documents: 10000it [00:04, 2165.81it/s]                                                                        


In [33]:
from haystack.nodes import EmbeddingRetriever, FARMReader
from haystack.pipelines import ExtractiveQAPipeline
from haystack.pipelines import Pipeline


# retreiver relevant docs
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/all-mpnet-base-v2"
)
document_store.update_embeddings(retriever)

# read relevant docs
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

document_qa = ExtractiveQAPipeline(reader=reader, retriever=retriever)
# pipe = Pipeline()
# pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
# document_qa = pipe.add_node(component=reader, name="Reader", inputs=["Retriever"])

Updating Embedding:   0%|                                                                  | 0/2149 [00:00<?, ? docs/s]
Batches:   0%|                                                                                  | 0/68 [00:00<?, ?it/s][A
Batches:   1%|█                                                                         | 1/68 [00:25<28:14, 25.29s/it][A
Batches:   3%|██▏                                                                       | 2/68 [00:36<18:56, 17.22s/it][A
Batches:   4%|███▎                                                                      | 3/68 [00:44<13:55, 12.85s/it][A
Batches:   6%|████▎                                                                     | 4/68 [00:54<12:42, 11.91s/it][A
Batches:   7%|█████▍                                                                    | 5/68 [01:02<10:54, 10.39s/it][A
Batches:   9%|██████▌                                                                   | 6/68 [01:10<09:47,  9.47s/it][A
Batches:  10%|█████

In [51]:
from haystack.agents import Agent
from haystack.nodes import PromptNode, PromptTemplate, AnswerParser


lfqa_prompt = PromptTemplate(
    prompt="""You are a helpful and knowledgeable agent. To achieve your goal of answering complex questions
correctly, you have access to the following tools:

{tool_names_with_descriptions}

To answer questions, you'll need to go through multiple steps involving step-by-step thinking and
selecting appropriate tools and their inputs; tools will respond with observations. When you are ready
for a final answer, respond with the `Final Answer:`

Use the following format:

Question: the question to be answered
Thought: Reason if you have the final answer. If yes, answer the question. If not, find out the missing information needed to answer it.
Tool: pick one of {tool_names} 
Tool Input: the input for the tool
Observation: the tool will respond with the result
...

Final Answer: the final answer to the question, make it short (1-5 words)
Thought, Tool, Tool Input, and Observation steps can be repeated multiple times, but sometimes we can find an answer in the first pass
---

Question: {query}
Thought: Let's think step-by-step, I first need to
""",
    output_parser=AnswerParser(),
)

api_key = API_KEY
prompt_node = PromptNode(model_name_or_path="gpt-3.5-turbo", default_prompt_template=lfqa_prompt, api_key=api_key, stop_words=["Observation:"])
agent = Agent(prompt_node=prompt_node)


In [52]:
from haystack.agents import Tool

search_tool = Tool(
    name="document_qa",
    pipeline_or_node=document_qa,
    description="useful for when you need to answer any question",
    output_variable="answers",
)
agent.add_tool(search_tool)


In [None]:
result = agent.run("What are the Consumer Duty requirements for firms that only introduce retail clients?")
print(result["transcript"].split("---")[0])


Agent custom-at-query-time started with {'query': 'What are the Consumer Duty requirements for firms that only introduce retail clients?', 'params': None}


The 'transcript' parameter is missing from the Agent's prompt template. All ReAct agents that go through multiple steps to reach a goal require this parameter. Please append {transcript} to the end of the Agent's prompt template to ensure its proper functioning. A temporary prompt template with {transcript} appended will be used for this run.


[32munder[0m[32mstand[0m[32m what[0m[32m "[0m[32mConsumer[0m[32m Duty[0m[32m requirements[0m[32m"[0m[32m are[0m[32m.[0m[32m Once[0m[32m I[0m[32m have[0m[32m that[0m[32m information[0m[32m,[0m[32m I[0m[32m can[0m[32m determine[0m[32m the[0m[32m specific[0m[32m requirements[0m[32m for[0m[32m firms[0m[32m that[0m[32m only[0m[32m introduce[0m[32m retail[0m[32m clients[0m[32m.
[0m[32mTool[0m[32m:[0m[32m document[0m[32m_q[0m[32ma[0m[32m
[0m[32mTool[0m[32m Input[0m[32m:[0m[32m "[0m[32mConsumer[0m[32m Duty[0m[32m requirements[0m[32m"
[0m

Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 21.89it/s]
Inferencing Samples:   0%|                                                                 | 0/1 [00:00<?, ? Batches/s]