In [None]:
!pip install farm-haystack sentence_transformers farm-haystack[pinecone] farm-haystack[opensearch] pinecone-client langchain requests_aws4auth

In [None]:
from haystack.nodes import PreProcessor, PDFToTextConverter, TextConverter
from haystack.utils import convert_files_to_docs

# Converter: turns a PDF file into text; document ids are hashed from the "meta" field
converter = PDFToTextConverter(id_hash_keys=["meta"])

# PreProcessor: cleans the converted text, then chunks it by word count with a
# small overlap between chunks while keeping sentences intact
preprocessor_settings = {
    "clean_empty_lines": True,
    "clean_whitespace": True,
    "clean_header_footer": True,
    "split_by": "word",
    "split_length": 300,
    "split_overlap": 20,
    "split_respect_sentence_boundary": True,
}
preprocessor = PreProcessor(**preprocessor_settings)


In [None]:
import os

from requests_aws4auth import AWS4Auth
from haystack.document_stores import OpenSearchDocumentStore

# Credentials are read from the environment so that no secret is ever saved in
# the notebook. A missing AWS key fails right here with a KeyError naming the
# variable, instead of surfacing later as an opaque authentication error.
aws_access_key = os.environ["AWS_ACCESS_KEY_ID"]
aws_secret_key = os.environ["AWS_SECRET_ACCESS_KEY"]
region = os.environ.get("AWS_REGION", "us-east-1")
open_search_domain = os.environ.get(
    "OPENSEARCH_HOST", "search-test-cluster-xxxxxxxxx.us-east-1.es.amazonaws.com"
)

# SigV4 signer for the "es" (OpenSearch Service) endpoint. The session token is
# optional: it stays None unless temporary credentials are in use.
aws_auth = AWS4Auth(
    aws_access_key,
    aws_secret_key,
    region,
    "es",
    session_token=os.environ.get("AWS_SESSION_TOKEN"),
)

# OpenSearch document store that holds both the text chunks and their embeddings.
# NOTE(review): username/password are passed alongside aws4auth, as in the
# original cell — confirm which of the two the store actually uses.
document_store = OpenSearchDocumentStore(
    aws4auth=aws_auth,
    username=os.environ.get("OPENSEARCH_USERNAME", "uname"),
    password=os.environ.get("OPENSEARCH_PASSWORD", "pw"),
    create_index=True,
    host=open_search_domain,
    port=443,  # integer port for the HTTPS endpoint
    scheme="https",
    embedding_field="embedding",
    # Must equal the output size of the embedding model used by the retriever
    # (presumably 384 for the MiniLM model — verify if the model changes).
    embedding_dim=384,
)


In [None]:
# Removes any documents so the run starts from an empty store.
# Destructive: presumably this clears everything in the store's index — confirm
# before pointing the notebook at a shared cluster.
document_store.delete_documents()

In [None]:
# Switch the store's debug flag on (presumably for more verbose diagnostics — TODO confirm)
document_store.debug = True

In [None]:
from haystack.nodes import EmbeddingRetriever, PromptNode
# Embedding-based retriever bound to the document store; it loads the model
# below in sentence-transformers format and is used for semantic search.
retriever_settings = {
    "document_store": document_store,
    "embedding_model": "multi-qa-MiniLM-L6-cos-v1",
    "model_format": "sentence_transformers",
}
retriever = EmbeddingRetriever(**retriever_settings)

In [None]:
from haystack.pipelines import Pipeline

# Assemble the indexing pipeline. Nodes are registered in ORDER, each one
# consuming the output of the node named in its inputs.
p = Pipeline()

for _component, _node_name, _upstream in (
    (converter, "PDFConverter", ["File"]),
    (preprocessor, "PreProcessor", ["PDFConverter"]),
    (retriever, "Retriever", ["PreProcessor"]),
    (document_store, "DocumentStore", ["Retriever"]),
):
    p.add_node(component=_component, name=_node_name, inputs=_upstream)

# Run the pipeline on the sample PDF, tagging every chunk with the document name
p.run(
    file_paths=["./sample_data/A75_18-en.pdf"],
    meta={"document_name": "A75_18-en.pdf"},
)

In [None]:
# After the documents have been run through the pipeline, compute their
# embeddings with the retriever and store them, 16 documents per batch.
document_store.update_embeddings(retriever, batch_size=16)

In [None]:
# Sanity check: the store should hold exactly one embedding per document
doc_total = document_store.get_document_count()
embedding_total = document_store.get_embedding_count()
doc_total == embedding_total

True

In [None]:
import os

# The OpenAI key is read from the environment instead of being embedded in the
# notebook; an unset OPENAI_API_KEY fails here with a KeyError rather than at
# the first request.
prompt_node = PromptNode(
    model_name_or_path="gpt-3.5-turbo",
    api_key=os.environ["OPENAI_API_KEY"],
)

In [None]:
from haystack.pipelines.standard_pipelines import DocumentSearchPipeline

# Query-time pipeline: wraps the retriever so a query returns matching documents
pipe = DocumentSearchPipeline(retriever=retriever)

In [None]:
from haystack.utils import print_answers
from pprint import pprint
query = "What is the state referring to"

# Retrieve the three closest chunks, restricted to the indexed PDF.
# Haystack filters are keyed by the metadata field name itself. Wrapping the
# field in a "meta" dict makes "document_name" be read as a comparison
# operator, so the restriction is not applied as intended.
answer = pipe.run(
    query=query,
    params={
        "Retriever": {
            "top_k": 3,
            "filters": {"document_name": "A75_18-en.pdf"},
        },
    },
)
pprint(answer)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'documents': [<Document: {'content': 'Each State Party shall assess events occurring within its territory by using the decision instrument\nin Annex 2 within 48 hours of the National IHR Focal Point receiving the relevant information.\nEach State Party shall notify WHO, by the most efficient means of communication available, by way of\nthe National IHR Focal Point, and within 24 hours of assessment of public health information, of all\nevents which may constitute a public health emergency of international concern within its territory in\naccordance with the decision instrument, as well as any health measure implemented in response to\nthose events. If the notification received by WHO involves the competency of the International Atomic\nEnergy Agency (IAEA), the Food and Agriculture Organization (FAO), the World Organisation\nfor Animal Health (OIE), the UN Environment Programme (UNEP) or other relevant entities,\nWHO shall immediately notify the IAEA relevant entities.\n2. Following a

In [None]:
# Inspect what the search pipeline returned. It yields retrieved documents, not
# generated answers, so there is no 'answers' entry and indexing it would raise
# KeyError.
answer.keys()

In [None]:
# Show the retrieved Document objects returned under the 'documents' key
answer['documents']

[<Document: {'content': 'Each State Party shall assess events occurring within its territory by using the decision instrument\nin Annex 2 within 48 hours of the National IHR Focal Point receiving the relevant information.\nEach State Party shall notify WHO, by the most efficient means of communication available, by way of\nthe National IHR Focal Point, and within 24 hours of assessment of public health information, of all\nevents which may constitute a public health emergency of international concern within its territory in\naccordance with the decision instrument, as well as any health measure implemented in response to\nthose events. If the notification received by WHO involves the competency of the International Atomic\nEnergy Agency (IAEA), the Food and Agriculture Organization (FAO), the World Organisation\nfor Animal Health (OIE), the UN Environment Programme (UNEP) or other relevant entities,\nWHO shall immediately notify the IAEA relevant entities.\n2. Following a notification,

In [None]:
import os

from langchain.chat_models import ChatOpenAI

# Chat model used to answer from the retrieved context; replies are capped at
# 500 tokens. The key is read from the environment so it is never stored in the
# notebook, and an unset OPENAI_API_KEY fails here with a KeyError.
chat = ChatOpenAI(
    max_tokens=500,
    model='gpt-3.5-turbo',
    openai_api_key=os.environ["OPENAI_API_KEY"],
)

In [None]:
from langchain.schema import AIMessage, HumanMessage, SystemMessage
# Pass the model only the retrieved passage text. Formatting the Document list
# directly would paste each object's repr wrapper into the prompt instead of
# clean context.
context = "\n\n".join(doc.content for doc in answer["documents"])

messages = [
    SystemMessage(
        # Adjacent literals are concatenated, so none of the source indentation
        # leaks into the prompt (a backslash continuation inside the string would).
        content=(
            "You are a helpful assistant that uses provided context to answer questions concisely. "
            "If the answer is not provided in the context, reply, 'I don't know'"
        )
    ),
    HumanMessage(
        content="Question: What is state party? Context: {0}".format(context)
    ),
]
chat(messages)

AIMessage(content='A state party refers to a country that is a party to an international agreement or treaty. In the given context, it refers to countries that are signatories to the International Health Regulations (IHR) and have certain obligations, such as assessing events occurring within their territory and notifying the World Health Organization (WHO) of public health emergencies of international concern.', additional_kwargs={}, example=False)