In [1]:
import logging

logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.WARNING)


In [2]:
import pandas as pd

from haystack.utils import fetch_archive_from_http


# Download sample
doc_dir = "data/tutorial7/"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/small_generator_dataset.csv.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Create dataframe with columns "title" and "text"
df = pd.read_csv(f"{doc_dir}/small_generator_dataset.csv", sep=",")
# Minimal cleaning
df.fillna(value="", inplace=True)

#print(df.head())
df.iloc[1,1]


'Albert Einstein Albert Einstein (; ; 14 March 1879 – 18 April 1955) was a German-born theoretical physicist who developed the theory of relativity, one of the two pillars of modern physics (alongside quantum mechanics). His work is also known for its influence on the philosophy of science. He is best known to the general public for his mass–energy equivalence formula , which has been dubbed "the world\'s most famous equation". He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step'

In [3]:
df.shape

(75, 2)

In [5]:
df.head(15)

Unnamed: 0,title,text
0,"""Albert Einstein""",to Einstein in 1922. Footnotes Citations Albert Einstein Albert Einstein (; ...
1,"""Albert Einstein""",Albert Einstein Albert Einstein (; ; 14 March 1879 – 18 April 1955) was a Ge...
2,"""Albert Einstein""","observations were published in the international media, making Einstein worl..."
3,"""Albert Einstein""",model for depictions of mad scientists and absent-minded professors; his exp...
4,"""Alfred Nobel""","was adopted as the standard technology for mining in the ""Age of Engineering..."
5,"""Akira Kurosawa""","for 2020. Patrick Frater writing for ""Variety"" magazine in May 2017 stated t..."
6,"""Apple Inc.""","a near bezel-less design along with wireless charging. On September 12, 2018..."
7,"""Akira Kurosawa""",through the Second World War and beyond. The narrative centers on yearly bir...
8,"""Apple Inc.""","2016, Apple introduced the iPhone 7 and the iPhone 7 Plus, which feature imp..."
9,"""Apple Inc.""","a faster processor, and brighter display. On September 12, 2017, Apple intro..."


In [6]:
from haystack import Document


# Use data to initialize Document objects
titles = list(df["title"].values)
texts = list(df["text"].values)
documents = []
for title, text in zip(titles, texts):
    documents.append(Document(content=text, meta={"name": title or ""}))


In [7]:
print(documents[2])

<Document: id=4a347e0fc7c1b7d06fa7d7a2aa580555, content='observations were published in the international media, making Einstein world-famous. On 7 November ...'>


In [8]:
print(documents[0].embedding)

None


In [9]:
%%bash --bg

sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch



In [10]:
import time
time.sleep(30)


In [11]:
from haystack.utils import launch_es
launch_es()


In [12]:
import os
from haystack.document_stores import ElasticsearchDocumentStore

# Get the host where Elasticsearch is running, default to localhost
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")

# document_store = ElasticsearchDocumentStore(
#     host=host,
#     username="",
#     password="",
#     index="document",
#     embedding_field="embedding",
#     embedding_dim=1536
# )

document_store = ElasticsearchDocumentStore(
    host=host,
    username="",
    password="",
    index="document-small-test",
    search_fields = ["title", "text"],
    embedding_field="embedding", 
    excluded_meta_data=["embedding"],
    embedding_dim=1536)


In [13]:
from haystack.nodes import EmbeddingRetriever


# OpenAI EmbeddingRetriever
retriever = EmbeddingRetriever(
   document_store=document_store,
   batch_size=8,
   embedding_model="text-embedding-ada-002",
   api_key="<insert-openAI-API-key-here>",
   max_seq_len=8191,
)

In [14]:

document_store.write_documents(documents)

CPU times: user 4.41 ms, sys: 2.67 ms, total: 7.08 ms
Wall time: 901 ms


In [16]:
#documents = document_store.get_all_documents(return_embedding=True)
print(documents[0].embedding)

[-0.00693703 -0.02328351  0.01684615 ... -0.00392277 -0.00844255
  0.00470797]


In [17]:

document_store.update_embeddings(retriever)
print(documents[0].embedding)

Updating embeddings:   0%|          | 0/68 [00:00<?, ? Docs/s]

Calculating embeddings:   0%|          | 0/9 [00:00<?, ?it/s]

[-0.00693703 -0.02328351  0.01684615 ... -0.00392277 -0.00844255
  0.00470797]
CPU times: user 523 ms, sys: 120 ms, total: 643 ms
Wall time: 15.9 s


In [16]:
from haystack.nodes import OpenAIAnswerGenerator

generator = OpenAIAnswerGenerator(model="text-davinci-003", api_key="<insert-openai-api-key-here>")

In [18]:
from haystack.pipelines import GenerativeQAPipeline

pipeline = GenerativeQAPipeline(generator=generator, retriever=retriever)
result = pipeline.run(query='who is Albert Einstein?', params={"Retriever": {"top_k": 5}})

Calculating embeddings: 100%|█████████████████████| 1/1 [00:01<00:00,  1.12s/it]


In [19]:
from pprint import pprint

pprint(result)


{'answers': [<Answer {'answer': ' Albert Einstein was a German-born theoretical physicist who developed the theory', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_id': None, 'meta': {'doc_ids': ['b85487f3f7fd1b9d5d4f4ed0aea4ef09', 'ab6983f94425886f00fd550b2ac0f037', 'd34ecd3aeacc8cbd33555e07f9a98c73', '4a347e0fc7c1b7d06fa7d7a2aa580555', '3c527839680f7f5ff3cb758454fa5079'], 'doc_scores': [0.5022289852339108, 0.5022138855319724, 0.5021411119123516, 0.5020899878276465, 0.5019322403810851], 'content': ['Albert Einstein Albert Einstein (; ; 14 March 1879 – 18 April 1955) was a German-born theoretical physicist who developed the theory of relativity, one of the two pillars of modern physics (alongside quantum mechanics). His work is also known for its influence on the philosophy of science. He is best known to the general public for his mass–energy equivalence formula , which has been dubbed "the world\'s most famous 