In [1]:
import logging

# Root logger: only warnings and above, rendered as "LEVEL - logger.name -  message".
logging.basicConfig(
    level=logging.WARNING,
    format="%(levelname)s - %(name)s -  %(message)s",
)
# The "haystack" logger is made more verbose than the root logger.
logging.getLogger("haystack").setLevel(logging.INFO)

In [3]:
# Recommended setup: let Haystack's utility function start Elasticsearch via Docker.
from haystack.utils import launch_es

launch_es()

In [1]:
import http.client
import os
import time
import urllib.error
import urllib.request
import warnings


def wait_for_elasticsearch(host="localhost", port=9200, timeout=30.0, poll_interval=1.0):
    """Poll an Elasticsearch HTTP endpoint until it answers or ``timeout`` elapses.

    This replaces a fixed 30-second sleep: the notebook continues as soon as the
    server responds instead of always waiting the full time, and the caller
    learns whether the server ever came up.

    Args:
        host: Host name or IP address to probe.
        port: HTTP port to probe.
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds between attempts; also used as the per-request
            timeout. Must be greater than zero.

    Returns:
        True once the server answers with a non-5xx HTTP status, False if the
        deadline passes first.
    """
    url = f"http://{host}:{port}"
    deadline = time.monotonic() + timeout
    while True:
        try:
            with urllib.request.urlopen(url, timeout=poll_interval):
                return True
        except urllib.error.HTTPError as err:
            # Must be caught before OSError (HTTPError is a subclass of it).
            # A 4xx reply (e.g. authentication required) still proves the server
            # is up; a 5xx reply means it is still starting, so keep polling.
            status = err.code
            err.close()
            if status < 500:
                return True
        except (OSError, http.client.HTTPException):
            # Connection refused/reset, socket timeout or a malformed reply:
            # the server is not ready yet.
            pass
        if time.monotonic() >= deadline:
            return False
        time.sleep(poll_interval)


# Probe the host named by ELASTICSEARCH_HOST, falling back to localhost.
if not wait_for_elasticsearch(os.environ.get("ELASTICSEARCH_HOST", "localhost")):
    # Best effort, like the sleep this replaces: warn instead of aborting the run.
    warnings.warn(
        "Elasticsearch did not respond within 30 seconds; "
        "the following cells may fail to connect."
    )

In [3]:
import os
from haystack.document_stores import ElasticsearchDocumentStore

# The Elasticsearch host comes from the ELASTICSEARCH_HOST environment variable,
# falling back to localhost when it is not set.
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")

# Connect with empty credentials and use the "document" index.
document_store = ElasticsearchDocumentStore(
    host=host,
    username="",
    password="",
    index="document",
)


In [None]:
import os

from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http


# Directory holding the text files we want to query.
# Override it with the HAYSTACK_DOC_DIR environment variable; the fallback is the
# location this notebook was originally run against.
doc_dir = os.environ.get(
    "HAYSTACK_DOC_DIR",
    "/Users/kpham/Desktop/coding-kim/haystack/combio_clean/singletxt",
)
# To try the notebook with the tutorial's sample corpus (Wikipedia articles for
# Game of Thrones) instead, download it into doc_dir first:
# s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt1.zip"
# fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Fail early with a clear message rather than quietly indexing nothing.
if not os.path.isdir(doc_dir):
    raise FileNotFoundError(
        f"Document directory {doc_dir!r} does not exist; set HAYSTACK_DOC_DIR to the folder with your text files."
    )

# Convert the files to documents.
# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers).
# It must take a str as input, and return a str.
docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

# `docs` can now be written to our document store.
# If your texts come from a different source (e.g. a DB), you can of course skip convert_files_to_docs()
# and create the entries yourself, e.g. as dictionaries.
# The default format here is:
# {
#    'content': "<DOCUMENT_TEXT_HERE>",
#    'meta': {'name': "<DOCUMENT_NAME_HERE>", ...}
# }
# (Optionally: you can also add more key-value-pairs here, that will be indexed as fields in Elasticsearch and
# can be accessed later for filtering or shown in the responses of the Pipeline)

# Let's have a look at the first 3 entries:
print(docs[:3])

# Now, let's write the documents to our DB.
document_store.write_documents(docs)



In [6]:
from haystack.nodes import BM25Retriever

# BM25 retriever that reads from the document store created above.
retriever = BM25Retriever(
    document_store=document_store,
)


In [7]:
from haystack.nodes import FARMReader

# model_name_or_path accepts a local model or any of the QA models on
# Hugging Face's model hub (https://huggingface.co/models).
reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=True,
)


Downloading config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/473M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [8]:
from haystack.pipelines import ExtractiveQAPipeline

# Combine the reader and the retriever into one extractive question-answering pipeline.
pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)


In [None]:
from pprint import pprint

# The per-node `top_k` values configure how many candidates the Retriever and the Reader shall return.
# The higher the Retriever's top_k, the better (but also the slower) your answers.
prediction = pipe.run(
    query="What year is it?",
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5},
    },
)

pprint(prediction)

# Sample output (copied from the Game of Thrones tutorial, so the query and values
# differ from what this corpus returns; it only illustrates the structure):
# {
#     'answers': [ <Answer: answer='Eddard', type='extractive', score=0.9919578731060028, offsets_in_document=[{'start': 608, 'end': 615}], offsets_in_context=[{'start': 72, 'end': 79}], document_id='cc75f739897ecbf8c14657b13dda890e', meta={'name': '454_Music_of_Game_of_Thrones.txt'}}, context='...' >,
#                  <Answer: answer='Ned', type='extractive', score=0.9767240881919861, offsets_in_document=[{'start': 3687, 'end': 3801}], offsets_in_context=[{'start': 18, 'end': 132}], document_id='9acf17ec9083c4022f69eb4a37187080', meta={'name': '454_Music_of_Game_of_Thrones.txt'}}, context='...' >,
#                  ...
#                ]
#     'documents': [ <Document: content_type='text', score=0.8034909798951382, meta={'name': '332_Sansa_Stark.txt'}, embedding=None, id=d1f36ec7170e4c46cde65787fe125dfe', content='\n===\'\'A Game of Thrones\'\'===\nSansa Stark begins the novel by being betrothed to Crown ...'>,
#                    <Document: content_type='text', score=0.8002150354529785, meta={'name': '191_Gendry.txt'}, embedding=None, id='dd4e070a22896afa81748d6510006d2', 'content='\n===Season 2===\nGendry travels North with Yoren and other Night's Watch recruits, including Arya ...'>,
#                    ...
#                  ],
#     'no_ans_gap':  11.688868522644043,
#     'node_id': 'Reader',
#     'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}},
#     'query': 'Who is the father of Arya Stark?',
#     'root_node': 'Query'
# }


In [10]:
from haystack.utils import print_answers

# `details` controls the level of detail: switch "minimum" to "medium" or "all" to see more.
print_answers(
    prediction,
    details="minimum",
)


Query: What year is it?
Answers:
[   {   'answer': '1963',
        'context': '] started running out of names, apparently. I was '
                   'born\\nSeptember 27th , 1963, in Ohio. So I\\u2019m a '
                   'Buckeye.\\n\\nVAN BENSCHOTEN: In what part of Ohio'},
    {   'answer': 'e st',
        'context': 'f\\nsituations that clearly affected what I ultimately '
                   'wanted to be, or the studies that I pursued. So I\\nthink '
                   'from that standpoint, since I was the e'},
    {   'answer': 'e"',
        'context': '",\n'
                   '            "Medical scientists",\n'
                   '            "Science History Institute"\n'
                   '            "Jochen Buck was born and grew up in '
                   'Reutlingen, Germany, in '},
    {   'answer': '1946',
        'context': 'esman, January 29, 2009\\n\\nJG:\\n\\n2\\n\\nWhat year was '
                   'that?\\n\\nMG: It was 1946. October 7, '
               

In [None]:
from pprint import pprint

# Second question, with the same Retriever/Reader top_k settings as the first query.
prediction2 = pipe.run(
    query="What did I study?",
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5},
    },
)

pprint(prediction2)

In [12]:
from haystack.utils import print_answers

# `details` controls the level of detail: switch "minimum" to "medium" or "all" to see more.
print_answers(
    prediction2,
    details="minimum",
)


Query: What did I study?
Answers:
[   {   'answer': 'respiratory diseases',
        'context': 'This was something I was quite interested in \\u2013 the '
                   'study of respiratory diseases. There \\nhad been massive '
                   'emigration from the Rhondda with the w'},
    {   'answer': 'how to approach Congress people',
        'context': 'g aides,\\nwho took me in hand and gave me a short course '
                   'on how to approach Congress people.\\nSo I got, called all '
                   'the other project directors, told t'},
    {   'answer': 'hereditary hemolytic\\nanemias',
        'context': 'tivities.\\nOver the years I focused my research primarily '
                   'on hereditary hemolytic\\nanemias, specifically '
                   'hemoglobinopathy, but I also had an interest '},
    {   'answer': ', etc., a',
        'context': 'nwent through the building and they were teaching '
                   "undergraduate English, etc., and I\

In [13]:
from haystack.utils import print_answers

# Same answers as before at the "medium" level of detail; use "minimum" for less or "all" for more.
print_answers(
    prediction2,
    details="medium",
)


Query: What did I study?
Answers:
[   {   'answer': 'respiratory diseases',
        'context': 'This was something I was quite interested in \\u2013 the '
                   'study of respiratory diseases. There \\nhad been massive '
                   'emigration from the Rhondda with the w',
        'score': 0.8970402479171753},
    {   'answer': 'how to approach Congress people',
        'context': 'g aides,\\nwho took me in hand and gave me a short course '
                   'on how to approach Congress people.\\nSo I got, called all '
                   'the other project directors, told t',
        'score': 0.8005848526954651},
    {   'answer': 'hereditary hemolytic\\nanemias',
        'context': 'tivities.\\nOver the years I focused my research primarily '
                   'on hereditary hemolytic\\nanemias, specifically '
                   'hemoglobinopathy, but I also had an interest ',
        'score': 0.7782573103904724},
    {   'answer': ', etc., a',
        'context

In [None]:
from pprint import pprint

# Third question, this time with top_k raised to 12 for both nodes.
# Note: this overwrites the `prediction2` result of the previous query.
prediction2 = pipe.run(
    query="What did the person work on?",
    params={
        "Retriever": {"top_k": 12},
        "Reader": {"top_k": 12},
    },
)

pprint(prediction2)

In [16]:
from haystack.utils import print_answers

# Printed at the "medium" level of detail; use "minimum" for less or "all" for more.
print_answers(
    prediction2,
    details="medium",
)


Query: What did the person work on?
Answers:
[   {   'answer': 'stools',
        'context': ' W. Brown]. Everyone\\ncalled him \\u201cStooly\\u201d '
                   'Brown. He worked on stools, but he had a lot '
                   'of\\ncharacter, a lot of interests, and people were ve',
        'score': 0.9801039695739746},
    {   'answer': 'it to me at all o',
        'context': '        "$oid": "633c397c8577ed72bce8d5f5"\n'
                   '    "title": "Carlson, Virgil 2002 -  Office of NIH '
                   'History and Stetten Museum",\n'
                   '    "permalink": "https://',
        'score': 0.9558579921722412},
    {   'answer': 'renal failure',
        'context': ' The biggest worry I\\nhad was my roommate. We were doing '
                   'the work on renal failure together,\\nCraig Canfield and '
                   'I. He had a gun that shot .45 caliber',
        'score': 0.921541690826416},
    {   'answer': 'rsus a more dis',
        'context': ' 