In [1]:
import logging

logging.basicConfig(
    format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING
)
logging.getLogger("haystack").setLevel(logging.INFO)

In [2]:
from haystack.utils import launch_es

launch_es()

In [4]:
import os
from haystack.document_stores import ElasticsearchDocumentStore

host = os.environ.get("ELASTICSEARCH_HOST", "localhost")

document_index = "document"
document_store = ElasticsearchDocumentStore(
    host=host, username="", password="", index=document_index
)

In [6]:
from haystack.utils import fetch_archive_from_http

doc_dir = "data/test_tables"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/table_text_dataset.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

INFO - haystack.utils.import_utils -  Fetching from https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/table_text_dataset.zip to 'data/test_tables'


True

In [7]:
import json
from haystack import Document
import pandas as pd


def read_tables(filename):
    processed_tables = []
    with open(filename) as tables:
        tables = json.load(tables)
        for key, table in tables.items():
            current_columns = table["header"]
            current_rows = table["data"]
            current_df = pd.DataFrame(columns=current_columns, data=current_rows)
            document = Document(content=current_df, content_type="table", id=key)
            processed_tables.append(document)

    return processed_tables


tables = read_tables(f"{doc_dir}/tables.json")
document_store.write_documents(tables, index=document_index)

print(tables[0].content)
print(tables[0].meta)

                Opponent    M    W    L  T  NR   Win% First  Last
0            Afghanistan    2    2    0  0   0  100.0  2012  2014
1              Australia   98   32   62  1   3  34.21  1975  2017
2             Bangladesh   35   31    4  0   0  88.57  1986  2015
3                 Canada    2    2    0  0   0  100.0  1979  2011
4                England   82   31   49  0   2  38.75  1974  2017
5              Hong Kong    2    2    0  0   0  100.0  2004  2008
6                  India  129   73   52  0   4   58.4  1978  2017
7                Ireland    7    5    1  1   0  78.57  2007  2016
8                  Kenya    6    6    0  0   0  100.0  1996  2011
9                Namibia    1    1    0  0   0  100.0  2003  2003
10           Netherlands    3    3    0  0   0  100.0  1996  2003
11           New Zealand  103   53   47  1   2  52.97  1973  2018
12              Scotland    3    3    0  0   0  100.0  1999  2013
13          South Africa   73   25   47  0   1  34.72  1992  2017
14        

In [8]:
from haystack.nodes.retriever import EmbeddingRetriever

retriever = EmbeddingRetriever(
    document_store=document_store, embedding_model="deepset/all-mpnet-base-v2-table"
)

INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

INFO - haystack.nodes.retriever.dense -  Init retriever using embeddings of model deepset/all-mpnet-base-v2-table


Downloading (…)9b424/.gitattributes:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)5a16e9b424/README.md:   0%|          | 0.00/2.42k [00:00<?, ?B/s]

Downloading (…)16e9b424/config.json:   0%|          | 0.00/576 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)9b424/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

Downloading (…)5a16e9b424/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)6e9b424/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [9]:
document_store.update_embeddings(retriever=retriever)

INFO - haystack.document_stores.search_engine -  Updating embeddings for all 3235 docs ...


Updating embeddings:   0%|          | 0/3235 [00:00<?, ? Docs/s]

Batches:   0%|          | 0/102 [00:00<?, ?it/s]

In [10]:
retrieved_tables = retriever.retrieve("Who won the Super Bowl?", top_k=5)

print(retrieved_tables[0].content)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

                     Year                   Coach              Super Bowl   
0                    1966          Vince Lombardi                       I  \
1                    1967          Vince Lombardi                      II   
2                    1996           Mike Holmgren                    XXXI   
3                    2010           Mike McCarthy                     XLV   
4  Total Super Bowls won:  Total Super Bowls won:  Total Super Bowls won:   

                  Location                Opponent  Score Record  
0  Los Angeles, California      Kansas City Chiefs  35–10   12–2  
1           Miami, Florida         Oakland Raiders  33–14  9–4–1  
2   New Orleans, Louisiana    New England Patriots  35–21   13–3  
3         Arlington, Texas     Pittsburgh Steelers  31–25   10–6  
4   Total Super Bowls won:  Total Super Bowls won:      4      4  


In [11]:
from haystack.nodes import TableReader

reader = TableReader(
    model_name_or_path="google/tapas-base-finetuned-wtq", max_seq_len=512
)

INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/262k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/490 [00:00<?, ?B/s]

In [12]:
table_doc = document_store.get_document_by_id("36964e90-3735-4ba1-8e6a-bec236e88bb2")
print(table_doc.content)

                  Name        Program           Role       Salary     Year   
0         Simon Cowell   The X Factor          Judge  $75 million  2012–13  \
1       Britney Spears  American Idol    $25 million      2017–18     [15]   
2       Jennifer Lopez    $20 million        2011–12         [16]      nan   
3         Mariah Carey    $18 million        2012–13         [17]      nan   
4          Hugh Laurie          House  Gregory House  $15 million     2013   
5        Ryan Seacrest  American Idol           Host      2013–16     [14]   
6           Katy Perry   The X Factor          Judge         2012     [17]   
7          Miley Cyrus      The Voice          Coach  $13 million  2016–17   
8          Adam Levine        2016–18           [18]          nan      nan   
9        Blake Shelton        2016–18           [18]          nan      nan   
10  Christina Aguilera  $12.5 million           2013         [19]      nan   
11      Kelly Clarkson    $12 million           2018         [20



In [13]:
from haystack.utils import print_answers

prediction = reader.predict(
    query="Who played Gregory House in the series House?", documents=[table_doc]
)
print_answers(prediction, details="all")

'Query: Who played Gregory House in the series House?'
'Answers:'
[   <Answer {'answer': 'Hugh Laurie', 'type': 'extractive', 'score': 1.0, 'context':                   Name        Program           Role       Salary     Year   
0         Simon Cowell   The X Factor          Judge  $75 million  2012–13  \
1       Britney Spears  American Idol    $25 million      2017–18     [15]   
2       Jennifer Lopez    $20 million        2011–12         [16]      nan   
3         Mariah Carey    $18 million        2012–13         [17]      nan   
4          Hugh Laurie          House  Gregory House  $15 million     2013   
5        Ryan Seacrest  American Idol           Host      2013–16     [14]   
6           Katy Perry   The X Factor          Judge         2012     [17]   
7          Miley Cyrus      The Voice          Coach  $13 million  2016–17   
8          Adam Levine        2016–18           [18]          nan      nan   
9        Blake Shelton        2016–18           [18]          nan    

In [14]:
print(f"Predicted answer: {prediction['answers'][0].answer}")
print(f"Meta field: {prediction['answers'][0].meta}")

Predicted answer: Hugh Laurie
Meta field: {'aggregation_operator': 'NONE', 'answer_cells': ['Hugh Laurie']}


In [15]:
from haystack import Pipeline

table_qa_pipeline = Pipeline()
table_qa_pipeline.add_node(
    component=retriever, name="EmbeddingRetriever", inputs=["Query"]
)
table_qa_pipeline.add_node(
    component=reader, name="TableReader", inputs=["EmbeddingRetriever"]
)

In [16]:
prediction = table_qa_pipeline.run(
    "When was Guilty Gear Xrd : Sign released?", params={"top_k": 30}
)
print_answers(prediction, details="minimum")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



'Query: When was Guilty Gear Xrd : Sign released?'
'Answers:'
[   {   'answer': 'December 16 , 2014',
        'context':                                         Title       First release   
0                      Guilty Gear Xrd : Sign  December 16 , 2014  \
1          BlazBlue : Chrono Phantasma Extend      June 30 , 2015   
2         Aegis of Earth : Protonovus Assault     March 15 , 2016   
3                  BlazBlue : Central Fiction    October 6 , 2016   
4     Chronicles of Teddy : Harmony of Exidus     March 29 , 2016   
5                 Guilty Gear Xrd : Revelator       June 7 , 2016   
6   Exist Archive : The Other Side of the Sky   October 18 , 2016   
7                     Guilty Gear Xrd : Rev 2       May 25 , 2017   
8      Under Night In-Birth Exe : Late [ st ]           Late 2017   
9                   School Girl/Zombie Hunter           Late 2017   
10                           Tokyo Xanadu eX+                2017   
11       Code : Realize ~Bouquet of Rainbows~      

In [17]:
def read_texts(filename):
    processed_passages = []
    with open(filename) as passages:
        passages = json.load(passages)
        for key, content in passages.items():
            document = Document(content=content, content_type="text", id=key)
            processed_passages.append(document)

    return processed_passages


passages = read_texts(f"{doc_dir}/texts.json")
document_store.write_documents(passages, index=document_index)



In [18]:
from haystack.nodes import FARMReader, RouteDocuments, JoinAnswers

text_reader = FARMReader("deepset/roberta-base-squad2")
table_reader = TableReader("deepset/tapas-large-nq-hn-reader")
route_documents = RouteDocuments()
join_answers = JoinAnswers()

INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1
INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1
INFO - haystack.modeling.model.language_model -   * LOADING MODEL: 'deepset/roberta-base-squad2' (Roberta)
INFO - haystack.modeling.model.language_model -  Auto-detected model language: english
INFO - haystack.modeling.model.language_model -  Loaded 'deepset/roberta-base-squad2' (Roberta model) from model hub.
INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1
INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

In [19]:
text_table_qa_pipeline = Pipeline()
text_table_qa_pipeline.add_node(
    component=retriever, name="EmbeddingRetriever", inputs=["Query"]
)
text_table_qa_pipeline.add_node(
    component=route_documents, name="RouteDocuments", inputs=["EmbeddingRetriever"]
)
text_table_qa_pipeline.add_node(
    component=text_reader, name="TextReader", inputs=["RouteDocuments.output_1"]
)
text_table_qa_pipeline.add_node(
    component=table_reader, name="TableReader", inputs=["RouteDocuments.output_2"]
)
text_table_qa_pipeline.add_node(
    component=join_answers, name="JoinAnswers", inputs=["TextReader", "TableReader"]
)

In [20]:
predictions = text_table_qa_pipeline.run(query="Who was Thomas Alva Edison?")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

In [21]:
print_answers(predictions, details="minimum")

'Query: Who was Thomas Alva Edison?'
'Answers:'
[   {   'answer': 'John Béchervaise , OAM , MBE',
        'context':                             Name   
0                 Amanda Barnard  \
1                 Martin G. Bean   
2                Gordon S. Brown   
3   John Béchervaise , OAM , MBE   
4               Megan Clark , AC   
5          J. Donald R. de Raadt   
6              Graham Dorrington   
7             Dennis Gibson , AO   
8              Ranulph Glanville   
9              Alfred Gottschalk   
10         Ann Henderson-Sellers   
11                Arthur R. Hogg   
12        Kourosh Kalantar-zadeh   
13                 Richard Kaner   
14                Lakshmi Kantam   
15                William Kernot   
16             Sir Albert Kitson   
17                   David Malin   
18           Henry Millicer , AM   
19                Luca Marmorini   

                           Association with RMIT   
0             B Sci ( AppPhysics ) ( Hon ) , PhD  \
1                     

In [22]:
predictions = text_table_qa_pipeline.run(
    query="Which country does the film Macaroni come from?"
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



In [23]:
print_answers(predictions, details="minimum")

'Query: Which country does the film Macaroni come from?'
'Answers:'
[   {   'answer': 'Italian',
        'context':    Submitting country Film title used in nomination        Language ( s )   
0           Argentina            The Official Story               Spanish  \
1             Austria                       Malambo                German   
2             Belgium                          Dust                French   
3              Canada          Jacques and November                French   
4      Czechoslovakia              Scalpel , Please                 Czech   
5             Denmark               Twist and Shout                Danish   
6              France        Three Men and a Cradle                French   
7        West Germany                 Angry Harvest                German   
8             Hungary                  Colonel Redl                German   
9             Iceland                   Deep Winter             Icelandic   
10              India                

In [24]:
from haystack import Label, MultiLabel, Answer


def read_labels(filename, tables):
    processed_labels = []
    with open(filename) as labels:
        labels = json.load(labels)
        for table in tables:
            if table.id not in labels:
                continue
            label = labels[table.id]
            label = Label(
                query=label["query"],
                document=table,
                is_correct_answer=True,
                is_correct_document=True,
                answer=Answer(answer=label["answer"]),
                origin="gold-label",
            )
            processed_labels.append(MultiLabel(labels=[label]))
    return processed_labels


table_labels = read_labels(f"{doc_dir}/labels.json", tables)
passage_labels = read_labels(f"{doc_dir}/labels.json", passages)

In [25]:
eval_results = text_table_qa_pipeline.eval(
    table_labels + passage_labels, params={"top_k": 10}
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

In [26]:
print(eval_results.calculate_metrics())

{'EmbeddingRetriever': {'recall_multi_hit': 0.3, 'recall_single_hit': 0.3, 'precision': 0.030000000000000006, 'map': 0.25833333333333336, 'mrr': 0.25833333333333336, 'ndcg': 0.2678103593554011}, 'TextReader': {'exact_match': 0.0, 'f1': 0.03666666666666667, 'num_examples_for_eval': 20.0}, 'TableReader': {'exact_match': 0.05, 'f1': 0.07857142857142858, 'num_examples_for_eval': 20.0}, 'JoinAnswers': {'exact_match': 0.05, 'f1': 0.09857142857142857, 'num_examples_for_eval': 20.0}}
