## Imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import json
import os
from subprocess import Popen, PIPE, STDOUT

from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.sparse import ElasticsearchRetriever
from haystack.pipeline import ExtractiveQAPipeline

03/06/2021 15:07:58 - INFO - faiss.loader -   Loading faiss with AVX2 support.
03/06/2021 15:07:58 - INFO - faiss.loader -   Loading faiss.
03/06/2021 15:07:59 - INFO - farm.modeling.prediction_head -   Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


## Load data

In [None]:
data = Path('./data/subjqa')
!ls {data}

books  electronics  grocery  movies  restaurants  tripadvisor


### Electronics

In [None]:
electronics_df = pd.read_csv(data/'electronics/splits/train.csv')
electronics_df.head()

Unnamed: 0,item_id,domain,nn_mod,nn_asp,query_mod,query_asp,q_review_id,q_reviews_id,question,question_subj_level,ques_subj_score,is_ques_subjective,review_id,review,human_ans_spans,human_ans_indices,answer_subj_level,ans_subj_score,is_ans_subjective
0,B006ZS5ATM,electronics,sore,ear,uncomfortable,headphone,adb20314dbbd8196b7e9fb587b78147f,6e1052529424f3a98d303380155c9dde,What do you think about headphone?,5,0.0,False,197e760a49907baeff809b2ccdfe466f,"I had a smaller headset that went on the ear. After a 8 - 10 hour workday, m...",ANSWERNOTFOUND,"(547, 561)",5,0.0,False
1,B0074BW614,electronics,perfect,size,single,complaint,991b7bc677086cc5ad62c997a35873ec,dc3a5caea480bb9e6f2af3fb84f7f2ac,What are complaint of these guys?,1,0.2,False,0e61b7c301c4881e6c53ebd9f678e84f,I really am enjoying my Kindle Fire HD. It does so many things. You can find...,ANSWERNOTFOUND,"(541, 555)",5,0.0,False
2,B001ELJER4,electronics,great,feature,several,feature,2a47f58ba5c0e3a9e7eade75d7db125d,eb8f94f30b27bec871c9256b4f9bccc2,How is the feature?,1,0.0,False,905de243991becdba0c365b595855452,Big complaint: Garmin needs to trim the product line and concentrate on sel...,ANSWERNOTFOUND,"(5262, 5276)",1,0.0,False
3,B00GP4BVTO,electronics,versatile,case,slim,profile,12264045ff398038d51f77c33433f4a9,eb0426464208a41782c22caa0df306ae,How about profile?,2,0.0,False,aafbb171c3904c5d6e6fcd3542b5d8fa,From the elegant box to the soft rubber like finish and everything in betwee...,ANSWERNOTFOUND,"(1091, 1105)",2,0.0,False
4,B001TH7T2U,electronics,digital,signal,perfect,image,81fa6a0ab1005f15b91e06c460f79c67,a5957740dca399a016a9585676aee4f5,Do you have good image quality?,1,0.6,True,86eb84edc027f7230a02b8858e795da1,AmazonBasics are a fantastic value for quality of cable you get. No need to...,ANSWERNOTFOUND,"(478, 492)",5,0.0,False


In [None]:
electronics_df["q_review_id"].nunique()

1295

In [None]:
electronics_df["q_reviews_id"].nunique()

1194

In [None]:
electronics_df["human_ans_spans"].value_counts()

ANSWERNOTFOUND                                                             1399
bass is solid and powerful                                                    3
comfortable                                                                   3
the sound quality is great                                                    3
I especially like the fact that the image is good from almost any angle       2
                                                                           ... 
the volume was just too low                                                   1
The bass is great                                                             1
The instructions were SUPER easy                                              1
per color availability                                                        1
It 's totally plug play and it detects when a device connected                1
Name: human_ans_spans, Length: 888, dtype: int64

In [None]:
electronics_df.shape

(2345, 19)

## Convert to SQuAD format

We need to convert the data into the following SQuAD-style JSON format:

```json
{
    "data": [
        {
            "title": "Beyoncé",
            "paragraphs": [
                {
                    "qas": [
                        {
                            "question": "When did Beyonce start becoming popular?",
                            "id": "56be85543aeaaa14008c9063",
                            "answers": [
                                {
                                    "text": "in the late 1990s",
                                    "answer_start": 269
                                }
                            ],
                            "is_impossible": false
                        }
                        ...
                    ],
                    "context": "Beyoncé ..."
                },
                ...
            ]
        }
    ]
}
```

In [None]:
# Load the raw SQuAD v2 training file under its own name. `data` already holds
# the dataset directory `Path` and is still needed for the `data/...` paths
# built in later cells, so it must not be rebound to this dict.
with open("data/squad/train-v2.0.json", 'r', encoding='utf-8') as f:
    squad_train_raw = json.load(f)

In [None]:
# for ex in data["data"]:
#     for qas in ex["paragraphs"]:
#         for x in qas["qas"]:
#             if x["is_impossible"] == True:
#                 print(x)
#                 print("\n\n")
#                 print(qas["context"])
                
#                 break

In [None]:
row = electronics_df.query("human_ans_spans != 'ANSWERNOTFOUND'").iloc[0]

In [None]:
row

item_id                                                                                     B003VAGXWK
domain                                                                                     electronics
nn_mod                                                                                           sleek
nn_asp                                                                                        keyboard
query_mod                                                                                        solid
query_asp                                                                                          key
q_review_id                                                           73e8277fbf438a7ade8f720ddf8a4f47
q_reviews_id                                                          55576d11e04159c488107b442aaff880
question                                                            How are the keys of the  keyboard?
question_subj_level                                                      

In [None]:
row["review"]

"I was reluctant to try a wireless keyboard, but due to a wire-chomping kitty, decided it was best to go wireless. I'm so glad I did. This keyboard is sleek and stylish. It has a great feel under my fingertips. I was concerned that a wireless keyboard would be &#34;buggy&#34; and not be efficient, but this keyboard is as good as any corded keyboard. It charges easily via USB port and holds a charge for about ten days. The illuminated keys are helpful, if, like me, your eyes aren't as young as they once were. I already had the logitech unifying plug that plugs into my computer for my mouse and touchpad. I turned the keyboard on and the Logitech plug recognized it right away. I highly recommend this keyboard. ANSWERNOTFOUND"

In [None]:
from datasets import load_dataset

squad = load_dataset("squad_v2")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1806.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=963.0, style=ProgressStyle(description_…


Downloading and preparing dataset squad_v2/squad_v2 (download: 44.34 MiB, generated: 122.57 MiB, post-processed: Unknown size, total: 166.91 MiB) to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/9cac55034b086140f0649ecb5c604d09d7da2f2f5b73a90caa2e2bcc1f5cac09...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=9551051.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=800683.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset squad_v2 downloaded and prepared to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/9cac55034b086140f0649ecb5c604d09d7da2f2f5b73a90caa2e2bcc1f5cac09. Subsequent calls will reuse this data.


In [None]:
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [None]:

from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display ``num_examples`` distinct, randomly chosen rows of ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of integer
            positions (the result is handed to ``pd.DataFrame``), and exposing
            a ``features`` mapping of column name to feature type.
        num_examples: Number of distinct rows to show.

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct positions in one call, replacing the manual
    # "retry until unseen" loop and its repeated list membership checks.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Replace integer class ids with their label names.
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            # Same decoding, applied element-wise to sequences of class ids.
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [None]:
show_random_elements(squad["train"])


Unnamed: 0,answers,context,id,question,title
0,"{'answer_start': [475], 'text': ['bamboo books']}","The traditional picture of an orderly series of scripts, each one invented suddenly and then completely displacing the previous one, has been conclusively demonstrated to be fiction by the archaeological finds and scholarly research of the later 20th and early 21st centuries. Gradual evolution and the coexistence of two or more scripts was more often the case. As early as the Shang dynasty, oracle-bone script coexisted as a simplified form alongside the normal script of bamboo books (preserved in typical bronze inscriptions), as well as the extra-elaborate pictorial forms (often clan emblems) found on many bronzes.",5726d6b4708984140094d2f1,What were preserved in typical bronze inscriptions?,Chinese_characters
1,"{'answer_start': [], 'text': []}","The gyromagnetic ratio γ is the constant of proportionality between the frequency ν of nuclear magnetic resonance (or electron paramagnetic resonance for electrons) and the applied magnetic field B: ν = γB. It is difficult to measure gyromagnetic ratios precisely because of the difficulties in precisely measuring B, but the value for protons in water at 7002298150000000000♠25 °C is known to better than one part per million. The protons are said to be ""shielded"" from the applied magnetic field by the electrons in the water molecule, the same effect that gives rise to chemical shift in NMR spectroscopy, and this is indicated by a prime on the symbol for the gyromagnetic ratio, γ′p. The gyromagnetic ratio is related to the shielded proton magnetic moment μ′p, the spin number I (I = 1⁄2 for protons) and the reduced Planck constant.",5a3ae27d3ff257001ab842eb,What ratio is the difference of proportionality between the frequency ν of nuclear magnetic resonance (or electron paramagnetic resonance for electrons) and the applied magnetic field?,Planck_constant
2,"{'answer_start': [453], 'text': ['the whole Bible']}","Although the Adventist churches hold much in common, their theologies differ on whether the intermediate state is unconscious sleep or consciousness, whether the ultimate punishment of the wicked is annihilation or eternal torment, the nature of immortality, whether or not the wicked are resurrected after the millennium, and whether the sanctuary of Daniel 8 refers to the one in heaven or one on earth. The movement has encouraged the examination of the whole Bible, leading Seventh-day Adventists and some smaller Adventist groups to observe the Sabbath. The General Conference of Seventh-day Adventists has compiled that church's core beliefs in the 28 Fundamental Beliefs (1980 and 2005), which use Biblical references as justification.",5731e27cb9d445190005e617,The Adventist movement has encouraged examining what in full?,Protestantism
3,"{'answer_start': [294], 'text': ['lacquer']}","The die is a negative image of the glass master: typically, several are made, depending on the number of pressing mills that are to make the CD. The die then goes into a press, and the physical image is transferred to the blank CD, leaving a final positive image on the disc. A small amount of lacquer is applied as a ring around the center of the disc, and rapid spinning spreads it evenly over the surface. Edge protection lacquer is applied before the disc is finished. The disc can then be printed and packed.",572f67c704bcaa1900d768e6,How is the positive image on a CD protected?,Compact_disc
4,"{'answer_start': [497], 'text': ['civilization, and its art, philosophy, architecture and literature would be instrumental in the formation and development of Western culture.']}","The classical period of Greek civilization covers a time spanning from the early 5th century BC to the death of Alexander the Great, in 323 BC (some authors prefer to split this period into 'Classical', from the end of the Persian wars to the end of the Peloponnesian War, and 'Fourth Century', up to the death of Alexander). It is so named because it set the standards by which Greek civilization would be judged in later eras. The Classical period is also described as the ""Golden Age"" of Greek civilization, and its art, philosophy, architecture and literature would be instrumental in the formation and development of Western culture.",572fb308b2c2fd140056837b,What did the Greeks do that made it possible for the expansion and growth of the opposite of the Eastern Civilization ?,Greeks
5,"{'answer_start': [], 'text': []}","Sunni Islam of the Hanafi school has been officially recognized by the government since 2009. Tajikistan considers itself a secular state with a Constitution providing for freedom of religion. The Government has declared two Islamic holidays, Id Al-Fitr and Idi Qurbon, as state holidays. According to a U.S. State Department release and Pew research group, the population of Tajikistan is 98% Muslim. Approximately 87%–95% of them are Sunni and roughly 3% are Shia and roughly 7% are non-denominational Muslims. The remaining 2% of the population are followers of Russian Orthodoxy, Protestantism, Zoroastrianism and Buddhism. A great majority of Muslims fast during Ramadan, although only about one third in the countryside and 10% in the cities observe daily prayer and dietary restrictions.",5aceec8532bba1001ae4b930,Who has declared three Islamic holidays?,Tajikistan
6,"{'answer_start': [41], 'text': ['more than 10 million volumes']}","Nanjing Library, founded in 1907, houses more than 10 million volumes of printed materials and is the third largest library in China, after the National Library in Beijing and Shanghai Library. Other libraries, such as city-owned Jinling Library and various district libraries, also provide considerable amount of information to citizens. Nanjing University Library is the second largest university libraries in China after Peking University Library, and the fifth largest nationwide, especially in the number of precious collections.",56e7b49000c9c71400d77527,How many volumes does the Nanjing Library have?,Nanjing
7,"{'answer_start': [55], 'text': ['Nazi Germany']}","After the Holocaust, which had been perpetrated by the Nazi Germany and its allies prior to and during World War II, Lemkin successfully campaigned for the universal acceptance of international laws defining and forbidding genocides. In 1946, the first session of the United Nations General Assembly adopted a resolution that ""affirmed"" that genocide was a crime under international law, but did not provide a legal definition of the crime. In 1948, the UN General Assembly adopted the Convention on the Prevention and Punishment of the Crime of Genocide (CPPCG) which defined the crime of genocide for the first time.",57335849d058e614000b5896,In which war-era country was the Holocaust immortalized?,Genocide
8,"{'answer_start': [384], 'text': ['Yerevan's FC Ararat']}","During Soviet rule, Armenian athletes rose to prominence winning plenty of medals and helping the USSR win the medal standings at the Olympics on numerous occasions. The first medal won by an Armenian in modern Olympic history was by Hrant Shahinyan, who won two golds and two silvers in gymnastics at the 1952 Summer Olympics in Helsinki. In football, their most successful team was Yerevan's FC Ararat, which had claimed most of the Soviet championships in the 70s and had also gone to post victories against professional clubs like FC Bayern Munich in the Euro cup.",573245c9e99e3014001e6616,Which Armenian football team was the most successful?,Armenians
9,"{'answer_start': [202], 'text': ['July 1940']}","After the ceasefire following the Fall of France in June 1940, Alsace was annexed to Germany and a rigorous policy of Germanisation was imposed upon it by the Gauleiter Robert Heinrich Wagner. When, in July 1940, the first evacuees were allowed to return, only residents of Alsatian origin were admitted. The last Jews were deported on 15 July 1940 and the main synagogue, a huge Romanesque revival building that had been a major architectural landmark with its 54-metre-high dome since its completion in 1897, was set ablaze, then razed.",5728016b4b864d19001641e7,When were the first evacuees allowed to return?,Strasbourg


## Warmup: no fine-tuning

Let's pick a single category like `Musical_Instruments` and build a `DataFrame` that has `asin`, `context` columns that we can use to create a simple QA system with an existing model fine-tuned on SQuAD:

In [None]:
# Lookup table: question id -> product category.
qid2category = meta_df.set_index("qid")["category"].to_dict()
qid2category[0]

'Toys_and_Games'

In [None]:
# Lookup table: question id -> product ASIN.
qid2asin = meta_df.set_index("qid")["asin"].to_dict()
qid2asin[0]

'B000MP20BU'

It seems that all SQuAD entries are answerable (does this make sense?). What about SQuAD v2 with impossible questions?

In [None]:
# Lookup table: question id -> `is_answerable` flag from the metadata frame.
qid2isanswer = meta_df.set_index("qid")["is_answerable"].to_dict()
qid2isanswer[4]

0

In [None]:
qid2asin[331392]

'B0057JCYYE'

In [None]:
# One (asin, context, qas, is_answerable) tuple per "Electronics" question.
rows = []

with open(data/'train-qar_squad.jsonl', 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        example = json.loads(line)
        # The category/ASIN lookups are keyed by the id of the first question.
        qid = example["qas"][0]["id"]
        if qid2category[qid] != "Electronics":
            continue
        rows.append((qid2asin[qid], example["context"], example["qas"], qid2isanswer[qid]))

455931it [00:25, 17910.73it/s]


In [None]:
qa_df = pd.DataFrame(rows, columns=['asin', 'text', "qas", 'is_answerable'])
qa_df.head()

Unnamed: 0,asin,text,qas,is_answerable
0,B00009R95M,This is a pretty cool filter. If you spin it around it will totally change t...,"[{'id': 604553, 'is_impossible': False, 'question': 'Does this come with a c...",1
1,B0051GN8GQ,so they fit well and function perfectly as workout headphones. BUT the littl...,"[{'id': 698250, 'is_impossible': False, 'question': 'Will these headphones w...",1
2,B00CQ35HBQ,The memory fit into my dell inspiron 15 laptop. The memory was installed and...,"[{'id': 639762, 'is_impossible': False, 'question': 'I have a new Dell Inspi...",1
3,B00BOYQH44,This is the best camera I have ever owned. I have shot over 800 pictures & h...,"[{'id': 701290, 'is_impossible': False, 'question': 'Does this camera have a...",1
4,B008HODL7K,"Great unit, really can't be beat for the price. Other reviews mentioned unev...","[{'id': 319235, 'is_impossible': False, 'question': 'Does this unit have a C...",1


In [None]:
qa_df['is_answerable'].value_counts()

1    108614
Name: is_answerable, dtype: int64

In [None]:
qa_df.shape

(108614, 4)

In [None]:
qa_df['asin'].nunique()

25301

### Boot ES

In [None]:
# Download (skipped if the archive is already present, via -nc) and unpack
# Elasticsearch 7.9.2, then hand the extracted tree to the `daemon` user.
! wget -nc https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.9.2

# Launch the server as a background child process; stderr is merged into the
# piped stdout.
# NOTE(review): nothing in the visible cells reads from this pipe — confirm the
# server cannot stall once the pipe buffer fills up.
es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],
                   stdout=PIPE, stderr=STDOUT,
                   # Switch the child to uid 1 before exec; presumably needed
                   # because Elasticsearch will not start as root — confirm.
                   preexec_fn=lambda: os.setuid(1)  # as daemon
                  )
# wait until ES has started
# (fixed 30-second pause; this is not an actual readiness check)
! sleep 30

In [None]:
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")

03/05/2021 15:11:00 - INFO - elasticsearch -   HEAD http://localhost:9200/ [status:200 request:0.094s]
03/05/2021 15:11:00 - INFO - elasticsearch -   HEAD http://localhost:9200/document [status:200 request:0.011s]
03/05/2021 15:11:00 - INFO - elasticsearch -   GET http://localhost:9200/document [status:200 request:0.004s]
03/05/2021 15:11:00 - INFO - elasticsearch -   PUT http://localhost:9200/document/_mapping [status:200 request:0.024s]
03/05/2021 15:11:00 - INFO - elasticsearch -   HEAD http://localhost:9200/label [status:200 request:0.002s]


In [None]:
document_store.delete_all_documents()

03/04/2021 21:20:03 - INFO - elasticsearch -   POST http://localhost:9200/document/_delete_by_query [status:200 request:0.970s]


### Index docs

In [None]:
def _as_document(record):
    """Wrap one ``qa_df`` row as a ``{"text": ..., "meta": {...}}`` document dict."""
    return {
        "text": record["text"],
        "meta": {"asin": record["asin"], "is_answerable": record["is_answerable"]},
    }


docs = [_as_document(record) for _, record in qa_df.iterrows()]
docs[0]

{'text': "Works perfectly and easy to use. Software download also great.The only surprise was that the one I ordered, (USB) doesn't work with an iPad.I was thinking it would work with both PC and iPad. My mistake. I use this with Logic Pro X on an iMac running Mavericks (it's replacing an Mbox) and with a Sony Vaio running Windows 7 and get excellent results (don't forget to install the Windows drivers or you'll run into latency issues). I also use it with the Auria App on my iPad Air. I did appreciate the direct line in switch...I could hear exactly what was being played into the unit without having to route through the computer. That was a nice feature. More recently, I was very happy to get this working with my ipad mini. I did purchase a recommended usb powered hub Belkin model &#34; F4U020&#34; and with that - I'm good to play music into and out of my ipad. Focusrite. An industry standard.I bought this specifically for use with an iPad to do mobile recording. The app I use is Auri

In [None]:
document_store.write_documents(docs)

03/05/2021 15:11:02 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.124s]
03/05/2021 15:11:04 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.382s]
03/05/2021 15:11:05 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.381s]
03/05/2021 15:11:06 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.281s]
03/05/2021 15:11:08 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.237s]
03/05/2021 15:11:09 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.254s]
03/05/2021 15:11:10 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.260s]
03/05/2021 15:11:12 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.275s]


### Retriever

In [None]:
retriever = ElasticsearchRetriever(document_store=document_store)

### Reader

In [None]:
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True, context_window_size=500)

03/06/2021 10:59:24 - INFO - farm.utils -   Using device: CUDA 
03/06/2021 10:59:24 - INFO - farm.utils -   Number of GPUs: 1
03/06/2021 10:59:24 - INFO - farm.utils -   Distributed Training: False
03/06/2021 10:59:24 - INFO - farm.utils -   Automatic Mixed Precision: None
Some weights of RobertaModel were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
03/06/2021 10:59:42 - INFO - farm.utils -   Using device: CUDA 
03/06/2021 10:59:42 - INFO - farm.utils -   Number of GPUs: 1
03/06/2021 10:59:42 - INFO - farm.utils -   Distributed Training: False
03/06/2021 10:59:42 - INFO - farm.utils -   Automatic Mixed Precision: None
03/06/2021 10:59:45 - INFO - farm.infer -   Got ya 7 parallel workers to do inference ...
03/06/2021 10:59:45 - INFO - farm.infer -    0    0    0

In [None]:
# check evaluation on SQuAD v2
reader_eval_results = reader.eval_on_file("data/squad", "dev-v2.0.json", device='cuda')

Preprocessing Dataset data/squad/dev-v2.0.json: 100%|██████████| 1204/1204 [00:07<00:00, 162.32 Dicts/s]
Evaluating: 100%|██████████| 274/274 [02:36<00:00,  1.75it/s]


In [None]:
# (label, key) pairs in the order they are reported.
reader_metrics = [
    ("Reader Top-N-Accuracy:", "top_n_accuracy"),
    # Exact Match: proportion of questions where the predicted answer is
    # exactly the same as the correct answer.
    ("Reader Exact Match:", "EM"),
    # F1-Score: average overlap between the predicted answers and the
    # correct answers.
    ("Reader F1-Score:", "f1"),
]
for metric_label, metric_key in reader_metrics:
    print(metric_label, reader_eval_results[metric_key])

Reader Top-N-Accuracy: 0.9746483618293608
Reader Exact Match: 0.7843005137707403
Reader F1-Score: 0.8260896852846605


In [None]:
# check evaluation on AmazonQA
reader_eval_results = reader.eval_on_file("data/amazon-qa", "val-qar_squad-music.json", device='cuda')

Preprocessing Dataset data/amazon-qa/val-qar_squad-music.json: 100%|██████████| 1150/1150 [00:03<00:00, 371.15 Dicts/s]
Evaluating: 100%|██████████| 133/133 [01:17<00:00,  1.72it/s]


In [None]:
# (label, key) pairs in the order they are reported.
reader_metrics = [
    ("Reader Top-N-Accuracy:", "top_n_accuracy"),
    # Exact Match: proportion of questions where the predicted answer is
    # exactly the same as the correct answer.
    ("Reader Exact Match:", "EM"),
    # F1-Score: average overlap between the predicted answers and the
    # correct answers.
    ("Reader F1-Score:", "f1"),
]
for metric_label, metric_key in reader_metrics:
    print(metric_label, reader_eval_results[metric_key])

Reader Top-N-Accuracy: 0.542608695652174
Reader Exact Match: 0.0008695652173913044
Reader F1-Score: 0.0752376647890378


In [None]:
pipe = ExtractiveQAPipeline(reader, retriever)

In [None]:
query = "Is a snare included?"
# DIY drumkit
asin = "B009VDW4OW"
number_of_answers_to_fetch = 3

# Restrict retrieval to documents of this product, then let the reader pick answers.
prediction = pipe.run(query=query, filters={"asin": [asin]}, top_k_retriever=10, top_k_reader=number_of_answers_to_fetch)
print(f"Question: {prediction['query']}")
print("\n")
# Walk over the answers that were actually returned instead of indexing
# 0..number_of_answers_to_fetch-1: the pipeline may return fewer answers than
# requested, and positional indexing would then raise IndexError.
for rank, answer in enumerate(prediction['answers'][:number_of_answers_to_fetch], start=1):
    print(f"#{rank}")
    print(f"Answer: {answer['answer']}")
    print(f"ASIN: {answer['meta']['asin']}")
    print(f"Is answerable?: {answer['meta']['is_answerable']}")
    print(f"Context: {answer['context']}")
    print('\n\n')

03/05/2021 14:39:19 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.088s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  7.25 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 12.66 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 12.14 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 15.83 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 15.97 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 15.93 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 18.37 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 18.49 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 15.88 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 15.86 Batches/s]

Question: Is a snare included?


#1
Answer: this one only came with one
ASIN: B009VDW4OW
Is answerable?: 1
Context: the correct sounds out of it. When I slapped the "bass", it would play a "snare" sound combined with the bass. When I slapped the "snare", I would just get a wood sound.I've also seen images that most cajons come with multiple snares... this one only came with one.I'm really not sure what else to say. I wanted a Cajon to play with.but didn't want to pay 100.00 plus. This was a great option, Easy to put together with the limited tools I had on hand. And cheap enough that I wasn't worried to have 



#2
Answer: this one only came with one
ASIN: B009VDW4OW
Is answerable?: 1
Context: the correct sounds out of it. When I slapped the "bass", it would play a "snare" sound combined with the bass. When I slapped the "snare", I would just get a wood sound.I've also seen images that most cajons come with multiple snares... this one only came with one.I'm really not sure what else to




## Fine-tuning

### Converting to the true SQuAD format

One problem with our SQuAD dataset is that it is composed of _line-separated_ JSON instead of the single JSON object that SQuAD traditionally uses. So instead of having examples like 

```json
{
    "context": "blah blah",
    "qas": [
        {
            "id": 331392,
            "is_impossible": false,
            "question": "blah blah?",
            "answers": [
                {
                    "answer_start": 2881,
                    "text": "blah blah"
                },
                ...
            ],
            "human_answers": [
                "blah blah",
                ...
            ]
        }
    ]
}
```

what we really need is a JSON of the form

```json
{
    "data": [
        {
            "title": "Beyoncé",
            "paragraphs": [
                {
                    "qas": [
                        {
                            "question": "When did Beyonce start becoming popular?",
                            "id": "56be85543aeaaa14008c9063",
                            "answers": [
                                {
                                    "text": "in the late 1990s",
                                    "answer_start": 269
                                }
                            ],
                            "is_impossible": false
                        }
                        ...
                    ],
                    "context": "Beyoncé ..."
                },
                ...
            ]
        }
    ]
}
```

Let's write a function that does the conversion for us. To warm up, let's first load a few examples from the training set:

In [None]:
# Collect training examples that belong to two specific products, stopping
# once five have been gathered.
warmup_asins = ("B0057JCYYE", "B00F9ECDRU")
examples = []

with open(data/"train-qar_squad.jsonl", 'r', encoding='utf-8') as f:
    for raw_line in f:
        ex = json.loads(raw_line)
        # The ASIN is looked up through the id of the example's first question.
        if qid2asin[ex["qas"][0]["id"]] in warmup_asins:
            examples.append(ex)
        if len(examples) >= 5:
            break
examples

We don't need the human answers, but we do need the mapping from `qid` to `asin` so that we can collect all questions together that belong to the same product.

In [None]:
# Group paragraphs by product: asin -> list of {"qas": [...], "context": ...}.
asin2qas = {}
seen_asin = set()

for ex in examples:
    first_qa = ex["qas"][0]
    asin = qid2asin[first_qa["id"]]
    # Keep every question field except the free-form human answers.
    qa_fields = {field: value for field, value in first_qa.items() if field != "human_answers"}
    paragraph = {"qas": [qa_fields], "context": ex["context"]}

    if asin not in seen_asin:
        seen_asin.add(asin)
        asin2qas[asin] = []
    asin2qas[asin].append(paragraph)


# asin2qas

In [None]:
# One SQuAD "data" entry per product: the ASIN is the title and its grouped
# paragraphs form the body.
squad_data = [
    {"title": product_asin, "paragraphs": paragraphs}
    for product_asin, paragraphs in asin2qas.items()
]

squad_data

In [None]:
squad_dict = {"data": squad_data}

In [None]:
with open(data/"train-qar_squad.json", 'w', encoding='utf-8') as f:
    json.dump(squad_dict, f)

In [None]:
# pick out answer fields
with open(data/"val-qar_squad.jsonl", 'r', encoding='utf-8') as f:
    for line in f:
        ex = json.loads(line)
        break

In [None]:
[k for k in ex["qas"][0].keys() if k.startswith("answers")]

['answers_snippet_spans_bleu2',
 'answers_snippet_spans_bleu4',
 'answers_snippet_spans_rouge',
 'answers_sentence_ir',
 'answers_sentence_bleu2',
 'answers_sentence_bleu4']

In [None]:
def convert_to_squad_format(input_file: Path, output_file: Path, category: str = "Musical_Instruments",
                            answer_field: str = "answers_sentence_bleu4"):
    """Convert a JSON-lines QA file into a single SQuAD-style JSON file for one category.

    Every non-blank line of ``input_file`` is parsed as a JSON record with a
    ``"context"`` and a ``"qas"`` list. Only the first question of each record is
    used. Records whose question id maps to ``category`` (via the notebook-level
    ``qid2category``) are grouped by product using ``qid2asin``; each product
    becomes one entry ``{"title": <asin>, "paragraphs": [...]}`` under the
    top-level ``"data"`` key of the output.

    Args:
        input_file: Path of the ``.jsonl`` file to read.
        output_file: Path of the ``.json`` file to write (overwritten if present).
        category: Only questions whose ``qid2category`` entry equals this value are kept.
        answer_field: Which of the candidate answer fields is exposed as the
            SQuAD ``"answers"`` key. The other candidate fields are dropped.

    Raises:
        ValueError: If ``answer_field`` is not one of the known answer fields.
        KeyError: If a question id is missing from ``qid2category`` / ``qid2asin``.
    """
    answer_fields = ('answers_snippet_spans_bleu2', 'answers_snippet_spans_bleu4',
                     'answers_snippet_spans_rouge', 'answers_sentence_ir',
                     'answers_sentence_bleu2', 'answers_sentence_bleu4')
    if answer_field not in answer_fields:
        raise ValueError(f"answer_field must be one of {answer_fields}, got {answer_field!r}")

    asin2qas = {}

    with open(input_file, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at the end of the file).
                continue
            row = json.loads(line)
            first_qa = row["qas"][0]
            qid = first_qa["id"]
            if qid2category[qid] != category:
                continue

            # Renaming *every* candidate field to "answers" would make them collide
            # in the dict and silently keep whichever came last, so exactly one
            # field is selected and the remaining candidates are left out.
            qa = {}
            for k, v in first_qa.items():
                if k == answer_field:
                    qa["answers"] = v
                elif k not in answer_fields:
                    qa[k] = v

            paragraph = {"qas": [qa], "context": row["context"]}
            asin2qas.setdefault(qid2asin[qid], []).append(paragraph)

    squad_dict = {
        "data": [{"title": asin, "paragraphs": paragraphs} for asin, paragraphs in asin2qas.items()]
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(squad_dict, f)

In [None]:
# Convert the training split, keeping only questions from the chosen category.
category = "Electronics"
train_output = data/f'train-qar_squad-{category.lower()}.json'
convert_to_squad_format(data/'train-qar_squad.jsonl', train_output, category)

455931it [00:13, 34990.08it/s]


In [None]:
# Same conversion for the validation split; `category` comes from the previous cell.
val_output = data/f'val-qar_squad-{category.lower()}.json'
convert_to_squad_format(data/'val-qar_squad.jsonl', val_output, category)

58969it [00:03, 15988.79it/s]


### Load single example

In [None]:
# NOTE(review): the variable is named `val_df`, but the file read here is the
# *train* electronics split — confirm which of the two is intended.
val_df = pd.read_json(data/'train-qar_squad-electronics.json')

In [None]:
# The converted file has a single top-level "data" key, so the frame has one
# `data` column whose rows each hold one product entry (title + paragraphs).
val_df

Unnamed: 0,data
0,"{'title': 'B00009R95M', 'paragraphs': [{'qas': [{'id': 604553, 'is_impossibl..."
1,"{'title': 'B0051GN8GQ', 'paragraphs': [{'qas': [{'id': 698250, 'is_impossibl..."
2,"{'title': 'B00CQ35HBQ', 'paragraphs': [{'qas': [{'id': 639762, 'is_impossibl..."
3,"{'title': 'B00BOYQH44', 'paragraphs': [{'qas': [{'id': 701290, 'is_impossibl..."
4,"{'title': 'B008HODL7K', 'paragraphs': [{'qas': [{'id': 319235, 'is_impossibl..."
...,...
25296,"{'title': 'B005LLFY5Y', 'paragraphs': [{'qas': [{'id': 212671, 'is_impossibl..."
25297,"{'title': 'B0053QC0EU', 'paragraphs': [{'qas': [{'id': 596763, 'is_impossibl..."
25298,"{'title': 'B0068PVBLS', 'paragraphs': [{'qas': [{'id': 525680, 'is_impossibl..."
25299,"{'title': 'B009I9MX5Y', 'paragraphs': [{'qas': [{'id': 632546, 'is_impossibl..."


### Fine-tune model

Either something is wrong with my data preparation or getting the model to generalise is _hard_!

In [None]:
# Baseline reader: the `distilbert-base-uncased-distilled-squad` checkpoint, run on the GPU.
reader = FARMReader(
    model_name_or_path="distilbert-base-uncased-distilled-squad",
    context_window_size=500,
    use_gpu=True,
)

03/06/2021 14:57:00 - INFO - farm.utils -   Using device: CUDA 
03/06/2021 14:57:00 - INFO - farm.utils -   Number of GPUs: 1
03/06/2021 14:57:00 - INFO - farm.utils -   Distributed Training: False
03/06/2021 14:57:00 - INFO - farm.utils -   Automatic Mixed Precision: None
03/06/2021 14:57:10 - INFO - farm.utils -   Using device: CUDA 
03/06/2021 14:57:10 - INFO - farm.utils -   Number of GPUs: 1
03/06/2021 14:57:10 - INFO - farm.utils -   Distributed Training: False
03/06/2021 14:57:10 - INFO - farm.utils -   Automatic Mixed Precision: None
03/06/2021 14:57:11 - INFO - farm.infer -   Got ya 7 parallel workers to do inference ...
03/06/2021 14:57:11 - INFO - farm.infer -    0    0    0    0    0    0    0 
03/06/2021 14:57:11 - INFO - farm.infer -   /w\  /w\  /w\  /w\  /w\  /w\  /w\
03/06/2021 14:57:11 - INFO - farm.infer -   /'\  / \  /'\  /'\  / \  / \  /'\
03/06/2021 14:57:11 - INFO - farm.infer -               


In [None]:
# Directory passed as the data directory to `reader.train` and `eval_on_file` below.
train_data = "data/amazon-qa/"

In [None]:
# Fine-tune the reader for one epoch on the electronics split, using the
# electronics validation file as the dev set; the result is saved to `save_dir`.
reader.train(
    data_dir=train_data,
    train_filename="train-qar_squad-electronics.json",
    dev_filename="val-qar_squad-electronics.json",
    n_epochs=1,
    batch_size=64,
    evaluate_every=1000,
    save_dir="models/haystack/",
    use_gpu=True,
)

03/06/2021 14:57:17 - INFO - farm.utils -   Using device: CUDA 
03/06/2021 14:57:17 - INFO - farm.utils -   Number of GPUs: 1
03/06/2021 14:57:17 - INFO - farm.utils -   Distributed Training: False
03/06/2021 14:57:17 - INFO - farm.utils -   Automatic Mixed Precision: None
Preprocessing Dataset data/amazon-qa/train-qar_squad-electronics.json:  88%|████████▊ | 96000/108614 [01:36<00:11, 1131.67 Dicts/s]Token indices sequence length is longer than the specified maximum sequence length for this model (589 > 512). Running this sequence through the model will result in indexing errors
Preprocessing Dataset data/amazon-qa/train-qar_squad-electronics.json: 100%|██████████| 108614/108614 [01:41<00:00, 1068.48 Dicts/s]
Preprocessing Dataset data/amazon-qa/val-qar_squad-electronics.json: 100%|██████████| 13647/13647 [00:15<00:00, 889.83 Dicts/s] 
03/06/2021 14:59:31 - INFO - farm.modeling.optimization -   Loading optimizer `TransformersAdamW`: '{'correct_bias': False, 'weight_decay': 0.01, 'lr':

In [None]:
# Load a reader from "models/haystack", the `save_dir` given to `reader.train` above.
new_reader = FARMReader(model_name_or_path="models/haystack")

03/06/2021 12:40:51 - INFO - farm.utils -   Using device: CUDA 
03/06/2021 12:40:51 - INFO - farm.utils -   Number of GPUs: 1
03/06/2021 12:40:51 - INFO - farm.utils -   Distributed Training: False
03/06/2021 12:40:51 - INFO - farm.utils -   Automatic Mixed Precision: None
03/06/2021 12:40:54 - INFO - farm.utils -   Using device: CUDA 
03/06/2021 12:40:54 - INFO - farm.utils -   Number of GPUs: 1
03/06/2021 12:40:54 - INFO - farm.utils -   Distributed Training: False
03/06/2021 12:40:54 - INFO - farm.utils -   Automatic Mixed Precision: None
03/06/2021 12:40:55 - INFO - farm.infer -   Got ya 7 parallel workers to do inference ...
03/06/2021 12:40:55 - INFO - farm.infer -    0    0    0    0    0    0    0 
03/06/2021 12:40:55 - INFO - farm.infer -   /w\  /w\  /w\  /w\  /w\  /w\  /w\
03/06/2021 12:40:55 - INFO - farm.infer -   /'\  / \  /'\  /'\  / \  / \  /'\
03/06/2021 12:40:55 - INFO - farm.infer -               


In [None]:
# Evaluate the fine-tuned reader on the music validation file. The data
# directory comes from `train_data` so the path is defined in one place only.
reader_eval_results = new_reader.eval_on_file(train_data, "val-qar_squad-music.json", device='cuda')

Preprocessing Dataset data/amazon-qa/val-qar_squad-music.json: 100%|██████████| 1150/1150 [00:02<00:00, 390.39 Dicts/s]
Evaluating: 100%|██████████| 133/133 [01:17<00:00,  1.71it/s]


In [None]:
# (label, key) pairs for the metrics reported by `eval_on_file`:
#   EM - proportion of questions where the predicted answer is exactly the same as the correct answer
#   f1 - average overlap between the predicted answers and the correct answers
metric_labels = (
    ("Reader Top-N-Accuracy:", "top_n_accuracy"),
    ("Reader Exact Match:", "EM"),
    ("Reader F1-Score:", "f1"),
)
for label, key in metric_labels:
    print(label, reader_eval_results[key])

Reader Top-N-Accuracy: 0.7417391304347826
Reader Exact Match: 0.0
Reader F1-Score: 0.0


In [None]:
# Chain the retriever and the fine-tuned reader into one question-answering pipeline.
pipe = ExtractiveQAPipeline(reader=new_reader, retriever=retriever)

In [None]:
query = "Is a snare included?"
# DIY drumkit
asin = "B009VDW4OW"
number_of_answers_to_fetch = 3

# Restrict retrieval to documents of this product, then let the reader pick the top answers.
prediction = pipe.run(query=query, filters={"asin": [asin]}, top_k_retriever=10, top_k_reader=number_of_answers_to_fetch)
print(f"Question: {prediction['query']}")
print("\n")
# Loop over the answers that were actually returned: indexing with
# range(number_of_answers_to_fetch) raises IndexError when fewer come back.
for rank, answer in enumerate(prediction['answers'][:number_of_answers_to_fetch], start=1):
    print(f"#{rank}")
    print(f"Answer: {answer['answer']}")
    print(f"ASIN: {answer['meta']['asin']}")
    print(f"Is answerable?: {answer['meta']['is_answerable']}")
    print(f"Context: {answer['context']}")
    print('\n\n')

Traceback (most recent call last):
  File "/root/miniconda3/envs/transformerlab/lib/python3.8/site-packages/urllib3/connection.py", line 156, in _new_conn
    conn = connection.create_connection(
  File "/root/miniconda3/envs/transformerlab/lib/python3.8/site-packages/urllib3/util/connection.py", line 84, in create_connection
    raise err
  File "/root/miniconda3/envs/transformerlab/lib/python3.8/site-packages/urllib3/util/connection.py", line 74, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/root/miniconda3/envs/transformerlab/lib/python3.8/site-packages/elasticsearch/connection/http_urllib3.py", line 245, in perform_request
    response = self.pool.urlopen(
  File "/root/miniconda3/envs/transformerlab/lib/python3.8/site-packages/urllib3/connectionpool.py", line 719, in urlopen
    retries = retries.increment(
  File "/ro

ConnectionError: ConnectionError(<urllib3.connection.HTTPConnection object at 0x7ff914bc2d00>: Failed to establish a new connection: [Errno 111] Connection refused) caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x7ff914bc2d00>: Failed to establish a new connection: [Errno 111] Connection refused)

### Evaluation

#### New reader

In [None]:
# Score the fine-tuned reader on the music validation file.
reader_eval_results = new_reader.eval_on_file(train_data, "val-qar_squad-music.json", device='cuda')

# (label, key) pairs for the reported metrics:
#   top_n_accuracy - proportion of predicted answers that match with their corresponding correct answer
#   EM             - proportion of questions where the predicted answer is exactly the same as the correct answer
#   f1             - average overlap between the predicted answers and the correct answers
metric_labels = (
    ("Reader Top-N-Accuracy:", "top_n_accuracy"),
    ("Reader Exact Match:", "EM"),
    ("Reader F1-Score:", "f1"),
)
for label, key in metric_labels:
    print(label, reader_eval_results[key])

Preprocessing Dataset data/amazon-qa/val-qar_squad-music.json: 100%|██████████| 1828/1828 [00:03<00:00, 507.82 Dicts/s]
Evaluating: 100%|██████████| 238/238 [02:16<00:00,  1.74it/s]


Reader Top-N-Accuracy: 0.5
Reader Exact Match: 0.0
Reader F1-Score: 0.0


#### SQuAD reader

In [None]:
# Score the `reader` object on the music file for comparison.
# NOTE(review): this uses the *train* music file, whereas the fine-tuned reader
# is scored on the *val* music file — confirm the comparison is intended.
reader_eval_results = reader.eval_on_file(train_data, "train-qar_squad-music.json", device='cuda')

# (label, key) pairs for the reported metrics:
#   top_n_accuracy - proportion of predicted answers that match with their corresponding correct answer
#   EM             - proportion of questions where the predicted answer is exactly the same as the correct answer
#   f1             - average overlap between the predicted answers and the correct answers
metric_labels = (
    ("Reader Top-N-Accuracy:", "top_n_accuracy"),
    ("Reader Exact Match:", "EM"),
    ("Reader F1-Score:", "f1"),
)
for label, key in metric_labels:
    print(label, reader_eval_results[key])

Preprocessing Dataset data/amazon-qa/train-qar_squad-music.json: 100%|██████████| 2100/2100 [00:03<00:00, 664.52 Dicts/s]
Evaluating: 100%|██████████| 210/210 [02:00<00:00,  1.75it/s]


Reader Top-N-Accuracy: 0.0
Reader Exact Match: 0.0
Reader F1-Score: 0.0
