# Load Train Dataset

In [None]:
import csv
import requests

# load training dataset
def load_data():
    """Download the training CSV and print a short summary of it.

    The file is fetched over HTTP from a fixed Google Drive link and parsed
    with ``csv.reader``. The first row of the file is the column header
    (index, Theme, Paragraph, Question, Answer_possible, Answer_text,
    Answer_start). It is kept in the returned list, but it is excluded from
    the printed statistics so it is not counted as an answerable example.

    Returns:
        list[list[str]]: every CSV row as a list of strings, header row first.

    Raises:
        requests.HTTPError: if the download returns an HTTP error status.
    """
    import io  # local import: only needed to wrap the downloaded text

    CSV_URL = 'https://drive.google.com/u/0/uc?id=1Z-yb752A3o7b9dqrGt24XU0sl53FVqya&export=download'

    with requests.Session() as s:
        download = s.get(CSV_URL)
        # Fail loudly instead of parsing an HTTP error page as CSV.
        download.raise_for_status()
        decoded_content = download.content.decode('utf-8')

    # Hand the csv module a file-like object opened with newline='' so that
    # quoted fields keep their exact characters. str.splitlines() would also
    # split on characters such as U+2028 or form feed inside a paragraph.
    cr = csv.reader(io.StringIO(decoded_content, newline=''), delimiter=',')
    train_data = list(cr)

    # Row 0 is the header, so the statistics only look at the rows after it.
    examples = train_data[1:]
    print(f"Number of examples = {len(examples)}")
    noans = sum(1 for x in examples if x[4] == 'False')
    ans = len(examples) - noans
    print(f"\tAnswerable questions = {ans}")
    print(f"\tNon-Answerable questions = {noans}\n")
    print("Examples:")
    for i in [0, 1000, 1300]:
        if i >= len(train_data):
            # Smaller files simply show fewer examples instead of crashing.
            continue
        print(' | '.join(train_data[i][:2]), ' | ', train_data[i][2][:20] + '...', ' | ', ' | '.join(train_data[i][3:]))
    return train_data

In [None]:
def load_theme_wise_data(train_data):
    """Group the dataset rows by theme.

    Args:
        train_data: list of rows shaped like
            [index, Theme, Paragraph, Question, Answer_possible, Answer_text,
            Answer_start]. The first row is treated as a header and skipped.

    Returns:
        dict mapping each theme to a dict with three keys:
            'para': unique paragraphs of the theme, in first-seen order
            'ques': every question of the theme, in row order
            'ans':  one list per question,
                    [Para_Number, Answer_possible, Answer_text, Answer_start],
                    where Para_Number indexes into 'para'.
    """
    theme_wise_data = {}
    # theme -> {paragraph text: position in that theme's 'para' list}.
    # Kept outside the returned structure; it replaces the linear
    # `in` / `.index()` scans over the paragraph list with O(1) lookups.
    para_positions = {}
    for x in train_data[1:]:
        theme, para, question = x[1], x[2], x[3]
        if theme not in theme_wise_data:
            theme_wise_data[theme] = {
                'para': [],
                'ques': [],
                'ans': []
            }
            para_positions[theme] = {}
        entry = theme_wise_data[theme]
        positions = para_positions[theme]
        if para not in positions:
            positions[para] = len(entry['para'])
            entry['para'].append(para)
        entry['ques'].append(question)
        # ans contains a list -> [Para_Number, Answer_possible, Answer_text, Answer_start]
        entry['ans'].append([positions[para]] + x[4:])
    print(f'\nTotal {len(theme_wise_data)} themes present.')
    return theme_wise_data

In [None]:
def load_ques_by_theme(theme, theme_wise_data, answerable_only = False):
    """Collect the paragraphs, questions and answers of a single theme.

    Args:
        theme: key into `theme_wise_data`.
        theme_wise_data: mapping of theme -> {'para', 'ques', 'ans'} lists.
        answerable_only: when true, questions whose Answer_possible field is
            the string 'False' are left out.

    Returns:
        (paras, ques, gold_para, ans): the theme's paragraph list, the kept
        questions, the paragraph number of each kept question, and the
        remaining answer fields of each kept question.
    """
    entry = theme_wise_data[theme]
    paras = entry['para']
    ques, gold_para, ans = [], [], []
    for idx, question in enumerate(entry['ques']):
        record = entry['ans'][idx]
        # record = [Para_Number, Answer_possible, Answer_text, Answer_start]
        if not answerable_only or record[1] != 'False':
            ques.append(question)
            gold_para.append(record[0])
            ans.append(record[1:])

    print("Total Questions:", len(ques))
    print("Total Paragraphs:", len(paras))
    return paras, ques, gold_para, ans

### Load Test Data

#### Save SQuAD 2.0 examples missing from the training data (for testing)

In [None]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.9.0-py3-none-any.whl (462 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 

In [None]:
from datasets import load_dataset, load_metric

In [None]:
sqd = load_dataset("squad_v2")

Downloading builder script:   0%|          | 0.00/5.28k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad_v2/squad_v2 to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/801k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

Dataset squad_v2 downloaded and prepared to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
sqd

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [None]:
sqd['train'][0]

{'id': '56be85543aeaaa14008c9063',
 'title': 'Beyoncé',
 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'question': 'When did Beyonce start becoming popular?',
 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}}

In [None]:
data = load_data()

Number of examples = 75056
	Answerable questions = 50126
	Non-Answerable questions = 24930

Examples:
 | Theme  |  Paragraph...  |  Question | Answer_possible | Answer_text | Answer_start
1430 | Frédéric_Chopin  |  Some modern commenta...  |  Who said Chopin's works were modeled after Bach, Beethoven, Schubert and Field? | True | ['Richard Taruskin'] | [543]
2196 | The_Legend_of_Zelda:_Twilight_Princess  |  Twilight Princess ta...  |  Who releases Bulbins from the Realm of Twilight? | False | [] | []


In [None]:
print(len(data), len(sqd['train']), len(sqd['validation']))

75056 130319 11873


In [None]:
def _squad_rows_by_title(split):
    """Group one SQuAD split by title.

    Each example becomes a tuple
    (Theme, Paragraph, Question, Answer_possible, Answer_text, Answer_start)
    with the last three fields rendered as strings, so the tuples can be
    compared directly with the rows read from the training CSV.
    """
    grouped = {}
    for x in split:
        answers = x['answers']
        row = (
            x['title'],
            x['context'],
            x['question'],
            'True' if answers['text'] != [] else 'False',
            str(answers['text']),
            str(answers['answer_start']),
        )
        grouped.setdefault(x['title'], []).append(row)
    return grouped


# gd: rows of the provided training CSV grouped by theme. The header row and
# the leading index column are dropped so the tuples line up with the SQuAD ones.
gd = {}
for x in data[1:]:
    gd.setdefault(x[1], []).append(tuple(x[1:]))

# sdt / sdv: the SQuAD 2.0 train and validation splits in the same shape.
sdt = _squad_rows_by_title(sqd['train'])
sdv = _squad_rows_by_title(sqd['validation'])

In [None]:
print(len(gd.keys()), len(sdt.keys()), len(sdv.keys()))

361 442 35


In [None]:
r1_test, r2_test = [], []

# Round 1: every SQuAD row whose theme does not appear in the training CSV.
for split in (sdt, sdv):
    for theme, rows in split.items():
        if theme not in gd:
            r1_test += rows

# Round 2: for themes that do appear in the training CSV, the SQuAD rows that
# are not already part of it.
for theme, train_rows in gd.items():
    known = set(train_rows)
    for split in (sdt, sdv):
        if theme in split:
            # dict.fromkeys() removes duplicates like the former set difference
            # did, but keeps the dataset order. Iterating a set of string tuples
            # yields a different order on every run, which would make the ids
            # assigned by position afterwards non-reproducible.
            r2_test += [row for row in dict.fromkeys(split[theme]) if row not in known]

In [None]:
print(len(r1_test), len(r2_test))

34927 32402


In [None]:
r1_test[1000]

('Wayback_Machine',
 'In Europe the Wayback Machine could be interpreted as violating copyright laws. Only the content creator can decide where their content is published or duplicated, so the Archive would have to delete pages from its system upon request of the creator. The exclusion policies for the Wayback Machine may be found in the FAQ section of the site. The Wayback Machine also retroactively respects robots.txt files, i.e., pages that currently are blocked to robots on the live web temporarily will be made unavailable from the archives as well.',
 'What may be found in the robots.txt files section of the site?',
 'False',
 '[]',
 '[]')

In [None]:
# Prefix every row with its position so the saved CSVs carry an id column.
# Note: re-running this cell prepends another id column.
r1_test = [[idx, *row] for idx, row in enumerate(r1_test)]
r2_test = [[idx, *row] for idx, row in enumerate(r2_test)]

In [None]:
import csv

# Write both test splits without a header row. Files handed to csv.writer
# should be opened with newline='' (otherwise line endings can be doubled on
# some platforms), and the encoding is made explicit because the rows contain
# non-ASCII text.
for filename, rows in (('test_data_round_1.csv', r1_test), ('test_data_round_2.csv', r2_test)):
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        csvwriter = csv.writer(f)
        csvwriter.writerows(rows)

In [None]:
from google.colab import drive

drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [None]:
!cp test_data_round_1.csv "/content/gdrive/MyDrive/DevRev Test Data"
!cp test_data_round_2.csv "/content/gdrive/MyDrive/DevRev Test Data"

#### Load Test Data

In [None]:
import csv
import requests

# load test dataset
def _download_csv_rows(session, url):
    """Fetch `url` with `session` and return the parsed CSV rows as lists of strings."""
    import io  # local import: only needed to wrap the downloaded text

    response = session.get(url)
    # Fail loudly instead of parsing an HTTP error page as CSV.
    response.raise_for_status()
    text = response.content.decode('utf-8')
    # newline='' lets the csv module see quoted fields unmodified;
    # str.splitlines() would also split on characters such as U+2028.
    return list(csv.reader(io.StringIO(text, newline=''), delimiter=','))


def load_test_data():
    """Download the round-1 and round-2 test CSVs and print a summary of each.

    Both files are fetched from fixed Google Drive links. Each row is
    [id, Theme, Paragraph, Question, Answer_possible, Answer_text,
    Answer_start]. The files are written without a header row, so every row
    is counted as an example.

    Returns:
        (test_data_r1, test_data_r2): two lists of rows (lists of strings).

    Raises:
        requests.HTTPError: if either download returns an HTTP error status.
    """
    CSV_URL_R1 = 'https://drive.google.com/u/0/uc?id=1-56-cMKze05gTCtKjItBsxiXlPel4tpi&export=download'
    CSV_URL_R2 = 'https://drive.google.com/u/0/uc?id=1-8_iovhHzNEHjvnpzp-I9BfrozCHS0NQ&export=download'

    with requests.Session() as s:
        test_data_r1 = _download_csv_rows(s, CSV_URL_R1)
        test_data_r2 = _download_csv_rows(s, CSV_URL_R2)

    for r, d in zip(['Round 1', 'Round 2'], [test_data_r1, test_data_r2]):
        print(r)
        print(f"Number of examples = {len(d)}")
        noans = sum(1 for x in d if x[4] == 'False')
        ans = len(d) - noans
        print(f"\tAnswerable questions = {ans}")
        print(f"\tNon-Answerable questions = {noans}\n")
        print("Examples:")
        for i in [0, 1000]:
            if i >= len(d):
                # Smaller files simply show fewer examples instead of crashing.
                continue
            print(' | '.join(d[i][:2]), ' | ', d[i][2][:20] + '...', ' | ', ' | '.join(d[i][3:]))
        print()

    return test_data_r1, test_data_r2

#### Load Train and Test data v2

In [None]:
from google.colab import drive

drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [None]:
r1, r2 = load_test_data()

Round 1
Number of examples = 34927
	Answerable questions = 20858
	Non-Answerable questions = 14069

Examples:
0 | IPod  |  The iPod is a line o...  |  Which company produces the iPod? | True | ['Apple'] | [105]
1000 | Wayback_Machine  |  In Europe the Waybac...  |  What may be found in the robots.txt files section of the site? | False | [] | []

Round 2
Number of examples = 32402
	Answerable questions = 21919
	Non-Answerable questions = 10483

Examples:
0 | Beyoncé  |  On January 7, 2012, ...  |  Jay Z has a website called what? | True | ['Lifeandtimes.com'] | [216]
1000 | New_York_City  |  The Queensboro Bridg...  |  The Queensboro Bridge utilized what type of construction? | True | ['cantilever'] | [47]



In [None]:
# load_theme_wise_data() always discards the first row as a header, but the
# round-2 test CSV has none (its row 0 is a real example). Prepend a
# placeholder row so that the first question is not silently dropped.
tr = load_theme_wise_data([[]] + r2)


Total 361 themes present.


In [None]:
# Flatten the theme-wise data into four tables with global, 1-based ids:
#   paras  -> [paragraph id, paragraph text, theme]
#   ques   -> [question id, question text, theme]
#   themes -> [theme, first question id, last question id]
#   ans    -> [question id, [gold paragraph id] or [], answer text]
paras, ques, themes, ans = [], [], [], []
pid, qid = 1, 1
for theme in tr:
    pstart, qstart = pid, qid
    for p in tr[theme]['para']:
        paras.append([pid, p, theme])
        pid += 1
    for q, a in zip(tr[theme]['ques'], tr[theme]['ans']):
        ques.append([qid, q, theme])
        # a = [Para_Number, Answer_possible, Answer_text, Answer_start];
        # Para_Number is relative to the theme, so offset it by the id of the
        # theme's first paragraph. Unanswerable questions get no gold paragraph.
        gold_ids = [pstart + a[0]] if a[1] == 'True' else []
        ans.append([qid, gold_ids, a[2]])
        qid += 1
    themes.append([theme, qstart, qid-1])

outputs = [
    ('input_paragraph.csv', ['id', 'paragraph', 'theme'], paras),
    ('input_question.csv', ['id', 'question', 'theme'], ques),
    ('theme_interval.csv', ['theme', 'start', 'end'], themes),
    ('ground_truth.csv', ['question_id', 'paragraph_id', 'answers'], ans),
]
for filename, header, rows in outputs:
    # newline='' is what the csv module expects for files it writes to; the
    # encoding is explicit because the text contains non-ASCII characters.
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        csvwriter = csv.writer(f)
        csvwriter.writerow(header)
        csvwriter.writerows(rows)

In [None]:
# !cp input_paragraph.csv "/content/gdrive/MyDrive/DevRev Test Data/R2"
# !cp input_question.csv "/content/gdrive/MyDrive/DevRev Test Data/R2"
# !cp theme_interval.csv "/content/gdrive/MyDrive/DevRev Test Data/R2"
!cp ground_truth.csv "/content/gdrive/MyDrive/DevRev Test Data/R2"

cp: failed to access '/content/gdrive/MyDrive/DevRev Test Data/R2': Not a directory


# HayStack Paragraph Retrieval

As k increases, retrieval time increases substantially. For k = 3 on CPU, retrieval takes around 640 ms with 750 passages (drawn from various themes) in the document store.

When tested on a single theme (50 passages in document store), the results were as follows:

* k = 1, Accuracy = 34.541984732824424
* k = 2, Accuracy = 46.18320610687023
* k = 3, Accuracy = 53.05343511450382
* k = 4, Accuracy = 58.01526717557252
* k = 5, Accuracy = 61.25954198473282
* k = 6, Accuracy = 62.40458015267175
* k = 7, Accuracy = 63.74045801526718
* k = 8, Accuracy = 65.2671755725191
* k = 9, Accuracy = 65.83969465648855
* k = 10, Accuracy = 66.98473282442748

Here accuracy is the percentage of evaluated questions for which the gold paragraph was among the top_k results; unanswerable questions are counted as correct without running retrieval.

In [None]:
!pip install --upgrade pip
!pip install farm-haystack[colab]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-22.3.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m74.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.0.4
    Uninstalling pip-22.0.4:
      Successfully uninstalled pip-22.0.4
Successfully installed pip-22.3.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting farm-haystack[colab]
  Downloading farm_haystack-1.12.2-py3-none-any.whl (598 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m598.7/598.7 kB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m48.1 MB/s[0m eta [36m

In [None]:
import logging

# Root logger at WARNING to keep the output quiet, while the "haystack"
# logger is lowered to INFO so its progress messages are still shown.
logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

In [None]:
%%bash

# Download and unpack Elasticsearch 7.9.2, then give the unpacked tree to the
# `daemon` user, which is the account the server is started under.
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
chown -R daemon:daemon elasticsearch-7.9.2

In [None]:
%%bash --bg

# Run the Elasticsearch server as the `daemon` user; --bg keeps the cell from blocking.
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch

In [None]:
import time
# Fixed wait to give the background Elasticsearch process time to come up
# before anything tries to connect to it.
time.sleep(30)

In [None]:
from haystack.utils import launch_es
# NOTE(review): Elasticsearch was already started manually in the cells above,
# so this call looks redundant — confirm whether it is still needed.
launch_es()



In [None]:
import os
from haystack.document_stores import ElasticsearchDocumentStore

# Get the host where Elasticsearch is running, default to localhost
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")

# Connect to the local Elasticsearch instance (empty credentials) and create
# the "document" index if it does not exist yet.
# NOTE(review): similarity="dot_product" is presumably chosen to match the DPR
# retriever configured later in the notebook — confirm.
document_store = ElasticsearchDocumentStore(
    host=host,
    username="",
    password="",
    index="document",
    create_index=True,
    similarity="dot_product"
)

In [None]:
train_data = load_data()
theme_wise_data = load_theme_wise_data(train_data)

ERROR:posthog:error uploading: [PostHog] <html>
<head><title>503 Service Temporarily Unavailable</title></head>
<body>
<center><h1>503 Service Temporarily Unavailable</h1></center>
</body>
</html>
 (503)


Number of examples = 75056
	Answerable questions = 50126
	Non-Answerable questions = 24930

Examples:
 | Theme  |  Paragraph...  |  Question | Answer_possible | Answer_text | Answer_start
1430 | Frédéric_Chopin  |  Some modern commenta...  |  Who said Chopin's works were modeled after Bach, Beethoven, Schubert and Field? | True | ['Richard Taruskin'] | [543]
2196 | The_Legend_of_Zelda:_Twilight_Princess  |  Twilight Princess ta...  |  Who releases Bulbins from the Realm of Twilight? | False | [] | []

Total 361 themes present.


In [None]:
themes = [
    'Beyoncé', 'Spectre_(2015_film)', 'New_York_City', 'To_Kill_a_Mockingbird', 'Solar_energy', 'Buddhism', 'American_Idol', 'Dog',
    '2008_Summer_Olympics_torch_relay', 'Genome', 'Comprehensive_school', 'Prime_minister', 'Institute_of_technology', 'Hydrogen',
    'Separation_of_powers_under_the_United_States_Constitution', 'Architecture', 'Alexander_Graham_Bell', 'Matter'
]
# Only the first theme is indexed, and only its first 50 paragraphs.
theme = themes[0]
paras = theme_wise_data[theme]['para'][:50]
theme_documents = [
    {
        'content': para,
        'meta': {'theme': theme},
    }
    for para in paras
]
document_store.write_documents(theme_documents)

In [None]:
from haystack.nodes import DensePassageRetriever

# Dense passage retriever with separate question and passage encoder models.
# NOTE(review): use_gpu=True is hard-coded — behaviour on a CPU-only runtime
# is not visible here; confirm before running without a GPU.
retriever = DensePassageRetriever(
    document_store=document_store,
    use_gpu=True,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base"
)

# Compute and store embeddings for the documents written to the store above.
document_store.update_embeddings(retriever)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
ERROR:posthog:error uploading: [PostHog] <html>
<head><title>503 Service Temporarily Unavailable</title></head>
<body>
<center><h1>503 Service Temporarily Unavailable</h1></center>
</body>
</html>
 (503)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

ERROR:posthog:error uploading: [PostHog] <html>
<head><title>503 Service Temporarily Unavailable</title></head>
<body>
<center><h1>503 Service Temporarily Unavailable</h1></center>
</body>
</html>
 (503)


Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/438M [00:00<?, ?B/s]

INFO:haystack.modeling.model.language_model:Auto-detected model language: english


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.


Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/438M [00:00<?, ?B/s]

INFO:haystack.modeling.model.language_model:Auto-detected model language: english
INFO:haystack.document_stores.search_engine:Updating embeddings for all 50 docs ...


Updating embeddings:   0%|          | 0/50 [00:00<?, ? Docs/s]

Create embeddings:   0%|          | 0/64 [00:00<?, ? Docs/s]

In [None]:
from haystack.pipelines import DocumentSearchPipeline
# print_documents is only referenced by a commented-out debugging line in the
# evaluation cell.
from haystack.utils import print_documents

# Build and execute the query pipeline.
pipeline = DocumentSearchPipeline(retriever)

In [None]:
# Questions of the indexed theme only. train_data[0] is the CSV header row,
# so it is skipped; previously it was evaluated like a question and counted
# as a retrieval miss.
eval_rows = [row for row in train_data[1:] if row[1] == themes[0]]
tot = len(eval_rows)

for k in range(1, 11):
    score = 0
    for row in eval_rows:
        if row[5] == '[]':
            # If unanswerable, positive score.
            score += 1
            continue
        gold_para, que = row[2], row[3]
        result = pipeline.run(que, params={"Retriever": {"top_k": k}})
        contents = [doc.content for doc in result["documents"]]
        if gold_para in contents:
            # Found gold para in the top_k results.
            score += 1
        # print_documents(result, max_text_len=100, print_name=True, print_meta=True)
    # Guard against a theme with no rows instead of dividing by zero.
    accuracy = score / tot * 100 if tot else 0.0
    print(f"k = {k}, Accuracy = {accuracy}")

k = 1, Accuracy = 34.541984732824424
k = 2, Accuracy = 46.18320610687023
k = 3, Accuracy = 53.05343511450382
k = 4, Accuracy = 58.01526717557252
k = 5, Accuracy = 61.25954198473282
k = 6, Accuracy = 62.40458015267175
k = 7, Accuracy = 63.74045801526718
k = 8, Accuracy = 65.2671755725191
k = 9, Accuracy = 65.83969465648855
k = 10, Accuracy = 66.98473282442748
