In [None]:
!pip install -q datasets==2.5.1

!pip install -q apache_beam==2.42.0
#mwparserfromhell

!pip install -q farm-haystack -f https://download.pytorch.org/whl/torch_stable.html

!pip install -q wikipedia==1.4.0

### Imports

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from pprint import pprint
import wikipedia as wiki
import re

In [None]:
import json
from datasets import load_dataset 

In [None]:
# Mount Google Drive at /content/drive; the input files and result CSVs used by the
# cells below all live under /content/drive/MyDrive/DeepLearning.
from google.colab import drive
drive.mount('/content/drive')

### Evaluation dataset

In [None]:
# Evaluation data set 1: covid_qa_deepset, scientific annotated data used to train
# deepset/roberta-base-squad2-covid.
d_covid = load_dataset('covid_qa_deepset')
QA_covid = d_covid.data['train'].to_pandas()

# Evaluate on the first 100 question/answer pairs (fewer if the split is smaller).
# The size is named once so the three columns below can never disagree in length.
n_eval = min(100, len(QA_covid))
qa_subset = QA_covid.iloc[:n_eval]
dataQA = {
    'question': qa_subset['question'],
    # keep the first annotated answer text of every question
    'answer': [d.get('text')[0] for d in qa_subset.answers],
    # left empty here; the column exists so both evaluation sets share one schema
    'wrong_answer': [''] * n_eval,
}
qa_dataset = pd.DataFrame(dataQA)

# Evaluation data set 2: covid qa dataset created using news platforms around the world
#qa_dataset = pd.read_csv('/content/drive/MyDrive/DeepLearning/news.csv')
#qa_dataset = qa_dataset[['question', 'answer', 'wrong_answer']]
#qa_dataset['question'] = 'Covid 19, '+ qa_dataset['question'].astype(str)
#qa_dataset.head(3)

In [None]:
qa_dataset

### Elasticsearch Server

```bash
docker pull docker.elastic.co/elasticsearch/elasticsearch:7.9.2

docker run -d -p 9200:9200 -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:7.9.2

```

In [None]:
%%bash

# Download the Elasticsearch 7.9.2 Linux tarball, unpack it into the working directory
# and hand the extracted tree to the `daemon` user, the account the server is started as.
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
chown -R daemon:daemon elasticsearch-7.9.2

In [None]:
%%bash --bg

# Start the Elasticsearch server in the background as the unprivileged `daemon` user.
# NOTE(review): presumably not run as root because Elasticsearch refuses to start as root — confirm.
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch

In [None]:
import time
import urllib.request

# Wait until Elasticsearch answers on its HTTP port instead of sleeping a fixed 30 s:
# a blind sleep is too short on a slow machine and wasted time on a fast one.
es_url = "http://localhost:9200"
es_deadline = time.monotonic() + 120  # give up after two minutes
while True:
    try:
        with urllib.request.urlopen(es_url, timeout=2):
            break  # got an HTTP response: the node is accepting requests
    except OSError:
        # connection refused / HTTP error while the server is still starting
        if time.monotonic() >= es_deadline:
            raise TimeoutError(f"Elasticsearch did not come up at {es_url} within 120 s")
        time.sleep(1)

### Reader imports

In [None]:
# just to display the different retrievers available
from haystack.nodes import (
    BM25Retriever,
    TfidfRetriever,
    DensePassageRetriever,
    FARMReader,
    RAGenerator,
    BaseComponent,
    JoinDocuments,
)

from haystack.pipelines import (
    ExtractiveQAPipeline, 
    DocumentSearchPipeline, 
    GenerativeQAPipeline
)

from haystack.nodes import FARMReader

from haystack.utils import print_answers

# Definition: MODEL, GPU, TOP_K, READER

In [None]:
# --- Experiment configuration ---------------------------------------------
# Change `model` to compare readers:
#   "deepset/roberta-base-squad2"        SQuAD 2 model (active)
#   "deepset/roberta-base-squad2-covid"  SQuAD 2 model trained on COVID-19 data
model = "deepset/roberta-base-squad2"

# run the reader on the GPU
use_gpu = True

# top_k passed to the "Retriever" and "Reader" pipeline nodes respectively
top_k_retriever, top_k_reader = 10, 5

# extractive reader built from the selected model
reader = FARMReader(model_name_or_path=model, use_gpu=use_gpu)

# Covid 

### Context: COVID-19 Dataset [Link here](https://github.com/deepset-ai/COVID-QA)

In [None]:
# Load the COVID-QA JSON and flatten it into one row per question.
with open('/content/drive/MyDrive/DeepLearning/COVID-QA.json', encoding='utf-8') as jsonfile:
    data = json.load(jsonfile)

# Build one frame per article and concatenate once at the end: calling pd.concat inside
# the loop re-copies every previously collected row on each iteration.
covid_frames = []
for article in data['data']:
    # only the first paragraph of each article is read
    paragraph = article['paragraphs'][0]
    q = pd.json_normalize(paragraph['qas'])
    # every question row carries the context and document id of its paragraph
    q['context'] = paragraph['context']
    q['document_id'] = paragraph['document_id']
    covid_frames.append(q)

# pd.concat raises on an empty list, so fall back to an empty frame
df_covid = pd.concat(covid_frames, ignore_index=True) if covid_frames else pd.DataFrame()
df_covid.head(3)

In [None]:
# Replace URLs, newlines and every character that is not an ASCII letter, digit or dot
# with a space. The class must read `A-Z`: the range `A-z` also spans the punctuation
# between 'Z' and 'a' ([ \ ] ^ _ `), which would then survive the cleaning.
# NOTE(review): `context_cleaned` is not read by the document-store cell below, which
# indexes the raw `context` column — confirm which of the two is intended.
df_covid['context_cleaned'] = df_covid.context.apply(
    lambda x: re.sub(r'https?:\/\/.*?[\s+]|\n|[^a-zA-Z0-9.]', ' ', x)
)

### Document Store

In [None]:
%%bash

# Same download / extract / chown steps as the first Elasticsearch setup cell of this notebook.
# NOTE(review): if that cell already ran, this only repeats the download — confirm it is needed.
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
chown -R daemon:daemon elasticsearch-7.9.2

In [None]:
%%bash --bg

# Launch Elasticsearch in the background as the `daemon` user.
# NOTE(review): nothing in this notebook stops the instance started earlier, so this may
# attempt to start a second node — confirm whether a restart is intended.
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch

In [None]:
# Fixed 30-second pause to let the Elasticsearch process launched above come up
# (there is no readiness check).
import time
time.sleep(30)

In [None]:
# One dict per row of df_covid (column name -> value); iterated by the next cell
documents_lst_covid = df_covid.to_dict(orient='records')

In [None]:
# the way document_store wants data formatted
dicts_covid = [
    {
        'content' : str(elm['context']),
        'meta' : {
            'name' : str(elm['question'])
        }
    } 
    for elm in tqdm(documents_lst_covid)
]

Write to document store

In [None]:
%%time
from haystack.document_stores import ElasticsearchDocumentStore
document_store_covid = ElasticsearchDocumentStore(
    port=9200
)
document_store_covid.delete_documents()
document_store_covid.write_documents(
    documents=dicts_covid
)

### Retrieving answers

#### BM25

In [None]:
from IPython.display import clear_output
clear_output()

In [None]:
# BM25 retrieval over the COVID-QA document store, followed by the extractive reader.
retriever = BM25Retriever(document_store=document_store_covid)

reader = FARMReader(model_name_or_path=model, use_gpu=use_gpu)

pipe_covid = ExtractiveQAPipeline(reader, retriever)

df_res_covid_bm25 = qa_dataset.copy()

# One fresh list per row. Each row later receives a single entry (the list of answer
# strings), so a cell ends up as [[answer, ...]]; that layout is kept for the saved CSV.
df_res_covid_bm25['predictions_covid_context_bm25'] = [[] for _ in range(len(df_res_covid_bm25))]

for q_i in tqdm(range(len(qa_dataset))):

    question = qa_dataset.question[q_i]
    print('question : ', question)

    try:
        # covid dataset context prediction
        prediction_covid = pipe_covid.run(
            query=question,
            params={
                "Retriever": {"top_k": top_k_retriever},
                "Reader": {"top_k": top_k_reader}
            }
        )
        answers = [ans.answer for ans in prediction_covid['answers']]
    except Exception as err:
        # Best effort: a failing question gets an empty prediction, but the error is
        # reported, and KeyboardInterrupt is no longer swallowed as it was by a bare except.
        print('prediction failed : ', repr(err))
        answers = []

    df_res_covid_bm25.loc[q_i, 'predictions_covid_context_bm25'].append(answers)


df_res_covid_bm25.to_csv('/content/drive/MyDrive/DeepLearning/df_res_covid_bm25_org_data.csv', index=False)

#### RE-INITIALISE ELASTICSEARCH

In [None]:
%%bash

# Same download / extract / chown steps as the first Elasticsearch setup cell of this notebook.
# NOTE(review): if that cell already ran, this only repeats the download — confirm it is needed.
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
chown -R daemon:daemon elasticsearch-7.9.2

In [None]:
%%bash --bg

# Launch Elasticsearch in the background as the `daemon` user.
# NOTE(review): nothing in this notebook stops the instance started earlier, so this may
# attempt to start a second node — confirm whether a restart is intended.
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch

In [None]:
# Fixed 30-second pause to let the Elasticsearch process launched above come up
# (there is no readiness check).
import time
time.sleep(30)

Write to document store

In [None]:
%%time
from haystack.document_stores import ElasticsearchDocumentStore
document_store_covid = ElasticsearchDocumentStore(
    port=9200
)
document_store_covid.delete_documents()
document_store_covid.write_documents(
    documents=dicts_covid
)

#### TF IDF

In [None]:
# TF-IDF retrieval over the COVID-QA document store, followed by the extractive reader.
retriever_tfidf = TfidfRetriever(document_store=document_store_covid)

reader = FARMReader(model_name_or_path=model, use_gpu=use_gpu)

pipe_covid_tfidf = ExtractiveQAPipeline(reader, retriever_tfidf)

df_res_covid_tfidf = qa_dataset.copy()

# One fresh list per row. Each row later receives a single entry (the list of answer
# strings), so a cell ends up as [[answer, ...]]; that layout is kept for the saved CSV.
df_res_covid_tfidf['predictions_covid_context_tfidf'] = [[] for _ in range(len(df_res_covid_tfidf))]

for q_i in tqdm(range(len(qa_dataset))):

    question = qa_dataset.question[q_i]
    print('question : ', question)

    try:
        # covid dataset context prediction
        prediction_covid = pipe_covid_tfidf.run(
            query=question,
            params={
                "Retriever": {"top_k": top_k_retriever},
                "Reader": {"top_k": top_k_reader}
            }
        )
        answers = [ans.answer for ans in prediction_covid['answers']]
    except Exception as err:
        # Best effort: a failing question gets an empty prediction, but the error is
        # reported, and KeyboardInterrupt is no longer swallowed as it was by a bare except.
        print('prediction failed : ', repr(err))
        answers = []

    df_res_covid_tfidf.loc[q_i, 'predictions_covid_context_tfidf'].append(answers)


df_res_covid_tfidf.to_csv('/content/drive/MyDrive/DeepLearning/df_res_covid_tfidf_org_data.csv', index=False)

# Wikipedia

In [None]:
## if not already downloaded
# ds = load_dataset('wikipedia', "20220301.simple")
## We take the training data and convert it to a Pandas DataFrame
# df = ds.data['train'].to_pandas()

In [None]:
## if not already downloaded
ds = load_dataset('wikipedia', "20220301.simple")
## We take the training data and convert it to a Pandas DataFrame
df_wiki = ds.data['train'].to_pandas()
df_wiki.head(3)

In [None]:
import re
# Replace URLs, newlines and every character that is not an ASCII letter, digit or dot
# with a space. The class must read `A-Z`: the range `A-z` also spans the punctuation
# between 'Z' and 'a' ([ \ ] ^ _ `), which would then survive the cleaning.
# NOTE(review): `context_cleaned` is not read by the document-store cells below, which
# index the raw `text` column — confirm which of the two is intended.
df_wiki['context_cleaned'] = df_wiki.text.apply(
    lambda x: re.sub(r'https?:\/\/.*?[\s+]|\n|[^a-zA-Z0-9.]', ' ', x)
)

### Document Store

In [None]:
%%bash

# Same download / extract / chown steps as the first Elasticsearch setup cell of this notebook.
# NOTE(review): if that cell already ran, this only repeats the download — confirm it is needed.
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
chown -R daemon:daemon elasticsearch-7.9.2

In [None]:
%%bash --bg

# Launch Elasticsearch in the background as the `daemon` user.
# NOTE(review): nothing in this notebook stops the instance started earlier, so this may
# attempt to start a second node — confirm whether a restart is intended.
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch

In [None]:
# Fixed 30-second pause to let the Elasticsearch process launched above come up
# (there is no readiness check).
import time
time.sleep(30)

In [None]:
# One dict per row of df_wiki (column name -> value); iterated by the next cell
documents_lst_wiki = df_wiki.to_dict(orient='records')

In [None]:
# the way document_store wants data formatted
dicts_wiki = [
    {
        'content' : elm['text'],
        'meta' : {
            'name' : elm['title']
        }
    } 
    for elm in tqdm(documents_lst_wiki)
]

In [None]:
%%time
document_store_wiki = ElasticsearchDocumentStore(
    port=9200
)
document_store_wiki.delete_documents()
document_store_wiki.write_documents(
    documents=dicts_wiki
)

### Retrieving answers

#### BM25

In [None]:
# BM25 retrieval over the Wikipedia document store, followed by the extractive reader.
retriever = BM25Retriever(document_store=document_store_wiki)

reader = FARMReader(model_name_or_path=model, use_gpu=use_gpu)

pipe_wiki = ExtractiveQAPipeline(reader, retriever)

df_res_wiki_bm25 = qa_dataset.copy()

# One fresh list per row. Each row later receives a single entry (the list of answer
# strings), so a cell ends up as [[answer, ...]]; that layout is kept for the saved CSV.
df_res_wiki_bm25['predictions_wiki_context_bm25'] = [[] for _ in range(len(df_res_wiki_bm25))]

for q_i in tqdm(range(len(qa_dataset))):

    question = qa_dataset.question[q_i]
    print('question : ', question)

    try:
        # wikipedia dataset context prediction
        prediction_wiki = pipe_wiki.run(
            query=question,
            params={
                "Retriever": {"top_k": top_k_retriever},
                "Reader": {"top_k": top_k_reader}
            }
        )
        answers = [ans.answer for ans in prediction_wiki['answers']]
    except Exception as err:
        # Best effort: a failing question gets an empty prediction, but the error is
        # reported, and KeyboardInterrupt is no longer swallowed as it was by a bare except.
        print('prediction failed : ', repr(err))
        answers = []

    df_res_wiki_bm25.loc[q_i, 'predictions_wiki_context_bm25'].append(answers)

df_res_wiki_bm25.to_csv('/content/drive/MyDrive/DeepLearning/df_res_wiki_bm25_org_data.csv', index=False)

#### RE-INITIALISE ELASTICSEARCH

In [None]:
%%bash

# Same download / extract / chown steps as the first Elasticsearch setup cell of this notebook.
# NOTE(review): if that cell already ran, this only repeats the download — confirm it is needed.
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
chown -R daemon:daemon elasticsearch-7.9.2

In [None]:
%%bash --bg

# Launch Elasticsearch in the background as the `daemon` user.
# NOTE(review): nothing in this notebook stops the instance started earlier, so this may
# attempt to start a second node — confirm whether a restart is intended.
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch

In [None]:
# Fixed 30-second pause to let the Elasticsearch process launched above come up
# (there is no readiness check).
import time
time.sleep(30)

In [None]:
# Rebuilds the record list from df_wiki, exactly as the earlier Wikipedia section does
# (df_wiki is not modified in between).
documents_lst_wiki = df_wiki.to_dict(orient='records')

In [None]:
# the way document_store wants data formatted
dicts_wiki = [
    {
        'content' : elm['text'],
        'meta' : {
            'name' : elm['title']
        }
    } 
    for elm in tqdm(documents_lst_wiki)
]

In [None]:
%%time
document_store_wiki = ElasticsearchDocumentStore(
    port=9200
)
document_store_wiki.delete_documents()
document_store_wiki.write_documents(
    documents=dicts_wiki
)

#### TF IDF

In [None]:
# TF-IDF retrieval over the Wikipedia document store, followed by the extractive reader.
retriever = TfidfRetriever(document_store=document_store_wiki)

reader = FARMReader(model_name_or_path=model, use_gpu=use_gpu)

pipe_wiki = ExtractiveQAPipeline(reader, retriever)

df_res_wiki_tfidf = qa_dataset.copy()

# One fresh list per row. Each row later receives a single entry (the list of answer
# strings), so a cell ends up as [[answer, ...]]; that layout is kept for the saved CSV.
df_res_wiki_tfidf['predictions_wiki_context_tfidf'] = [[] for _ in range(len(df_res_wiki_tfidf))]

for q_i in tqdm(range(len(qa_dataset))):

    question = qa_dataset.question[q_i]
    print('question : ', question)

    try:
        # wikipedia dataset context prediction
        prediction_wiki = pipe_wiki.run(
            query=question,
            params={
                "Retriever": {"top_k": top_k_retriever},
                "Reader": {"top_k": top_k_reader}
            }
        )
        answers = [ans.answer for ans in prediction_wiki['answers']]
    except Exception as err:
        # Best effort: a failing question gets an empty prediction, but the error is
        # reported, and KeyboardInterrupt is no longer swallowed as it was by a bare except.
        print('prediction failed : ', repr(err))
        answers = []

    df_res_wiki_tfidf.loc[q_i, 'predictions_wiki_context_tfidf'].append(answers)

df_res_wiki_tfidf.to_csv('/content/drive/MyDrive/DeepLearning/df_res_wiki_tfidf_org_data.csv', index=False)

# Wikipedia API

### Document store

In [None]:
%%bash

# Same download / extract / chown steps as the first Elasticsearch setup cell of this notebook.
# NOTE(review): if that cell already ran, this only repeats the download — confirm it is needed.
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
chown -R daemon:daemon elasticsearch-7.9.2

In [None]:
%%bash --bg

# Launch Elasticsearch in the background as the `daemon` user.
# NOTE(review): nothing in this notebook stops the instance started earlier, so this may
# attempt to start a second node — confirm whether a restart is intended.
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch

In [None]:
# Fixed 30-second pause to let the Elasticsearch process launched above come up
# (there is no readiness check).
import time
time.sleep(30)

In [None]:
%%time
from haystack.document_stores import ElasticsearchDocumentStore

document_store_wikiAPI = ElasticsearchDocumentStore(
    port=9200
)
document_store_wikiAPI.delete_documents()

### Retrieving answers

#### BM25

In [None]:
# Per question: search Wikipedia live, index the returned pages, then answer with
# BM25 retrieval + the extractive reader.
reader = FARMReader(model_name_or_path=model, use_gpu=use_gpu)

df_res_wikiAPI_bm25 = qa_dataset.copy()

# One fresh list per row. Each row later receives a single entry (the list of answer
# strings), so a cell ends up as [[answer, ...]]; that layout is kept for the saved CSV.
df_res_wikiAPI_bm25['predictions_wikiAPI_context_bm25'] = [[] for _ in range(len(df_res_wikiAPI_bm25))]


for q_i in tqdm(range(len(qa_dataset))):

    question = qa_dataset.question[q_i]
    print('question : ', question)

    # wikipedia API context prediction
    try:
        search_results_wikiAPI = wiki.search(question)

        # NOTE(review): the number of pages fetched is capped by top_k_reader, not
        # top_k_retriever — confirm this is intended.
        wiki_pages = [
            wiki.page(res)
            for res in search_results_wikiAPI[:top_k_reader]
        ]

        dicts_wikiAPI = [
            {
                'content': wiki_page.content,
                'meta': {
                    'name': wiki_page.title
                }
            }
            for wiki_page in tqdm(wiki_pages)
        ]

        # the store only ever holds the pages of the current question
        document_store_wikiAPI.delete_documents()
        document_store_wikiAPI.write_documents(dicts_wikiAPI)

        retriever_wikiAPI = BM25Retriever(document_store=document_store_wikiAPI)

        pipe_wikiAPI = ExtractiveQAPipeline(reader, retriever_wikiAPI)

        prediction_wikiAPI = pipe_wikiAPI.run(
            query=question,
            params={
                "Retriever": {"top_k": top_k_retriever},
                "Reader": {"top_k": top_k_reader}
            }
        )
        answers = [ans.answer for ans in prediction_wikiAPI['answers']]
    except Exception as err:
        # Best effort: a failing question (search, page fetch, indexing or prediction)
        # gets an empty prediction, but the error is reported, and KeyboardInterrupt is
        # no longer swallowed as it was by a bare except.
        print('prediction failed : ', repr(err))
        answers = []

    df_res_wikiAPI_bm25.loc[q_i, 'predictions_wikiAPI_context_bm25'].append(answers)

df_res_wikiAPI_bm25.to_csv('/content/drive/MyDrive/DeepLearning/df_res_wikiAPI_bm25_org_data.csv', index=False)

#### RE-INITIALISE ELASTICSEARCH

In [None]:
%%bash

# Same download / extract / chown steps as the first Elasticsearch setup cell of this notebook.
# NOTE(review): if that cell already ran, this only repeats the download — confirm it is needed.
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
chown -R daemon:daemon elasticsearch-7.9.2

In [None]:
%%bash --bg

# Launch Elasticsearch in the background as the `daemon` user.
# NOTE(review): nothing in this notebook stops the instance started earlier, so this may
# attempt to start a second node — confirm whether a restart is intended.
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch

In [None]:
# Fixed 30-second pause to let the Elasticsearch process launched above come up
# (there is no readiness check).
import time
time.sleep(30)

In [None]:
# Reconnect to Elasticsearch on port 9200 and start the TF-IDF run from an empty store
document_store_wikiAPI = ElasticsearchDocumentStore(port=9200)
document_store_wikiAPI.delete_documents()

#### TF IDF

In [None]:
# Per question: search Wikipedia live, index the returned pages, then answer with
# TF-IDF retrieval + the extractive reader.
reader = FARMReader(model_name_or_path=model, use_gpu=use_gpu)

df_res_wikiAPI_tfidf = qa_dataset.copy()

# One fresh list per row. Each row later receives a single entry (the list of answer
# strings), so a cell ends up as [[answer, ...]]; that layout is kept for the saved CSV.
df_res_wikiAPI_tfidf['predictions_wikiAPI_context_tfidf'] = [[] for _ in range(len(df_res_wikiAPI_tfidf))]


for q_i in tqdm(range(len(qa_dataset))):

    question = qa_dataset.question[q_i]
    print('question : ', question)

    try:
        # wikipedia API context prediction
        search_results_wikiAPI = wiki.search(question)

        # NOTE(review): the number of pages fetched is capped by top_k_reader, not
        # top_k_retriever — confirm this is intended.
        wiki_pages = [
            wiki.page(res)
            for res in search_results_wikiAPI[:top_k_reader]
        ]

        dicts_wikiAPI = [
            {
                'content': wiki_page.content,
                'meta': {
                    'name': wiki_page.title
                }
            }
            for wiki_page in tqdm(wiki_pages)
        ]

        # the store only ever holds the pages of the current question
        document_store_wikiAPI.delete_documents()
        document_store_wikiAPI.write_documents(dicts_wikiAPI)

        # built after the write so the retriever is created against the current pages
        retriever_wikiAPI = TfidfRetriever(document_store=document_store_wikiAPI)

        pipe_wikiAPI = ExtractiveQAPipeline(reader, retriever_wikiAPI)

        prediction_wikiAPI = pipe_wikiAPI.run(
            query=question,
            params={
                "Retriever": {"top_k": top_k_retriever},
                "Reader": {"top_k": top_k_reader}
            }
        )
        answers = [ans.answer for ans in prediction_wikiAPI['answers']]
    except Exception as err:
        # Best effort: a failing question (search, page fetch, indexing or prediction)
        # gets an empty prediction, but the error is reported, and KeyboardInterrupt is
        # no longer swallowed as it was by a bare except.
        print('prediction failed : ', repr(err))
        answers = []

    df_res_wikiAPI_tfidf.loc[q_i, 'predictions_wikiAPI_context_tfidf'].append(answers)

df_res_wikiAPI_tfidf.to_csv('/content/drive/MyDrive/DeepLearning/df_res_wikiAPI_tfidf_org_data.csv', index=False)

# Combining dataframes 

In [None]:
# Reload the six per-experiment result files written by the sections above.
# NOTE(review): the prediction columns were filled with Python lists before being saved;
# after a CSV round trip they presumably come back as strings — confirm downstream code parses them.
df_res_covid_tfidf = pd.read_csv('/content/drive/MyDrive/DeepLearning/df_res_covid_tfidf_org_data.csv')
df_res_covid_bm25 = pd.read_csv('/content/drive/MyDrive/DeepLearning/df_res_covid_bm25_org_data.csv')
df_res_wiki_bm25 = pd.read_csv('/content/drive/MyDrive/DeepLearning/df_res_wiki_bm25_org_data.csv')
df_res_wiki_tfidf = pd.read_csv('/content/drive/MyDrive/DeepLearning/df_res_wiki_tfidf_org_data.csv')
df_res_wikiAPI_bm25 = pd.read_csv('/content/drive/MyDrive/DeepLearning/df_res_wikiAPI_bm25_org_data.csv')
df_res_wikiAPI_tfidf = pd.read_csv('/content/drive/MyDrive/DeepLearning/df_res_wikiAPI_tfidf_org_data.csv')

In [None]:
# Start the combined results frame from a copy of the evaluation set
df_res = qa_dataset.copy()

In [None]:
# Copy each experiment's prediction column onto the combined frame; the column keeps
# the same name it has in its source frame.
for source_frame, column in (
    (df_res_covid_tfidf, 'predictions_covid_context_tfidf'),
    (df_res_covid_bm25, 'predictions_covid_context_bm25'),
    (df_res_wiki_bm25, 'predictions_wiki_context_bm25'),
    (df_res_wiki_tfidf, 'predictions_wiki_context_tfidf'),
    (df_res_wikiAPI_bm25, 'predictions_wikiAPI_context_bm25'),
    (df_res_wikiAPI_tfidf, 'predictions_wikiAPI_context_tfidf'),
):
    df_res[column] = source_frame[column].copy()

### Save dataframe as csv to drive


In [None]:
# Write the combined results to Drive without the index column
df_res.to_csv('/content/drive/MyDrive/DeepLearning/df_res_squad_coviddata_top5.csv', index=False)