In [2]:
# Install cell
# Installs the notebook's third-party dependencies (Colab-style `!pip` shell calls).
# `datasets` is pinned; it provides `load_dataset`, used to fetch the Simple English Wikipedia dump.
!pip install -q datasets==2.5.1

# presumably required by the `wikipedia` dataset builder -- TODO confirm
!pip install -q apache_beam==2.42.0
#mwparserfromhell

# `-f` adds the PyTorch wheel index as an extra place for pip to look for packages.
# NOTE(review): farm-haystack is not version-pinned, so a fresh run may pull a different release.
!pip install -q farm-haystack -f https://download.pytorch.org/whl/torch_stable.html

# Client for the Wikipedia API (imported as `wiki`).
!pip install -q wikipedia==1.4.0

#!pip install elasticsearch==7.9.2

# Provides `anvil.server`, used to expose `predict` to the front end.
# NOTE(review): anvil-uplink is not version-pinned and is installed without `-q`.
!pip install anvil-uplink

[K     |████████████████████████████████| 431 kB 4.7 MB/s 
[K     |████████████████████████████████| 132 kB 85.8 MB/s 
[K     |████████████████████████████████| 95 kB 5.5 MB/s 
[K     |████████████████████████████████| 182 kB 96.2 MB/s 
[K     |████████████████████████████████| 212 kB 83.9 MB/s 
[K     |████████████████████████████████| 127 kB 90.3 MB/s 
[K     |████████████████████████████████| 131 kB 89.1 MB/s 
[K     |████████████████████████████████| 13.1 MB 4.4 MB/s 
[K     |████████████████████████████████| 278 kB 92.9 MB/s 
[K     |████████████████████████████████| 2.5 MB 80.8 MB/s 
[K     |████████████████████████████████| 26.7 MB 1.1 MB/s 
[K     |████████████████████████████████| 2.7 MB 72.6 MB/s 
[K     |████████████████████████████████| 151 kB 97.1 MB/s 
[K     |████████████████████████████████| 62 kB 1.6 MB/s 
[K     |████████████████████████████████| 526 kB 89.1 MB/s 
[?25h  Building wheel for dill (setup.py) ... [?25l[?25hdone
  Building wheel for docop

In [1]:
# Import cell
import getpass
import json
import os
import re
import time
from pprint import pprint

import numpy as np
import pandas as pd
import wikipedia as wiki
from datasets import load_dataset
from tqdm import tqdm


# retrievers used for prediction comparison
from haystack.nodes import (
    BM25Retriever,
    TfidfRetriever
)

from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import FARMReader
from haystack.utils import print_answers
from haystack.document_stores import ElasticsearchDocumentStore

from google.colab import drive
drive.mount('/content/drive')

import anvil.server

Mounted at /content/drive


In [2]:

%%bash
# Fetch the Elasticsearch 7.9.2 Linux tarball (-q: no progress output) into the working directory.
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
# Unpack it next to the notebook as ./elasticsearch-7.9.2
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
# Give the tree to the `daemon` user, which is the account the server is later started under.
chown -R daemon:daemon elasticsearch-7.9.2

#print('Initializing Search Engine')

In [3]:
%%bash --bg 
# Launch the unpacked Elasticsearch server as the `daemon` user; `--bg` runs the cell in the
# background so the notebook can continue while the server stays up.
# NOTE(review): nothing here waits for the server to accept connections -- presumably the
# following cells take long enough for it to come up; confirm before relying on Run All.
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch


In [4]:
def init_model(model_name='deepset/roberta-base-squad2-covid', use_gpu = False):
    """Create the extractive-QA reader.

    Parameters
    ----------
    model_name : str
        Model identifier or path handed to ``FARMReader`` as
        ``model_name_or_path`` (e.g. "deepset/roberta-base-squad2").
    use_gpu : bool
        Forwarded unchanged to ``FARMReader``.

    Returns
    -------
    FARMReader
        The reader built from the two settings above.
    """
    reader_settings = {
        'model_name_or_path': model_name,
        'use_gpu': use_gpu,
    }
    return FARMReader(**reader_settings)


In [5]:
# covid qa document store 
def init_context(context = '/content/drive/MyDrive/DeepLearning/COVID-QA.json'):
    """Load a SQuAD-style QA file and shape it for a document store.

    Parameters
    ----------
    context : str
        Path to a JSON file laid out as
        ``{"data": [{"paragraphs": [{"context": ..., "qas": [{"question": ...}]}]}]}``.

    Returns
    -------
    list of dict
        One ``{'content': <context>, 'meta': {'name': <question>}}`` entry per
        question, in file order. The paragraph's context is repeated for each
        of its questions. An empty ``data`` list yields an empty list.

    Raises
    ------
    KeyError
        If an article, paragraph or question entry lacks an expected key.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(context, encoding='utf-8') as jsonfile:
        data = json.load(jsonfile)

    # Collect (context, question) pairs in a single pass. Every paragraph of an
    # article is visited, so files with more than one paragraph per article do
    # not silently lose their later paragraphs.
    context_question_pairs = [
        (paragraph['context'], qa['question'])
        for article in data['data']
        for paragraph in article['paragraphs']
        for qa in paragraph['qas']
    ]

    # the way document_store wants data formatted
    return [
        {
            'content' : str(paragraph_text),
            'meta' : {
                'name' : str(question)
            }
        }
        for paragraph_text, question in tqdm(context_question_pairs)
    ]

In [6]:
def create_docstore(context_dict, port):
    """Create an Elasticsearch document store holding exactly ``context_dict``.

    Parameters
    ----------
    context_dict : list of dict
        Documents to index, passed as ``documents`` to ``write_documents``
        (e.g. the list returned by ``init_context``).
    port : int
        Port of the Elasticsearch server to connect to.

    Returns
    -------
    ElasticsearchDocumentStore
        The store after its previous documents were deleted and
        ``context_dict`` was written.
    """
    # Forward the caller's port instead of silently ignoring the argument.
    document_store_covid = ElasticsearchDocumentStore(
        port=port
    )
    # Remove whatever the store currently holds so it contains only the new documents.
    document_store_covid.delete_documents()
    document_store_covid.write_documents(
        documents=context_dict
    )

    return document_store_covid

In [7]:
# wiki simple
# Load the "20220301.simple" configuration of the `wikipedia` dataset and convert
# its train split to a DataFrame.
ds = load_dataset('wikipedia', "20220301.simple")
df_wiki = ds.data['train'].to_pandas()

# Replace URLs, newlines and every character that is not an ASCII letter, digit
# or '.' with a space. The class uses A-Z (not A-z): the A-z range also covers
# the punctuation between 'Z' and 'a' ([ \ ] ^ _ `), which would slip through.
df_wiki['context_cleaned'] = df_wiki.text.apply(
    lambda x: re.sub(r'https?:\/\/.*?[\s+]|\n|[^a-zA-Z0-9.]', ' ', x)
)
documents_lst_wiki = df_wiki.to_dict(orient='records')

# Same layout as the other document lists in this notebook: the article text as
# 'content' and its title under meta['name'].
dicts_wiki = [
    {
        'content' : elm['text'],
        'meta' : {
            'name' : elm['title']
        }
    } 
    for elm in tqdm(documents_lst_wiki)
]

Downloading builder script:   0%|          | 0.00/35.9k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/30.4k [00:00<?, ?B/s]

Downloading and preparing dataset wikipedia/20220301.simple (download: 228.58 MiB, generated: 224.18 MiB, post-processed: Unknown size, total: 452.76 MiB) to /root/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559...


Downloading:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/235M [00:00<?, ?B/s]

Dataset wikipedia downloaded and prepared to /root/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 205328/205328 [00:00<00:00, 409161.31it/s]


In [8]:
# Build the shared objects used by the question-answering service:
#   * reader: the COVID-19 SQuAD 2 model named below, loaded through init_model with use_gpu=True
#   * document_store: an Elasticsearch store on port 9200, emptied right after it is created
#   * one BM25 and one TF-IDF retriever, each wrapped in its own ExtractiveQAPipeline
#     so the two retrievers' predictions can be compared
import time
model = 'deepset/roberta-base-squad2-covid'
reader = init_model(model_name=model, use_gpu = True)
# Disabled alternative: index the local COVID-QA file instead of starting from an empty store.
#context = init_context('/content/drive/MyDrive/DeepLearning/COVID-QA.json')
#document_store = create_docstore(context, 9200)
document_store = ElasticsearchDocumentStore(
    port=9200
)
# Drop any documents the store currently holds.
document_store.delete_documents()

# Both retrievers are built on the same (currently empty) store.
retriever_bm25 = BM25Retriever(document_store=document_store)
retriever_tfidf = TfidfRetriever(document_store=document_store)

# NOTE(review): `predict` builds its own retrievers and pipelines on every call, so these
# two pipeline objects look unused by the serving path -- confirm before removing.
pipe_covid = ExtractiveQAPipeline(reader, retriever_bm25)
pipe_tfidf = ExtractiveQAPipeline(reader, retriever_tfidf)

  return torch._C._cuda_getDeviceCount() > 0


Downloading config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/476M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [9]:
# The next cell answers questions sent from the front end, using pages fetched through the Wikipedia API as the document store and the COVID-19 SQuAD 2 model as the reader.

In [10]:

# Passed to the pipelines as the Reader's "top_k" (answers returned per question);
# `predict` also uses it to cap how many Wikipedia pages are fetched.
top_k_reader = 1
# Passed to the pipelines as the Retriever's "top_k".
top_k_retriever = 5

# The Anvil uplink key is a credential, so it must not be stored in the notebook.
# It is read from the ANVIL_UPLINK_KEY environment variable, with an interactive
# prompt as fallback. Any key that has been saved in a shared copy of this
# notebook should be rotated in the Anvil app's uplink settings.
anvil_uplink_key = os.environ.get("ANVIL_UPLINK_KEY") or getpass.getpass("Anvil uplink key: ")
anvil.server.connect(anvil_uplink_key)

@anvil.server.callable
def predict(query):
    """Answer ``query`` from live Wikipedia content with both retrievers.

    The Wikipedia API is searched for ``query``; the leading result page(s)
    replace the contents of the shared ``document_store``; the question is then
    run through one BM25-based and one TF-IDF-based extractive QA pipeline.

    Parameters
    ----------
    query : str
        Question text sent by the front end.

    Returns
    -------
    str
        Two lines, "TF-IDF Retriever: <answer>" followed by
        "BM25 Retriever: <answer>", or the string 'No result found' when the
        search returns nothing or any step fails.
    """
    no_result = 'No result found'

    def _top_answer(retriever):
        # Run one retriever + reader pipeline and return its best answer text.
        # The params dict is built per call so the two runs never share state.
        prediction = ExtractiveQAPipeline(reader, retriever).run(
            query=query,
            params={
                "Retriever" : {"top_k": top_k_retriever},
                "Reader": {"top_k": top_k_reader}
            }
        )
        return str(prediction['answers'][0].answer)

    try:
        # wikipedia API context prediction
        search_results_wikiAPI = wiki.search(query)
        if not search_results_wikiAPI:
            # Nothing to index: leave the document store untouched.
            return no_result

        # NOTE(review): the number of pages is capped with `top_k_reader`;
        # presumably `top_k_retriever` was intended -- confirm before changing,
        # since it alters how many pages are downloaded per question.
        wiki_pages = [
            wiki.page(res)
            for res in search_results_wikiAPI[:top_k_reader]
        ]

        dicts_wikiAPI = [
            {
                'content' : wiki_page.content,
                'meta' : {
                    'name' : wiki_page.title
                }
            }
            for wiki_page in tqdm(wiki_pages)
        ]

        # The store holds only the pages for the current question.
        document_store.delete_documents()
        document_store.write_documents(dicts_wikiAPI)

        # Retrievers are created after indexing -- presumably so the TF-IDF
        # retriever sees the freshly written pages; confirm before hoisting.
        retriever_bm25 = BM25Retriever(document_store=document_store)
        retriever_tfidf = TfidfRetriever(document_store=document_store)

        # BM25 runs first, then TF-IDF.
        answer_bm25 = _top_answer(retriever_bm25)
        answer_tfidf = _top_answer(retriever_tfidf)

        return f'TF-IDF Retriever: {answer_tfidf}\nBM25 Retriever: {answer_bm25}'
    except Exception as exc:
        # Best-effort service: report the failure instead of hiding it, but let
        # KeyboardInterrupt / SystemExit propagate so the kernel can be stopped.
        print(f'predict failed for {query!r}: {exc!r}')
        return no_result


# Keep this cell running so the uplink stays connected and `predict` can keep serving
# calls from the front end; interrupt the kernel to stop.
anvil.server.wait_forever()

Connecting to wss://anvil.works/uplink
Anvil websocket open
Connected to "Default environment" as SERVER


100%|██████████| 1/1 [00:00<00:00,  1.63it/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  2.96 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.58 Batches/s]


KeyboardInterrupt: ignored