# Basic Experiment for QA Pipeline

In [6]:
import pickle
import pandas as pd
import os

In [5]:
#import haystack and FARM utils
from haystack import Finder
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers

# Start an Elasticsearch 7.6.2 container via Docker (single-node discovery).
# `-d` runs the container detached and `-p 9200:9200` publishes port 9200 on localhost.
# NOTE(review): Elasticsearch presumably needs some start-up time before it accepts
# connections, so the next cell may fail if run immediately — TODO confirm.
# NOTE(review): re-running this cell will try to start another container on the same port.
! docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2

ModuleNotFoundError: No module named 'haystack.preprocessor'

In [3]:
# Connect to the Elasticsearch instance on localhost; documents live in the "document" index.
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore

document_store = ElasticsearchDocumentStore(
    host="localhost",
    username="",
    password="",
    index="document",
)

10/03/2020 11:43:12 - INFO - elasticsearch -   HEAD http://localhost:9200/document [status:200 request:0.053s]
10/03/2020 11:43:12 - INFO - elasticsearch -   HEAD http://localhost:9200/label [status:200 request:0.008s]


In [4]:
# Import data: the processed log-frames stored in ../data/logframes_clean.pkl.
# The whole path is assembled with os.path.join, so no hard-coded "/" separator is needed.
logframes_path = os.path.abspath(os.path.join('..', 'data', 'logframes_clean.pkl'))

# NOTE: pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open(logframes_path, 'rb') as handle:
    data = pickle.load(handle)

In [5]:
# Build one Series per row of `data`: the index holds the pieces of
# `full_obj_or_outcome` split on "'," and every value is that row's PIMS_ID.
per_row_series = [
    pd.Series(record['PIMS_ID'], record['full_obj_or_outcome'].split("',"))
    for _, record in data.iterrows()
]

# Stack all rows, move the text pieces out of the index and name both columns.
splitted = (
    pd.concat(per_row_series)
    .reset_index()
    .rename(columns={"index": "text", 0: "PIMS_ID"})
)

In [6]:
# One {'text', 'PIMS_ID'} record per row of `splitted`, then frozen into a tuple.
dict_list = [
    {'text': record.text, 'PIMS_ID': record.PIMS_ID}
    for _, record in splitted.iterrows()
]
dicts = tuple(dict_list)

In [12]:
# One {'text', 'PIMS_ID'} record per row of `data`, using the `description`
# column as the text, then frozen into a tuple.
dict_list_2 = [
    {'text': record.description, 'PIMS_ID': record.PIMS_ID}
    for _, record in data.iterrows()
]
descriptions = tuple(dict_list_2)

In [24]:
# Write the `descriptions` records (text + PIMS_ID) into the Elasticsearch document store.
# NOTE(review): re-running this cell presumably writes the same documents again,
# producing duplicates in the index — TODO confirm.
document_store.write_documents(descriptions)

10/03/2020 12:13:16 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:0.769s]
10/03/2020 12:13:18 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.022s]


In [25]:
# Sparse retriever backed by the Elasticsearch document store; it supplies the
# candidate documents that the reader later searches for answers.
from haystack.retriever.sparse import ElasticsearchRetriever
retriever = ElasticsearchRetriever(document_store=document_store)

In [None]:
# Create the reader from the "deepset/roberta-base-squad2" model, on CPU (use_gpu=False).
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)

In [26]:
# Save the current reader to a local directory.
# Requires `reader` to exist already (it is created in the previous cell).
reader.save("models/roberta-temp") 

# Load the RoBERTa reader from the local copy, on CPU, with max_seq_len=500 and doc_stride=50.
reader = FARMReader(model_name_or_path="models/roberta-temp", use_gpu=False, max_seq_len=500, doc_stride=50)
# Combine the reader with whatever `retriever` is currently bound to.
finder = Finder(reader, retriever)

# Guidance for choosing the right pre-trained model:
# https://haystack.deepset.ai/en/docs/readermd#Choosing-the-Right-Model

10/03/2020 12:13:21 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
10/03/2020 12:13:27 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
10/03/2020 12:13:27 - INFO - farm.infer -   Got ya 3 parallel workers to do inference ...
10/03/2020 12:13:27 - INFO - farm.infer -    0    0    0 
10/03/2020 12:13:27 - INFO - farm.infer -   /w\  /w\  /w\
10/03/2020 12:13:27 - INFO - farm.infer -   /'\  / \  /'\
10/03/2020 12:13:27 - INFO - farm.infer -       


In [29]:
# Ask a question: top_k_retriever=10 candidates from the retriever,
# top_k_reader=5 passed to the reader.
prediction = finder.get_answers(
    question="What system shall be implemented in Serbia?",
    top_k_retriever=10,
    top_k_reader=5,
)

10/03/2020 12:14:40 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.071s]
10/03/2020 12:14:40 - INFO - haystack.retriever.sparse -   Got 10 candidates from retriever
10/03/2020 12:14:40 - INFO - haystack.finder -   Reader is looking for detailed answer in 7836 chars ...
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.30s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.23 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.26 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.29 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.25 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.17 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.24 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.24 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.18 Batches/s]
Inferencing Samples: 100%|█

In [30]:
# Pretty-print the answers returned by the finder for the question above.
print_answers(prediction)

{   'answers': [   {   'answer': 'a monitoring, reporting, and verification '
                                 '(MRV) system',
                       'context': 'ement. \r\n'
                                  '\r\n'
                                  'The project will finalize and launch a '
                                  'monitoring, reporting, and verification '
                                  '(MRV) system that will provide more '
                                  'accurate information and',
                       'document_id': 'bbf48259-0b47-4292-a6d5-ed35ef863b0b',
                       'meta': {'PIMS_ID': '6211'},
                       'offset_end': 102,
                       'offset_end_in_doc': 572,
                       'offset_start': 48,
                       'offset_start_in_doc': 518,
                       'probability': 0.7726760525124913,
                       'score': 9.787870407104492},
                   {   'answer': 'a monitoring, reporting, and verifica

# Use Dense Passage Retriever 

In [21]:
# Dense Passage Retriever:
#
#     Utilizes BERT to embed both the document and the query to compute a more
#     contextual similarity score for ranking.
#
#     Embedding of documents is computationally very expensive and is probably
#     infeasible without proper GPU support.

from haystack.retriever.dense import DensePassageRetriever

# Rebinds `retriever`: from here on it refers to the dense retriever,
# no longer to the Elasticsearch retriever created earlier.
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=True,
    embed_title=True,
    max_seq_len=256,
    batch_size=16,
    remove_sep_tok_from_untitled_passages=True,
)

10/03/2020 11:49:13 - INFO - filelock -   Lock 6379410000 acquired on /Users/jonas/.cache/torch/transformers/8fdd0d2838c23f921379f2b0322aecf406cbdaa97ffecc544e3a1d49a7c302bd.6f90756c59007364d7842118056ad653f39f4d340fbe20bcc04037d2a45cb0f7.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=437986065.0, style=ProgressStyle(descri…

10/03/2020 11:52:44 - INFO - filelock -   Lock 6379410000 released on /Users/jonas/.cache/torch/transformers/8fdd0d2838c23f921379f2b0322aecf406cbdaa97ffecc544e3a1d49a7c302bd.6f90756c59007364d7842118056ad653f39f4d340fbe20bcc04037d2a45cb0f7.lock





Some weights of DPRQuestionEncoder were not initialized from the model checkpoint at facebook/dpr-question_encoder-single-nq-base and are newly initialized: ['question_encoder.bert_model.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
10/03/2020 11:52:52 - INFO - filelock -   Lock 6377104672 acquired on /Users/jonas/.cache/torch/transformers/d1c705617c02da7a616f4b5a8cb445a7f78e84bc4f9e26378c89901d97e16d78.232fed629becb590e5b2ac6c6124f9d1561ef7a1d17ad0394232dd46a0835002.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=437983985.0, style=ProgressStyle(descri…

10/03/2020 11:56:23 - INFO - filelock -   Lock 6377104672 released on /Users/jonas/.cache/torch/transformers/d1c705617c02da7a616f4b5a8cb445a7f78e84bc4f9e26378c89901d97e16d78.232fed629becb590e5b2ac6c6124f9d1561ef7a1d17ad0394232dd46a0835002.lock





Some weights of DPRContextEncoder were not initialized from the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base and are newly initialized: ['ctx_encoder.bert_model.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Update the embeddings in the document store using the current `retriever`.
# Do not run this without GPU support: embedding the documents is very expensive
# (the saved run below was stopped with a KeyboardInterrupt).
document_store.update_embeddings(retriever)

10/03/2020 11:57:29 - INFO - elasticsearch -   POST http://localhost:9200/document/_search?scroll=5m&size=1000 [status:200 request:0.525s]
10/03/2020 11:57:30 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.137s]
10/03/2020 11:57:30 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.121s]
10/03/2020 11:57:30 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.084s]
10/03/2020 11:57:30 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.113s]
10/03/2020 11:57:30 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.104s]
10/03/2020 11:57:30 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.091s]
10/03/2020 11:57:30 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.079s]
10/03/2020 11:57:30 - INFO - elasticsearch

10/03/2020 11:57:37 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.072s]
10/03/2020 11:57:37 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.074s]
10/03/2020 11:57:37 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.067s]
10/03/2020 11:57:37 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.069s]
10/03/2020 11:57:37 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.073s]
10/03/2020 11:57:37 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.073s]
10/03/2020 11:57:38 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.066s]
10/03/2020 11:57:38 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.084s]
10/03/2020 11:57:38 - INFO - elasticsearch -   POST http://local

10/03/2020 11:57:43 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.072s]
10/03/2020 11:57:43 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.082s]
10/03/2020 11:57:43 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.075s]
10/03/2020 11:57:43 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.082s]
10/03/2020 11:57:43 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.066s]
10/03/2020 11:57:44 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.067s]
10/03/2020 11:57:44 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.060s]
10/03/2020 11:57:44 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.056s]
10/03/2020 11:57:44 - INFO - elasticsearch -   POST http://local

KeyboardInterrupt: 

In [None]:
# Rebuild the finder so that it uses the current `retriever`
# (the DensePassageRetriever, if the cell that creates it has been run).
finder = Finder(reader, retriever)

In [None]:
# Ask a follow-up question: top_k_retriever=10 candidates from the retriever,
# top_k_reader=5 passed to the reader.
prediction = finder.get_answers(
    question="What is the MRV system supporting in Serbia?",
    top_k_retriever=10,
    top_k_reader=5,
)