# Basic Experiment for QA Pipeline

In [20]:
import pickle
import pandas as pd
import os
import sys
# Put the repository's `src` directory first on the import path so that
# project-local modules can be imported from this notebook.
src_dir = os.path.abspath(os.path.join('..', 'src'))
sys.path.insert(0, src_dir)

In [21]:
#import haystack and FARM utils
from haystack import Finder
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers

# Start a single-node Elasticsearch 7.6.2 container in the background, published on port 9200.
# The command fails with "port is already allocated" when something is already bound to
# host port 9200 (for example a container started by an earlier run of this cell).
! docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2

a3642e3992841dfc4f51d2306d25d7684b1bf083adf1f4943f0feb5b1dcbbe1d
docker: Error response from daemon: driver failed programming external connectivity on endpoint heuristic_lewin (166659bc71a9c0d03d04834c2da93cdbee29a8cc66901b790cb47863b3ed1f74): Bind for 0.0.0.0:9200 failed: port is already allocated.


In [11]:
# Connect to the local Elasticsearch instance and use its "document" index as the document store.
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore(
    host="localhost",
    username="",
    password="",
    index="document",
)

10/07/2020 18:36:47 - INFO - elasticsearch -   PUT http://localhost:9200/document [status:200 request:0.284s]
10/07/2020 18:36:47 - INFO - elasticsearch -   PUT http://localhost:9200/label [status:200 request:0.163s]


In [32]:
# Import data.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# pickle files that were produced by this project.
data_dir = os.path.abspath(os.path.join('..', 'data'))

# Processed log-frames.
with open(os.path.join(data_dir, 'logframes_clean.pkl'), 'rb') as handle:
    data = pickle.load(handle)

# Project descriptions from the taxonomy.
with open(os.path.join(data_dir, 'project_description.pkl'), 'rb') as handle:
    description = pickle.load(handle)

# Rename the project key column so it matches the "PIMS_ID" column used by the log-frames.
description = description.rename(columns={"pims_#": "PIMS_ID"})

description.head(2)

Unnamed: 0,PIMS_ID,project_description
2,1584,This programme will contribute to the protection of the biological diversity...
4,1878,The project contributes to the number of GEF projects supported in the agric...


In [33]:
# For every log-frame row, split `full_obj_or_outcome` on "'," and build a Series
# whose index holds the text fragments and whose values repeat the row's PIMS_ID.
fragment_series = [
    pd.Series(logframe['PIMS_ID'], index=logframe['full_obj_or_outcome'].split("',"))
    for _, logframe in data.iterrows()
]

# Stack all per-row Series and turn the fragment index into a regular "text" column.
splitted = (
    pd.concat(fragment_series)
    .reset_index()
    .rename(columns={"index": "text", 0: "PIMS_ID"})
)

In [34]:
# One {'text', 'PIMS_ID'} record per log-frame fragment, frozen into a tuple.
dict_list = [
    {'text': fragment.text, 'PIMS_ID': fragment.PIMS_ID}
    for _, fragment in splitted.iterrows()
]
dicts = tuple(dict_list)

In [36]:
# One {'text', 'PIMS_ID'} record per project description, frozen into a tuple.
dict_list_2 = [
    {'text': project.project_description, 'PIMS_ID': project.PIMS_ID}
    for _, project in description.iterrows()
]
descriptions = tuple(dict_list_2)

In [37]:
# Write the project descriptions to the Elasticsearch document store.
# Only `descriptions` is written here; the log-frame fragments in `dicts` are not.
document_store.write_documents(descriptions)

10/07/2020 18:47:49 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.706s]
10/07/2020 18:47:50 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.008s]


In [39]:
# Retriever that queries the Elasticsearch document store for candidate documents.
from haystack.retriever.sparse import ElasticsearchRetriever
retriever = ElasticsearchRetriever(document_store=document_store)

In [41]:
# Load the baseline RoBERTa model from FARM (Hugging Face is also possible) and
# store it on disk. The download is skipped when the model has already been
# saved, so the cell can be re-run (and the notebook run top to bottom on a
# fresh checkout) without editing it.
if not os.path.exists("models/roberta-temp"):
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
    reader.save("models/roberta-temp")

10/07/2020 18:48:30 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
10/07/2020 18:48:30 - INFO - farm.infer -   Could not find `deepset/roberta-base-squad2` locally. Try to download from model hub ...
	 We guess it's an *ENGLISH* model ... 
	 If not: Init the language model by supplying the 'language' param.
10/07/2020 18:48:46 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
10/07/2020 18:48:46 - INFO - farm.infer -   Got ya 3 parallel workers to do inference ...
10/07/2020 18:48:46 - INFO - farm.infer -    0    0    0 
10/07/2020 18:48:46 - INFO - farm.infer -   /w\  /w\  /w\
10/07/2020 18:48:46 - INFO - farm.infer -   /'\  / \  /'\
10/07/2020 18:48:46 - INFO - farm.infer -       
10/07/2020 18:48:46 - INFO - haystack.reader.farm -   Saving reader model to models/roberta-temp


In [1]:
# Load the RoBERTa reader from disk.
# For choosing the right pre-trained model see:
# https://haystack.deepset.ai/en/docs/readermd#Choosing-the-Right-Model
reader = FARMReader(
    model_name_or_path="models/roberta-temp",
    use_gpu=False,
    max_seq_len=500,
    doc_stride=50,
)

NameError: name 'FARMReader' is not defined

In [42]:
# Combine the reader and the retriever into a single question-answering pipeline.
finder = Finder(reader, retriever)

In [43]:
# Query the pipeline: the retriever is asked for 10 candidates, the reader for up to 5 answers.
prediction = finder.get_answers(
    question="What system shall be implemented in Serbia?",
    top_k_retriever=10,
    top_k_reader=5,
)

10/07/2020 18:49:47 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.295s]
10/07/2020 18:49:47 - INFO - haystack.retriever.sparse -   Got 10 candidates from retriever
10/07/2020 18:49:47 - INFO - haystack.finder -   Reader is looking for detailed answer in 17281 chars ...
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.11 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.30 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.93s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  2.04 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  2.45 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.91 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:02<00:00,  2.31s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.21 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.70s/ Batches]
Inferencing Samples: 100%|

In [44]:
# Pretty-print the answers returned by the finder.
print_answers(prediction)

{   'answers': [   {   'answer': 'a monitoring, reporting, and verification '
                                 '(MRV) system',
                       'context': 'reement. \n'
                                  '\n'
                                  'The project will finalize and launch a '
                                  'monitoring, reporting, and verification '
                                  '(MRV) system that will provide more '
                                  'accurate information and',
                       'document_id': 'a3056f62-5c73-4046-a46c-ad4d16c2595e',
                       'meta': {'PIMS_ID': 6211},
                       'offset_end': 102,
                       'offset_end_in_doc': 570,
                       'offset_start': 48,
                       'offset_start_in_doc': 516,
                       'probability': 0.7726760525124913,
                       'score': 9.787870407104492},
                   {   'answer': 'a uniform national wildlife PA system',
 

# Use Dense Passage Retriever 

In [45]:
# Dense Passage Retriever:
#   Utilizes BERT to embed both the document and the query to compute a more
#   contextual similarity score for ranking.
#   Embedding of documents is computationally very expensive and is probably
#   infeasible without proper GPU support.
from haystack.retriever.dense import DensePassageRetriever

# Encoder checkpoints and inference settings for the retriever.
dpr_settings = dict(
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=True,
    embed_title=True,
    max_seq_len=256,
    batch_size=16,
    remove_sep_tok_from_untitled_passages=True,
)

# Rebinds `retriever`: from here on it refers to the dense retriever.
retriever = DensePassageRetriever(document_store=document_store, **dpr_settings)

10/07/2020 18:50:24 - INFO - filelock -   Lock 6070406640 acquired on /Users/jonas/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…

10/07/2020 18:50:25 - INFO - filelock -   Lock 6070406640 released on /Users/jonas/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.lock





10/07/2020 18:50:26 - INFO - filelock -   Lock 6057116864 acquired on /Users/jonas/.cache/torch/transformers/4b05580c0bfb2b640a50c1c6ae3fe9bca923871a29e0182927c086905d6c4c47.7652e92693c670fb8dfd7ec1f9191e3f82673742ff6a86cde9133a4ea6002ced.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=493.0, style=ProgressStyle(description_…

10/07/2020 18:50:27 - INFO - filelock -   Lock 6057116864 released on /Users/jonas/.cache/torch/transformers/4b05580c0bfb2b640a50c1c6ae3fe9bca923871a29e0182927c086905d6c4c47.7652e92693c670fb8dfd7ec1f9191e3f82673742ff6a86cde9133a4ea6002ced.lock





10/07/2020 18:50:27 - INFO - filelock -   Lock 6057589776 acquired on /Users/jonas/.cache/torch/transformers/8fdd0d2838c23f921379f2b0322aecf406cbdaa97ffecc544e3a1d49a7c302bd.6f90756c59007364d7842118056ad653f39f4d340fbe20bcc04037d2a45cb0f7.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=437986065.0, style=ProgressStyle(descri…

10/07/2020 18:53:10 - INFO - filelock -   Lock 6057589776 released on /Users/jonas/.cache/torch/transformers/8fdd0d2838c23f921379f2b0322aecf406cbdaa97ffecc544e3a1d49a7c302bd.6f90756c59007364d7842118056ad653f39f4d340fbe20bcc04037d2a45cb0f7.lock





Some weights of DPRQuestionEncoder were not initialized from the model checkpoint at facebook/dpr-question_encoder-single-nq-base and are newly initialized: ['question_encoder.bert_model.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
10/07/2020 18:53:16 - INFO - filelock -   Lock 5006083408 acquired on /Users/jonas/.cache/torch/transformers/f6388f32b32eac5dad8f0f9c7009ce69e967c1b65ebae62f805fced8022ea991.9500f04f28d7c0ca5f9c265db7ba5030897a2d752451412827f7dec185b1ee36.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=492.0, style=ProgressStyle(description_…

10/07/2020 18:53:17 - INFO - filelock -   Lock 5006083408 released on /Users/jonas/.cache/torch/transformers/f6388f32b32eac5dad8f0f9c7009ce69e967c1b65ebae62f805fced8022ea991.9500f04f28d7c0ca5f9c265db7ba5030897a2d752451412827f7dec185b1ee36.lock





10/07/2020 18:53:17 - INFO - filelock -   Lock 5006083072 acquired on /Users/jonas/.cache/torch/transformers/d1c705617c02da7a616f4b5a8cb445a7f78e84bc4f9e26378c89901d97e16d78.232fed629becb590e5b2ac6c6124f9d1561ef7a1d17ad0394232dd46a0835002.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=437983985.0, style=ProgressStyle(descri…

10/07/2020 18:56:02 - INFO - filelock -   Lock 5006083072 released on /Users/jonas/.cache/torch/transformers/d1c705617c02da7a616f4b5a8cb445a7f78e84bc4f9e26378c89901d97e16d78.232fed629becb590e5b2ac6c6124f9d1561ef7a1d17ad0394232dd46a0835002.lock





Some weights of DPRContextEncoder were not initialized from the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base and are newly initialized: ['ctx_encoder.bert_model.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [55]:
# Check the average description length to see whether the maximum sequence
# length of the BERT model is sufficient.
# NOTE(review): str.len() counts characters, whereas the model limit is presumably
# expressed in tokens, so this is only a rough proxy — TODO confirm the threshold.
MAX_SEQ_LEN = 512
mean_len = description.project_description.str.len().mean()
print(mean_len)
if mean_len > MAX_SEQ_LEN:
    print('sliding window or pre-processing has to be enabled for proper results')

1444.4657039711192
sliding window or pre-processing has to enabled for proper results


In [22]:
# Update the document embeddings in the document store using the current retriever.
# Do not run this cell without GPU support: embedding the documents is
# computationally very expensive.
document_store.update_embeddings(retriever)

10/03/2020 11:57:29 - INFO - elasticsearch -   POST http://localhost:9200/document/_search?scroll=5m&size=1000 [status:200 request:0.525s]
10/03/2020 11:57:30 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.137s]
10/03/2020 11:57:30 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.121s]
10/03/2020 11:57:30 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.084s]
10/03/2020 11:57:30 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.113s]
10/03/2020 11:57:30 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.104s]
10/03/2020 11:57:30 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.091s]
10/03/2020 11:57:30 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.079s]
10/03/2020 11:57:30 - INFO - elasticsearch

10/03/2020 11:57:37 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.072s]
10/03/2020 11:57:37 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.074s]
10/03/2020 11:57:37 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.067s]
10/03/2020 11:57:37 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.069s]
10/03/2020 11:57:37 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.073s]
10/03/2020 11:57:37 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.073s]
10/03/2020 11:57:38 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.066s]
10/03/2020 11:57:38 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.084s]
10/03/2020 11:57:38 - INFO - elasticsearch -   POST http://local

10/03/2020 11:57:43 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.072s]
10/03/2020 11:57:43 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.082s]
10/03/2020 11:57:43 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.075s]
10/03/2020 11:57:43 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.082s]
10/03/2020 11:57:43 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.066s]
10/03/2020 11:57:44 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.067s]
10/03/2020 11:57:44 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.060s]
10/03/2020 11:57:44 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.056s]
10/03/2020 11:57:44 - INFO - elasticsearch -   POST http://local

KeyboardInterrupt: 

In [None]:
# Rebuild the pipeline so it uses the Dense Passage Retriever now bound to `retriever`.
finder = Finder(reader, retriever)

In [None]:
# Query the pipeline: the retriever is asked for 10 candidates, the reader for up to 5 answers.
prediction = finder.get_answers(
    question="What is the MRV system supporting in Serbia?",
    top_k_retriever=10,
    top_k_reader=5,
)