# Dense retrieval + bert(rerank)

작성일자: 2101\
작성자: 조진욱\
목표: DPR 같은 모델을 만들자\
순서: 3-1 과 동일

비고:
1. load 함수에서 json 형태로 불러오도록 되어있어 필요없지만 retrieval 의 결과를 json 으로 저장하는 과정을 거침. 
2. reader(mrc) 모델은 encoder 모델에서 topk 를 뽑은 뒤 합쳐서 rerank 만 도입한 것. 구현하기 쉬우나 학습하는데 시간이 걸려서 일단 사전 훈련된 모델을 가져옴 
3. 그러나 1-2 best metric 보다 성능이 안나와야 정상. 왜냐하면 retrieval 과정에서 잘못된 context 들이 선택되었을 것이기 때문.

## Step1 retrieval 모델 불러와서 각 query 에 대한 document 찾기 
현재 dense retrieval은 기본값으로 하나의 query 당 5개의 doc을 불러오도록 해뒀으나, 우리는 mrc 형식으로 best 1개 document 만을 선택하도록 할 것이므로 n_doc 값을 1로 조정해야함

In [6]:
import os
from retrieval import DenseRetrieval
from utils import save_json
import config as cfg

In [7]:
# Dataset split used throughout the notebook; it is interpolated into the
# QA input file name and the retrieval output file names below.
mode = "dev"

In [16]:
# Question-encoder checkpoint and the QA file for the selected split.
model_name = 'facebook/dpr-question_encoder-single-nq-base'
qas_file = f'{mode}_qa.json'

# All retrieval artefacts (passages, index, results) live under cfg.dense_dir.
passages_path = os.path.join(cfg.dense_dir, "dpr_dataset")
index_path = os.path.join(cfg.dense_dir, "dpr_dataset_hnsw_index.faiss")
res_path = os.path.join(cfg.dense_dir, f"retrieved_{mode}_dense.csv")

# Build the retriever, look up a context for every question in the QA file,
# and persist the resulting frame as a tab-separated file.
ret_ds = DenseRetrieval.from_pretrained(
    model_name,
    passages_path=passages_path,
    index_path=index_path,
    mode=mode,
)
cqa_df = ret_ds.retrieve(qas_file)
cqa_df.to_csv(res_path, sep='\t', index=False)

searching indexed dataset
passages and index is exist


HBox(children=(FloatProgress(value=0.0, max=4639.0), HTML(value='')))




In [17]:
import pandas as pd

# Read the tab-separated retrieval output back from disk and preview the
# first rows as the cell's display value.
cqa_df = pd.read_csv(res_path, index_col=False, sep='\t')
cqa_df.head()

Unnamed: 0,question,que_id,answers,context,context_id
0,Who is the creator of American Idol?,56d21eb1e7d4791d00902667,"{'answer_start': 67, 'text': 'Simon Fuller'}",American Idol was based on the British show Po...,[6]
1,What company produces American idol?,56d21eb1e7d4791d00902668,"{'answer_start': 96, 'text': '19 Entertainment'}","19 Recordings, a recording label owned by 19 E...",[172]
2,What year did American Idol begin airing?,56d21eb1e7d4791d00902669,"{'answer_start': 201, 'text': '2002'}",American Idol is an American singing competiti...,[1]
3,What British show is American Idols format bas...,56d21eb1e7d4791d0090266a,"{'answer_start': 270, 'text': 'Pop Idol'}",American Idol was based on the British show Po...,[6]
4,What television network does American Idol air...,56d21eb1e7d4791d0090266b,"{'answer_start': 185, 'text': 'Fox'}",American Idol is broadcast to over 100 nations...,[176]


In [18]:
# Name of the JSON copy of the retrieval results; the reader stage below
# loads its examples through this file name.
result_filename = "retrieved_{}_dense.json".format(mode)
save_json(cqa_df, result_filename)

filename: ./data/dense/retrieved_dev_dense.json


## Step2 bert 모델이 만들어진 json 파일을 다시 불러와 MRC 진행

저장된 json 을 torch Dataset 형식으로 변환한 뒤, MRC 방식으로 각 인스턴스의 start / end position 을 예측하고 그 결과를 EM, F1 score 로 평가한다.

In [19]:
from transformers import DPRReader, DPRReaderTokenizer

# The tokenizer and the reader are loaded from the same pretrained checkpoint.
_reader_checkpoint = 'facebook/dpr-reader-single-nq-base'
tokenizer = DPRReaderTokenizer.from_pretrained(_reader_checkpoint)

# Load the reader weights and place the model on the configured device.
model = DPRReader.from_pretrained(_reader_checkpoint).to(cfg.device)

In [20]:
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm, trange
from utils import load_and_cache_examples

In [21]:
# from transformers.data.metrics.squad_metrics import (
    
from squad_metrics import(
    compute_predictions_logits,
    squad_evaluate,
)



from transformers.data.processors.squad import SquadResult, SquadProcessor, squad_convert_examples_to_features

In [22]:
# Build the evaluation dataset, together with the raw examples and their
# features, from the retrieval JSON named by result_filename.
dataset, examples, features = load_and_cache_examples(
    cfg,
    tokenizer,
    output_examples=True,
    mode_or_filename=result_filename,
)

# Walk the dataset in order, in batches of cfg.eval_batch_size.
eval_sampler = SequentialSampler(dataset)
eval_dataloader = DataLoader(
    dataset,
    batch_size=cfg.eval_batch_size,
    sampler=eval_sampler,
)

  0%|          | 0/1 [00:00<?, ?it/s]

Creating features from dataset file at %s ./data/squad


100%|██████████| 1/1 [00:00<00:00,  1.14it/s]
convert squad examples to features: 100%|██████████| 4639/4639 [00:07<00:00, 661.31it/s]
add example index and unique id: 100%|██████████| 4639/4639 [00:00<00:00, 1314598.76it/s]


In [23]:
def evaluate(model, tokenizer):
    """Run the reader over the evaluation set and score its predictions.

    Relies on the notebook-level globals ``dataset``, ``eval_dataloader``,
    ``features``, ``examples`` and ``cfg`` created in earlier cells.

    Args:
        model: Reader model. It is called with ``input_ids`` and
            ``attention_mask`` and must return an object exposing
            ``start_logits`` and ``end_logits``.
        tokenizer: Tokenizer forwarded to ``compute_predictions_logits``.

    Returns:
        The value returned by ``squad_evaluate(examples, predictions)``.
    """
    print("***** Running evaluation *****")
    print("  Num examples = ", len(dataset))
    print("  Batch size = ", cfg.eval_batch_size)

    # Switching to eval mode does not depend on the batch, so do it once
    # instead of on every iteration.
    model.eval()

    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(cfg.device) for t in batch)

        with torch.no_grad():
            # token_type_ids (batch[2]) was commented out of the original
            # call and stays omitted here.
            outputs = model(input_ids=batch[0], attention_mask=batch[1])

        # Copy the logits to host memory as plain Python lists once per
        # batch, so the post-processing step works on floats rather than on
        # individual device tensors.
        batch_start_logits = outputs.start_logits.detach().cpu().tolist()
        batch_end_logits = outputs.end_logits.detach().cpu().tolist()
        feature_indices = batch[3].tolist()

        for i, feature_index in enumerate(feature_indices):
            unique_id = int(features[feature_index].unique_id)
            all_results.append(
                SquadResult(unique_id, batch_start_logits[i], batch_end_logits[i])
            )

    # NOTE(review): the recorded run of this notebook failed on this call with
    # "TypeError: compute_predictions_logits() takes 11 positional arguments
    # but 13 were given". The 13 positional arguments below follow the
    # signature of transformers' compute_predictions_logits, while the
    # function in scope comes from the project-local `squad_metrics` module,
    # whose signature is not visible from this notebook. The argument list is
    # left unchanged; align it with that module before relying on the result.
    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        cfg.n_best_size,
        cfg.max_answer_length,
        True,   # presumably do_lower_case -- confirm against squad_metrics
        None,   # presumably output_prediction_file
        None,   # presumably output_nbest_file
        None,   # presumably output_null_log_odds_file
        cfg.verbose_logging,
        False,  # presumably version_2_with_negative
        cfg.null_score_diff_threshold,
        tokenizer,
    )
    return squad_evaluate(examples, predictions)

In [24]:
# Score the loaded reader on the evaluation set prepared in the cells above.
results = evaluate(model=model, tokenizer=tokenizer)

Evaluating:   0%|          | 0/580 [00:00<?, ?it/s]

***** Running evaluation *****
  Num examples =  4639
  Batch size =  8


Evaluating: 100%|██████████| 580/580 [00:20<00:00, 28.73it/s]


TypeError: compute_predictions_logits() takes 11 positional arguments but 13 were given

In [None]:
# Print every evaluation metric on its own line, prefixed with "eval_".
line_template = "eval_{}: {}"
for key, value in results.items():
    print(line_template.format(key, value))