# TFIDF retrieval + bert(fine-tuned)

작성일자: 210118\
작성자: 조진욱\
목표: retrieval 모델과 우리가 학습시킨 bert 모델을 가지고 open domain qa 형식으로 만들어보자\
순서: 
1. 
query 가 나열되어있는 json 형태 파일 'dev_qa.json' 파일이 들어오면
retrieval 모델이 query에 맞는 context 하나를 찾아 (c, q, a) pair 를 만들어줌. 
이를 json 파일(dev_cqa.json, 아래 코드에서는 `retrieved_{mode}_sparse.json`)로 저장함.
2. 
bert 모델은 dev_cqa.json 을 불러와 각 query(question)에 대한 답을 예측함.
그 뒤 squad_evaluate 함수를 통해 점수 확인 


비고:
1. load 함수에서 json 형태로 불러오도록 되어있어 필요없지만 retrieval 의 결과를 json 으로 저장하는 과정을 거침. 
2. 1-2 에서 학습한 bert 모델을 그대로 가져다씀 from_pretrained(cfg.output_dir)
3. 그러나 1-2 best metric 보다 성능이 안나와야 정상. 왜냐하면 retrieval 과정에서 잘못된 context 들이 선택되었을 것이기 때문.

## Step1 retrieval 모델 불러와서 각 query 에 대한 document 찾기 

In [12]:
import config as cfg
from retrieval import SparseRetrieval
from utils import save_json
import os

In [13]:
# Dataset split to evaluate; interpolated into the retrieval result file names.
mode = 'dev'

# NOTE: the retrieval step below is commented out in this notebook.
# context_file = f'{mode}_context.json'
# qa_file =  f'{mode}_qa.json'
# ret = SparseRetrieval(mode, data_path=cfg.squad_dir)

# # Pre-compute embeddings for the articles (contexts) in the knowledge base
# tfidfv, context_embeddings = ret.make_embedding(context_file)

# cqa_df = ret.retrieve(tfidfv, context_embeddings, qa_file)


In [14]:
# res_path = os.path.join(cfg.sparse_dir, f"retrieved_{mode}_sparse.csv")
# cqa_df.to_csv(res_path, sep='\t', index=False)
# import pandas as pd
# cqa_df = pd.read_csv(res_path, sep='\t', index_col=False)
# cqa_df.head()

Unnamed: 0,question,que_id,answers,context,context_id
0,Who is the creator of American Idol?,56d21eb1e7d4791d00902667,"{'answer_start': 67, 'text': 'Simon Fuller'}",['American Idol is broadcast to over 100 natio...,['context124']
1,What company produces American idol?,56d21eb1e7d4791d00902668,"{'answer_start': 96, 'text': '19 Entertainment'}",['American Idol is broadcast to over 100 natio...,['context124']
2,What year did American Idol begin airing?,56d21eb1e7d4791d00902669,"{'answer_start': 201, 'text': '2002'}",['American Idol is broadcast to over 100 natio...,['context124']
3,What British show is American Idols format bas...,56d21eb1e7d4791d0090266a,"{'answer_start': 270, 'text': 'Pop Idol'}",['American Idol is an American singing competi...,['context1']
4,What television network does American Idol air...,56d21eb1e7d4791d0090266b,"{'answer_start': 185, 'text': 'Fox'}","['In Latin America, the show is broadcast and ...",['context125']


In [15]:
# Name of the JSON file with the retrieval results; it is passed to
# load_and_cache_examples (as mode_or_filename) in Step 2.
result_filename = f"retrieved_{mode}_sparse.json"
# The save step is disabled here — presumably the file already exists from an
# earlier run; verify before executing Step 2.
# save_json(cqa_df, result_filename)

## Step2 bert 모델이 만들어진 json 파일을 다시 불러와 MRC 진행

torch Dataset 형식으로 변환한 뒤, MRC 방식으로 각 인스턴스의 start/end position 을 예측하고 EM, F1 score 로 평가를 진행함.

In [16]:
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm, trange

In [17]:
import transformers
from transformers import (
    BertForQuestionAnswering,
    BertTokenizer,
)
# NOTE(review): both candidate import headers were commented out, which left
# the name list below dangling (a syntax error). The positional call to
# compute_predictions_logits in `evaluate` passes 11 arguments, which does not
# fit the 13-parameter upstream transformers signature, so the project-local
# `squad_metrics` module is presumably the intended source — confirm.
# from transformers.data.metrics.squad_metrics import (
from squad_metrics import (
    compute_predictions_logits,
    squad_evaluate,
)

from transformers.data.processors.squad import SquadResult, SquadProcessor, squad_convert_examples_to_features

In [18]:
import config as cfg
from utils import load_and_cache_examples

In [19]:
# BERT with an additional final classification (QA) layer on top.
# The checkpoint is already fine-tuned, so a message such as
# "Some weights of the model checkpoint at bert-large-cased were not used"
# should NOT appear when loading it.
model = BertForQuestionAnswering.from_pretrained(cfg.output_dir)
tokenizer = BertTokenizer.from_pretrained(cfg.tokenizer_name)
# Move the model to the configured device before inference.
model = model.to(cfg.device)

In [20]:
# Load the retrieval result file and unpack the dataset together with the
# examples and features; evaluate() reads all three as notebook-level globals.
dataset, examples, features = load_and_cache_examples(cfg, tokenizer, mode_or_filename=result_filename, output_examples=True)
# Sequential sampling: iterate the dataset in order, without shuffling.
eval_sampler = SequentialSampler(dataset)
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=cfg.eval_batch_size)

  0%|          | 0/1 [00:00<?, ?it/s]

Creating features from dataset file at %s ./data/squad


100%|██████████| 1/1 [00:01<00:00,  1.22s/it]
convert squad examples to features: 100%|██████████| 4639/4639 [00:08<00:00, 565.17it/s]
add example index and unique id: 100%|██████████| 4639/4639 [00:00<00:00, 1267086.24it/s]


In [21]:
def evaluate(model, tokenizer):
    """Run the QA model over the evaluation set and score its predictions.

    Relies on the notebook-level globals ``dataset``, ``eval_dataloader``,
    ``features``, ``examples`` and ``cfg`` created in earlier cells.

    Args:
        model: Question-answering model whose output exposes
            ``start_logits`` and ``end_logits``.
        tokenizer: Tokenizer forwarded to ``compute_predictions_logits``.

    Returns:
        The value returned by ``squad_evaluate(examples, predictions)``.
    """
    print("***** Running evaluation *****")
    print("  Num examples = ", len(dataset))
    print("  Batch size = ", cfg.eval_batch_size)

    # Switching to inference mode is loop-invariant: do it once, not per batch.
    model.eval()

    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(cfg.device) for t in batch)

        with torch.no_grad():
            outputs = model(
                input_ids=batch[0],
                attention_mask=batch[1],
                token_type_ids=batch[2],
            )

        # Convert to plain Python values once per batch so the collected
        # results do not keep device tensors alive for the whole evaluation
        # and the post-processing works on ordinary floats.
        feature_indices = batch[3].tolist()
        batch_start_logits = outputs.start_logits.detach().cpu().tolist()
        batch_end_logits = outputs.end_logits.detach().cpu().tolist()

        for feature_index, start_logits, end_logits in zip(
            feature_indices, batch_start_logits, batch_end_logits
        ):
            unique_id = int(features[feature_index].unique_id)
            all_results.append(SquadResult(unique_id, start_logits, end_logits))

    # NOTE(review): this positional argument list (11 values) omits the
    # output_null_log_odds_file / version_2_with_negative slots of the
    # upstream transformers signature; presumably it targets the local
    # squad_metrics variant — confirm before switching the import.
    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        cfg.n_best_size,
        cfg.max_answer_length,
        False,  # do_lower_case
        None,
        None,
        cfg.verbose_logging,
        cfg.null_score_diff_threshold,
        tokenizer,
    )

    return squad_evaluate(examples, predictions)

In [22]:
# Run the evaluation and keep the returned metrics for the reporting cell.
# NOTE(review): the recorded output of this cell ends with
# "TypeError: string indices must be integers". The retrieved-table preview
# shows `answers` as a single dict rather than a list of dicts, which may be
# the cause — confirm against the saved JSON.
results = evaluate(model, tokenizer)

Evaluating:   1%|          | 3/583 [00:00<00:21, 27.15it/s]

***** Running evaluation *****
  Num examples =  4661
  Batch size =  8


Evaluating: 100%|██████████| 583/583 [00:20<00:00, 27.79it/s]


TypeError: string indices must be integers

In [None]:
# Report every metric returned by the evaluation, one per line.
for metric_name in results:
    metric_value = results[metric_name]
    print("eval_{}: {}".format(metric_name, metric_value))