# TFIDF retrieval + mrc(pretrained)

작성일자: 210118\
작성자: 조진욱\
목표: retrieval 모델과 우리가 학습시킨 bert 모델을 가지고 open domain qa 형식으로 만들어보자\
순서: 
1. 
query 가 나열되어있는 json 형태 파일 'dev_qa.json' 파일이 들어오면
retrieval 모델이 query에 맞는 context 하나를 찾아 (c, q, a) pair 를 만들어줌. 
이를 dev_cqa.json 로 저장함.
2. 
bert 모델은 dev_cqa.json 을 불러와 각 question 에 대한 답을 예측함.
그 뒤 squad_evaluate 함수를 통해 점수 확인 


비고:
1. retrieval 결과를 굳이 파일로 저장할 필요는 없지만, load 함수가 json 파일을 읽어오도록 구현되어 있어 retrieval 의 결과를 json 으로 저장하는 과정을 거침. 
2. 1-2 에서 학습한 bert 모델을 그대로 가져다씀 from_pretrained(cfg.output_dir)
3. 그러나 1-2 best metric 보다 성능이 안나와야 정상. 왜냐하면 retrieval 과정에서 잘못된 context 들이 선택되었을 것이기 때문.

In [12]:
import json
import os
import pickle

import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from scipy import spatial
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.notebook import trange, tqdm

from utils import read_file, save_json

In [13]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [16]:
class Retrieval(object):
    """TF-IDF based sparse retriever that pairs questions with contexts.

    Typical usage: call ``make_embedding`` once to fit the vectorizer and
    embed every context, then call ``retrieval`` to look up the closest
    context(s) for each question of a QA file.
    """

    # Stemmer shared by all ``tokenize`` calls; created lazily on first use so
    # that it is not rebuilt for every document/question.
    _stemmer = None

    def __init__(self, mode, data_path=None):
        """Store the dataset split name and the data directory.

        Args:
            mode: split name (e.g. 'dev'); used as a prefix/infix for the
                context file and the embedding cache file names.
            data_path: directory holding the data files. Falls back to
                './data/' when empty or ``None``.
        """
        self.data_path = data_path if data_path else './data/'
        self.mode = mode

    def make_embedding(self, context_name):
        """Fit a TF-IDF vectorizer on the contexts and embed them.

        The dense embedding matrix is cached as a pickle inside
        ``data_path``. The cache is reused only when its shape matches the
        current corpus and vocabulary; otherwise it is recomputed and
        overwritten.

        Args:
            context_name: file name (inside ``data_path``) of the context
                file; it is read with ``read_file`` and must provide a
                'text' entry.

        Returns:
            Tuple ``(tfidfv, context_embeddings)``: the fitted vectorizer and
            the dense TF-IDF matrix of the contexts.
        """
        pickle_name = "sparse" + "_" + self.mode + "_embedding.bin"
        # Join directory and file name properly (the original concatenated
        # them, which only worked when data_path ended with a separator).
        pickle_path = os.path.join(self.data_path, pickle_name)

        context_dict = read_file(os.path.join(self.data_path, context_name))
        context = context_dict['text']

        # The vectorizer itself is always re-fit because it is not cached.
        tfidfv = TfidfVectorizer(tokenizer=self.tokenize, ngram_range=(1, 2)).fit(context)
        expected_shape = (len(context), len(tfidfv.vocabulary_))

        context_embeddings = None
        if os.path.isfile(pickle_path):
            # NOTE: pickle can execute arbitrary code while loading; only
            # load cache files that were produced by this code.
            with open(pickle_path, "rb") as file:
                cached = pickle.load(file)
            if getattr(cached, "shape", None) == expected_shape:
                context_embeddings = cached
                print("Embedding pickle load.")
            else:
                # A cache built from a different corpus/vocabulary would
                # break (or silently corrupt) the distance computation.
                print("Embedding pickle is stale; recomputing.")

        if context_embeddings is None:
            context_embeddings = tfidfv.transform(context).toarray()
            with open(pickle_path, "wb") as file:
                pickle.dump(context_embeddings, file)
            print("Embedding pickle saved.")
        return tfidfv, context_embeddings

    def retrieval(self, model, sparse_embedding, file_name, topk=1):
        """Retrieve the ``topk`` closest contexts for every question.

        Args:
            model: fitted vectorizer exposing ``transform`` (as returned by
                ``make_embedding``).
            sparse_embedding: dense TF-IDF matrix of the contexts.
            file_name: JSON file (inside ``data_path``) whose 'data' entry is
                a list of items with 'question' and 'id' keys.
            topk: number of contexts to keep per question.

        Returns:
            ``pandas.DataFrame`` with the columns 'question', 'q_id',
            'context' and 'context_id'; one row per (question, retrieved
            context) pair, so a question appears ``topk`` times.
        """
        with open(os.path.join(self.data_path, file_name), encoding="utf-8") as file:
            qas = json.load(file)['data']

        context_name = self.mode + "_context.json"
        context_dict = read_file(os.path.join(self.data_path, context_name))
        context = context_dict['text']
        context_id = context_dict['ids']

        rows = {'question': [], 'q_id': [], 'context': [], 'context_id': []}
        for item in tqdm(qas):
            query = item['question']
            query_id = item['id']

            query_s_embedding = model.transform([query]).toarray()
            predict_dict = self.sparse_searching(query_s_embedding,
                                                 sparse_embedding,
                                                 context,
                                                 topk)
            # Repeat the question for each retrieved context so that all
            # columns stay the same length when topk > 1.
            for text, idx in zip(predict_dict['text'], predict_dict['ids']):
                rows['question'].append(query)
                rows['q_id'].append(query_id)
                rows['context'].append(text)
                rows['context_id'].append(context_id[idx])

        return pd.DataFrame(rows)

    def sparse_searching(self, sparse_query, sparse_embedding, texts, topk=1):
        """Return the ``topk`` texts closest to the query by cosine distance.

        Args:
            sparse_query: 2-D array holding the query vector in its first row.
            sparse_embedding: 2-D array with one row per candidate text.
            texts: candidate texts, indexable by row position.
            topk: number of candidates to return.

        Returns:
            Dict with 'text' (the selected texts) and 'ids' (their row
            positions as ints), ordered from closest to farthest.
        """
        distances = spatial.distance.cdist(sparse_query, sparse_embedding, "cosine")[0]
        # A stable argsort keeps the original order for ties and places NaN
        # distances (e.g. all-zero vectors) last instead of letting them end
        # up at an arbitrary rank.
        order = np.argsort(distances, kind="stable")[:topk]
        cand_ids = [int(idx) for idx in order]
        return {'text': [texts[idx] for idx in cand_ids], 'ids': cand_ids}

    def tokenize(self, text):
        """Split ``text`` into word tokens and reduce each to its Porter stem."""
        if self._stemmer is None:
            self._stemmer = PorterStemmer()
        stem = self._stemmer.stem
        return [stem(token) for token in word_tokenize(text)]

In [6]:
# Dataset split to work on; used below to build file names and passed to Retrieval.
mode = 'dev'

In [17]:
# File names of the context and QA files for the chosen split, and the retriever for that split.
context_file = f'{mode}_context.json'
qa_file =  f'{mode}_qa.json'
ret = Retrieval(mode)

In [18]:
# Embed the articles (contexts) of the knowledge base up front.
tfidfv, context_embeddings = ret.make_embedding(context_file)

Embedding pickle load.


In [19]:
# Retrieve a context for every question of the QA file and keep a CSV copy.
# (`qa_file` is the name defined above; `qas_file` was never defined.)
cqa_df = ret.retrieval(tfidfv, context_embeddings, qa_file)
cqa_df.to_csv(f"retrived_{mode}.csv")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=204.0), HTML(value='')))




In [None]:
# Persist the retrieved (context, question) pairs as JSON for the reader model.
# (`cqa_df` is the DataFrame produced by the retrieval step; `cqd_df` was a typo.)
save_json(cqa_df, mode)

In [1]:
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm, trange
import os

In [2]:
import transformers
from transformers import (
    BertForQuestionAnswering,
    BertTokenizer,
)
from transformers.data.metrics.squad_metrics import (
    compute_predictions_logits,
    squad_evaluate,
)

from transformers.data.processors.squad import SquadResult, SquadProcessor, squad_convert_examples_to_features

In [3]:
import config as cfg

In [7]:
# BERT with an additional final `cls` layer on top.
# The model is already fine-tuned, so a warning such as
# "Some weights of the model checkpoint at bert-large-cased were not used"
# must NOT appear when loading it.
# NOTE(review): the output recorded for this cell does show that warning for
# `bert-large-cased` — verify that cfg.output_dir points to the fine-tuned checkpoint.
model = BertForQuestionAnswering.from_pretrained(cfg.output_dir)
tokenizer = BertTokenizer.from_pretrained(cfg.tokenizer_name)
model = model.to(cfg.device)

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-large-cased and

In [13]:
# Load the evaluation dataset together with its examples and features, then
# iterate over it in order (SequentialSampler) in batches of cfg.eval_batch_size.
# NOTE(review): `load_and_cache_examples` is neither defined nor imported in the
# visible part of this notebook — confirm where it comes from.
dataset, examples, features = load_and_cache_examples(cfg, tokenizer, mode=mode, output_examples=True)
eval_sampler = SequentialSampler(dataset)
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=cfg.eval_batch_size)

  5%|▌         | 1/20 [00:00<00:02,  6.71it/s]

Creating features from dataset file at %s ./data


100%|██████████| 20/20 [00:01<00:00, 18.75it/s]
convert squad examples to features: 100%|██████████| 4639/4639 [00:06<00:00, 693.77it/s]
add example index and unique id: 100%|██████████| 4639/4639 [00:00<00:00, 1421180.06it/s]


In [None]:
def evaluate(model, tokenizer):
    """Run the QA model over the evaluation set and score its predictions.

    Relies on the notebook-level globals ``dataset``, ``eval_dataloader``,
    ``features``, ``examples`` and ``cfg``.

    Args:
        model: question-answering model whose output exposes
            ``start_logits`` and ``end_logits``.
        tokenizer: tokenizer forwarded to ``compute_predictions_logits``.

    Returns:
        The result of ``squad_evaluate(examples, predictions)``.
    """
    print("***** Running evaluation *****")
    print("  Num examples = ", len(dataset))
    print("  Batch size = ", cfg.eval_batch_size)

    # Switching to eval mode once is enough; no need to repeat it per batch.
    model.eval()

    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(cfg.device) for t in batch)

        with torch.no_grad():
            outputs = model(
                input_ids=batch[0],
                attention_mask=batch[1],
                token_type_ids=batch[2],
            )

        # Convert to plain Python lists right away: keeping per-example tensor
        # views would hold every batch's logits on the device and make the
        # pure-Python post-processing below operate on tensors.
        feature_indices = batch[3].tolist()
        batch_start_logits = outputs.start_logits.detach().cpu().tolist()
        batch_end_logits = outputs.end_logits.detach().cpu().tolist()

        for feature_index, start_logits, end_logits in zip(
            feature_indices, batch_start_logits, batch_end_logits
        ):
            unique_id = int(features[feature_index].unique_id)
            all_results.append(SquadResult(unique_id, start_logits, end_logits))

    # Use the tokenizer's own casing setting so answer-text alignment matches
    # the model; falls back to True, the value that was hard-coded before.
    do_lower_case = getattr(tokenizer, "do_lower_case", True)

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        cfg.n_best_size,
        cfg.max_answer_length,
        do_lower_case,
        None,   # output_prediction_file: nothing is written to disk
        None,   # output_nbest_file
        None,   # output_null_log_odds_file
        cfg.verbose_logging,
        False,  # version_2_with_negative
        cfg.null_score_diff_threshold,
        tokenizer,
    )
    return squad_evaluate(examples, predictions)
    

In [None]:
results = evaluate(model, tokenizer)

# Print every metric; a plain f-string replaces the invalid mix of an
# f-string with empty `{}` placeholders and `.format`.
for key, value in results.items():
    print(f"{mode} eval_{key}: {value}")