In [1]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import numpy as np
import pandas as pd
import argparse
import json
import os
from collections import OrderedDict
import torch
import csv
import util
from transformers import DistilBertTokenizerFast, AutoTokenizer
from transformers import DistilBertForQuestionAnswering, AutoModelForQuestionAnswering
from transformers import AdamW
from tensorboardX import SummaryWriter

from torch.utils.data import DataLoader
from torch.utils.data.sampler import RandomSampler, SequentialSampler
from args import get_train_test_args
from train import prepare_eval_data, prepare_train_data
from util import compute_f1, compute_em

from tqdm import tqdm

  from IPython.core.display import display, HTML


### 함수 정의

In [2]:
def read_and_process(tokenizer, dataset_dict, dir_name, dataset_name, split, recompute=True):
    """Tokenize ``dataset_dict`` for ``split`` and cache the encodings on disk.

    Args:
        tokenizer: Tokenizer forwarded to ``prepare_train_data`` /
            ``prepare_eval_data``.
        dataset_dict: Raw examples to encode.
        dir_name: Directory in which the cache file lives.
        dataset_name: Used to build the cache file name
            ``{dataset_name}_encodings.pt``.
        split: ``'train'`` selects ``prepare_train_data``; any other value
            selects ``prepare_eval_data``.
        recompute: When true (the default, which matches the previous
            hard-coded behavior) any existing cache file is ignored and the
            data is tokenized again. Pass ``False`` to reuse an existing
            cache file.

    Returns:
        The tokenized examples, either freshly computed or loaded from cache.
    """
    cache_path = f'{dir_name}/{dataset_name}_encodings.pt'
    # NOTE(review): the cache file name does not include `split`, so a cache
    # written for one split would be reused for another split that shares the
    # same directory and dataset names when recompute=False.
    if not recompute and os.path.exists(cache_path):
        # NOTE(review): util.load_pickle presumably unpickles the file -- only
        # point this at cache files you created yourself.
        return util.load_pickle(cache_path)

    if split == 'train':
        tokenized_examples = prepare_train_data(dataset_dict, tokenizer)
    else:
        tokenized_examples = prepare_eval_data(dataset_dict, tokenizer)
    # Always refresh the cache after (re)computing the encodings.
    util.save_pickle(tokenized_examples, cache_path)
    return tokenized_examples

def get_dataset(datasets, data_dir, tokenizer, split_name):
    """Load, merge and tokenize several datasets into one ``util.QADataset``.

    Args:
        datasets: Comma-separated dataset names; each one is read from
            ``{data_dir}/{name}`` with ``util.read_squad``.
        data_dir: Directory containing the dataset files (also handed to
            ``read_and_process`` as the cache directory).
        tokenizer: Tokenizer forwarded to ``read_and_process``.
        split_name: Split identifier forwarded to ``read_and_process``;
            ``'train'`` additionally builds the dataset with ``train=True``.

    Returns:
        A ``(util.QADataset, merged_dict)`` tuple, where ``merged_dict`` is
        the result of folding every dataset together with ``util.merge``.
    """
    merged_dict = None
    name_parts = []
    for single_name in datasets.split(','):
        # Each dataset contributes "_<name>" to the combined cache name.
        name_parts.append(f'_{single_name}')
        current_dict = util.read_squad(f'{data_dir}/{single_name}')
        merged_dict = util.merge(merged_dict, current_dict)

    combined_name = ''.join(name_parts)
    encodings = read_and_process(tokenizer, merged_dict, data_dir, combined_name, split_name)
    is_train = split_name == 'train'
    return util.QADataset(encodings, train=is_train), merged_dict

def evaluate(model, data_loader, data_dict, return_preds=False, split='validation'):
    """Run ``model`` over ``data_loader`` and score the resulting predictions.

    Args:
        model: Called as ``model(input_ids, attention_mask=...)``; its output
            must expose ``start_logits`` and ``end_logits``. The model is not
            moved to a device here, only the input tensors are.
        data_loader: Yields batches containing ``'input_ids'`` and
            ``'attention_mask'`` tensors. ``data_loader.dataset`` must support
            ``len()`` and expose ``encodings``.
        data_dict: Raw examples, forwarded to
            ``util.postprocess_qa_predictions`` and ``util.eval_dicts``.
        return_preds: When true, return ``(preds, results)`` instead of only
            ``results``.
        split: ``'validation'`` computes F1/EM with ``util.eval_dicts``; any
            other value reports ``-1.0`` placeholders for both metrics.

    Returns:
        An ``OrderedDict`` with keys ``'F1'`` and ``'EM'``, or the tuple
        ``(preds, results)`` when ``return_preds`` is true.
    """
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    model.eval()
    all_start_logits = []
    all_end_logits = []
    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for batch in data_loader:
            # Setup for forward
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            batch_size = len(input_ids)
            # Forward
            outputs = model(input_ids, attention_mask=attention_mask)
            start_logits, end_logits = outputs.start_logits, outputs.end_logits
            # TODO: compute loss

            all_start_logits.append(start_logits)
            all_end_logits.append(end_logits)
            progress_bar.update(batch_size)

    # Get F1 and EM scores
    start_logits = torch.cat(all_start_logits).cpu().numpy()
    end_logits = torch.cat(all_end_logits).cpu().numpy()
    # Post-process exactly once: the previous code issued this same call twice
    # in a row and threw the first result away.
    preds = util.postprocess_qa_predictions(data_dict,
                                             data_loader.dataset.encodings,
                                             (start_logits, end_logits))
    if split == 'validation':
        results = util.eval_dicts(data_dict, preds)
        results_list = [('F1', results['F1']),
                        ('EM', results['EM'])]
    else:
        # Metrics are not computed for other splits; report sentinel values.
        results_list = [('F1', -1.0),
                        ('EM', -1.0)]
    results = OrderedDict(results_list)
    if return_preds:
        return preds, results
    return results

In [3]:
# --- Data locations -------------------------------------------------------
# Each *_datasets value is a comma-separated list of dataset names.
val_dir, val_datasets = "datasets/indomain_val", "squad,nat_questions,newsqa"
eval_dir, eval_datasets = "datasets/oodomain_val", "race,relation_extraction,duorc"

# --- Output / checkpoint locations ----------------------------------------
save_dir = "save/"                      # logs and the submission CSV go here
model_dir = "save/"                     # directory containing the `checkpoint` folder
sub_file = "mtl_submission_val.csv"     # base name of the submission CSV

# --- Evaluation settings --------------------------------------------------
batch_size = 16

In [4]:
# Only the tokenizer is taken from the hub model. The model weights come from
# the local checkpoint below, so the hub model itself is no longer loaded
# (it used to be loaded here and then immediately overwritten).
# NOTE(review): presumably this tokenizer matches the checkpoint's base model -- confirm.
tokenizer = AutoTokenizer.from_pretrained("deepset/tinybert-6l-768d-squad2")

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Any directory whose path contains "test" is treated as the test split.
split_name = 'test' if 'test' in eval_dir else 'validation'

log = util.get_logger(save_dir, f'log_{split_name}')

# Load the fine-tuned weights from <model_dir>/checkpoint.
checkpoint_path = os.path.join(model_dir, 'checkpoint')
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint_path)
model.to(device)

eval_dataset, eval_dict = get_dataset(eval_datasets, eval_dir, tokenizer, split_name)

# Sequential sampling so examples are visited in dataset order.
eval_loader = DataLoader(eval_dataset, batch_size=batch_size, sampler=SequentialSampler(eval_dataset))

eval_preds, eval_scores = evaluate(model, eval_loader, eval_dict, return_preds=True, split=split_name)

100%|█████████████████████████████████████████████████████████████████████████████| 721/721 [00:00<00:00, 27724.13it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 721/721 [00:03<00:00, 188.43it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 382/382 [00:00<00:00, 2364.96it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 382/382 [00:00<00:00, 2343.05it/s]


In [5]:
# Lookup tables from example id to its question, context and answer record.
ques_dict = dict(zip(eval_dict['id'], eval_dict['question']))
cont_dict = dict(zip(eval_dict['id'], eval_dict['context']))
answ_dict = dict(zip(eval_dict['id'], eval_dict['answer']))

# Score every prediction in sorted-id order. Only the first entry of each
# answer record's 'text' list is used as the reference.
uuids = sorted(eval_preds)
eval_scores_2 = {}
for uuid in uuids:
    label = answ_dict[uuid]['text'][0]
    pred = eval_preds[uuid]
    # Stored as [f1, em] per example id.
    eval_scores_2[uuid] = [compute_f1(label, pred), compute_em(label, pred)]

# Parallel per-example metric lists, aligned with `uuids`.
f1s = [scores[0] for scores in eval_scores_2.values()]
ems = [scores[1] for scores in eval_scores_2.values()]

In [6]:
# Log the aggregate metrics on a single line.
results_str = ', '.join(
    f'{k}: {v:05.2f}'
    for k, v in eval_scores.items()
)
log.info(f'Eval {results_str}')

# Write submission file: one row per example id, in sorted-id order, with the
# question, context, first reference answer, prediction and per-example F1/EM.
sub_path = os.path.join(save_dir, f'{split_name}_{sub_file}')
log.info(f'Writing submission file to {sub_path}...')
with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
    csv_writer = csv.writer(csv_fh, delimiter=',')
    csv_writer.writerow(['Id', 'question', 'context', 'answer', 'Predicted', 'F1', 'EM'])
    for uuid in sorted(eval_preds):
        example_scores = eval_scores_2[uuid]
        row = [
            uuid,
            ques_dict[uuid],
            cont_dict[uuid],
            answ_dict[uuid]['text'][0],
            eval_preds[uuid],
            example_scores[0],
            example_scores[1],
        ]
        csv_writer.writerow(row)

[11.24.22 16:00:55] Eval F1: 50.66, EM: 35.34
[11.24.22 16:00:55] Writing submission file to save/02.finetune/tinybert-only_classifier/baseline-01/validation_mtl_submission_val.csv...


In [7]:
# Display the aggregate F1/EM scores returned by evaluate() as the cell output.
eval_scores

OrderedDict([('F1', 50.661239008093595), ('EM', 35.340314136125656)])