# BERT dev

작성일자: 210116 \
작성자: 조진욱\
목표: HF 기반 BERT 가 어떻게 돌아가는지 알아보자\
비고: 
1. 학습이 안되어있는 linear layer 을 사용하므로 성능은 안좋을것임
2. 원래 util 에 있는 함수들을 눈에 보이도록 코드상에 두었음. 앞으론 util 에서 불러올 예정


레퍼런스 코드
https://github.com/huggingface/transformers/blob/master/examples/legacy/question-answering/run_squad.py

가급적이면 수정사항을 적어두려고함(TO DO)

In [1]:
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm, trange
import os

In [2]:
import transformers
from transformers import (
    BertForQuestionAnswering,
    BertTokenizer,
)
from transformers.data.metrics.squad_metrics import (
    compute_predictions_logits,
    squad_evaluate,
)

from transformers.data.processors.squad import SquadResult, SquadProcessor, squad_convert_examples_to_features

In [3]:
import config as cfg

In [4]:
class SquadV1Processor(SquadProcessor):
    train_file = "train.json"
    dev_file = "dev.json"

In [5]:
def load_and_cache_examples(args, tokenizer, mode_or_filename, output_examples=False):
    """
    Changes
        1. no distributed training(removed for simplicity)
        2. no caching(cache make preprocessing time shorter, but removed for simplicity)
    """
    input_dir = args.squad_dir if args.squad_dir else args.data_dir

    print("Creating features from dataset file at %s", input_dir)

    if mode_or_filename == "train" or mode_or_filename == "dev" or mode_or_filename == "test":
        mode = mode_or_filename
        processor = SquadV1Processor()
        if mode == 'test':
            examples = processor.get_dev_examples(args.squad_dir, filename=processor.test_file)
        elif mode == 'dev':
            examples = processor.get_dev_examples(args.squad_dir, filename=processor.dev_file)
        else:
            examples = processor.get_train_examples(args.squad_dir, filename=processor.train_file)
    else:
        # odqa 에 사용되는 데이터들을 처리 하기 위한 용도. 여기선 필요없으므로 삭제
        pass

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=True if mode == 'train' else False,
        return_dataset='pt',
        threads=args.threads,
    )

#     torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)

    if output_examples:
        return dataset, examples, features
    return dataset

In [6]:
mode = 'dev'

In [7]:
model = BertForQuestionAnswering.from_pretrained(cfg.model_name)
tokenizer = BertTokenizer.from_pretrained(cfg.tokenizer_name)

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-large-cased and

In [9]:
model = model.to(cfg.device)

In [10]:
dataset, examples, features = load_and_cache_examples(cfg, tokenizer, mode='dev', output_examples=True)

  0%|          | 0/20 [00:00<?, ?it/s]

Creating features from dataset file at %s ./data


100%|██████████| 20/20 [00:01<00:00, 17.34it/s]
convert squad examples to features: 100%|██████████| 4639/4639 [00:06<00:00, 696.98it/s]
add example index and unique id: 100%|██████████| 4639/4639 [00:00<00:00, 1341425.46it/s]


In [11]:
eval_sampler = SequentialSampler(dataset)
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=cfg.eval_batch_size)

In [12]:
all_results = []

In [13]:
for batch in tqdm(eval_dataloader, desc="Evaluating"):
    model.eval()
    batch = tuple(t.to(cfg.device) for t in batch)

    with torch.no_grad():
        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "token_type_ids": batch[2],
        }

        feature_indices = batch[3]
#         print(feature_indices)
        outputs = model(**inputs)

    for i, feature_index in enumerate(feature_indices):
        eval_feature = features[feature_index.item()]
        unique_id = int(eval_feature.unique_id)

        start_logits = outputs.start_logits[i]
        end_logits = outputs.end_logits[i]
        result = SquadResult(unique_id, start_logits, end_logits)

        all_results.append(result)

Evaluating: 100%|██████████| 582/582 [00:59<00:00,  9.71it/s]


In [14]:
def print_instance_attributes(obj):
    for attribute, value in obj.__dict__.items():
        print(attribute, '=', value)

In [15]:
print_instance_attributes(outputs)

loss = None
start_logits = tensor([[-0.1530,  0.3589,  0.0857,  ...,  0.3976,  0.5834,  0.5727],
        [-0.1354,  0.4048,  0.2308,  ...,  0.4453,  0.6002,  0.5315],
        [-0.0866,  0.2840,  0.4452,  ...,  0.6871,  0.5968, -0.0071],
        [-0.0618,  0.2369, -0.9219,  ...,  0.6738,  0.8392,  0.9667],
        [-0.1048,  0.2697,  0.0457,  ...,  0.0706,  0.4150,  0.3221],
        [-0.1097,  0.1649, -0.9500,  ...,  0.8741,  0.2284,  0.4844]],
       device='cuda:0')
end_logits = tensor([[ 0.1823,  0.3414,  0.1204,  ..., -0.0666, -0.2827, -0.1326],
        [ 0.1997,  0.3687,  0.0925,  ..., -0.0860, -0.2994, -0.0506],
        [ 0.2192,  0.2996, -0.4997,  ..., -0.2378, -0.3344, -0.7197],
        [ 0.2411,  0.3736,  0.0279,  ..., -0.4288, -0.2715, -0.3000],
        [ 0.1903,  0.3309,  0.4197,  ..., -0.4447, -0.4841, -0.4865],
        [ 0.2125,  0.3412, -0.5083,  ...,  0.3277, -0.1746,  0.0188]],
       device='cuda:0')
hidden_states = None
attentions = None


In [16]:
outputs.start_logits.shape

torch.Size([6, 384])

In [18]:
predictions = compute_predictions_logits(
    examples,
    features,
    all_results,
    cfg.n_best_size,
    cfg.max_answer_length,
    True,
    None,
    None,
    None,
    cfg.verbose_logging,
    False,
    cfg.null_score_diff_threshold,
    tokenizer,
)

In [19]:
# Compute the F1 and exact scores.
results = squad_evaluate(examples, predictions)
print("Results: {}".format(results))

Results: OrderedDict([('exact', 0.04311273981461522), ('f1', 4.963176493011408), ('total', 4639), ('HasAns_exact', 0.04311273981461522), ('HasAns_f1', 4.963176493011408), ('HasAns_total', 4639), ('best_exact', 0.04311273981461522), ('best_exact_thresh', 0.0), ('best_f1', 4.963176493011408), ('best_f1_thresh', 0.0)])
