In [1]:
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm, trange
import os

In [2]:
import transformers
from transformers import (
    BertForQuestionAnswering,
    BertTokenizer,
)
from transformers.data.metrics.squad_metrics import (
    compute_predictions_logits,
    squad_evaluate,
)

from transformers.data.processors.squad import SquadResult, SquadProcessor, squad_convert_examples_to_features

In [3]:
import config as cfg

In [4]:
class SquadV1Processor(SquadProcessor):
    """SquadProcessor subclass that pins the data file names used by this notebook."""

    train_file = "train.json"
    dev_file = "dev.json"
    # load_and_cache_examples reads `processor.test_file` when mode == 'test';
    # without this attribute that branch fails with AttributeError.
    # NOTE(review): file name assumed from the train/dev naming pattern - confirm
    # against the actual contents of the data directory.
    test_file = "test.json"

In [5]:
def load_and_cache_examples(args, tokenizer, mode, evaluate=False, output_examples=False):
    """
    Read one split of the dataset and convert it into model features.

    Reference: https://github.com/huggingface/transformers/blob/master/examples/legacy/question-answering/run_squad.py

    Changes
        1. no distributed training (removed for simplicity)
        2. no caching (caching shortens preprocessing time, but was removed for simplicity)

    Args:
        args: settings object; this function reads ``data_dir``, ``max_seq_length``,
            ``doc_stride``, ``max_query_length`` and ``threads`` from it.
        tokenizer: forwarded unchanged to ``squad_convert_examples_to_features``.
        mode: ``'train'``, ``'dev'`` or ``'test'``; selects which file is read and
            whether features are built for training (only for ``'train'``).
        evaluate: unused; kept so existing call sites keep working.
        output_examples: when true, the examples and features are returned as well.

    Returns:
        ``dataset`` alone, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.

    Raises:
        ValueError: if ``mode`` is not one of the supported values, or if
            ``mode == 'test'`` and the processor defines no ``test_file``.
    """
    # Reject unknown modes up front: previously any other string silently loaded
    # the training file while building features with is_training=False.
    if mode not in ('train', 'dev', 'test'):
        raise ValueError("Unknown mode %r: expected 'train', 'dev' or 'test'" % (mode,))

    input_dir = args.data_dir if args.data_dir else "."

    # print() does not interpolate %-placeholders on its own, so format explicitly.
    print("Creating features from dataset file at %s" % (input_dir,))

    processor = SquadV1Processor()
    if mode == 'test':
        # The processor may not declare a test file; fail with a clear message
        # instead of an AttributeError.
        test_file = getattr(processor, "test_file", None)
        if not test_file:
            raise ValueError("mode='test' requires the processor to define a non-empty `test_file`")
        # The test split is read with the dev reader, as in the original code.
        examples = processor.get_dev_examples(args.data_dir, filename=test_file)
    elif mode == 'dev':
        examples = processor.get_dev_examples(args.data_dir, filename=processor.dev_file)
    else:
        examples = processor.get_train_examples(args.data_dir, filename=processor.train_file)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=(mode == 'train'),
        return_dataset='pt',
        threads=args.threads,
    )

    if output_examples:
        return dataset, examples, features
    return dataset

In [6]:
# Name of the dataset split this notebook works on; it is passed to
# load_and_cache_examples and embedded in the prediction file names further down.
mode = 'train'

In [7]:
# BertForQuestionAnswering is BERT plus an additional final layer on top.
# That additional layer has not been trained, so loading prints messages such as
# "Some weights of the model checkpoint at bert-large-cased were not used" (see the log below).
# If this is later given as an assignment, it may be worth asking students to customize this part themselves.
model = BertForQuestionAnswering.from_pretrained(cfg.model_name)
tokenizer = BertTokenizer.from_pretrained(cfg.tokenizer_name)

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-large-cased and

In [8]:
# Move the model to the device configured in the config module (cfg.device).
model = model.to(cfg.device)

In [None]:
# Build the dataset for the split named by `mode`; with output_examples=False only
# the dataset is returned (no examples/features).
train_dataset = load_and_cache_examples(cfg, tokenizer, mode=mode, output_examples=False)

  0%|          | 0/200 [00:00<?, ?it/s]

Creating features from dataset file at %s ./data


100%|██████████| 200/200 [00:09<00:00, 21.59it/s]
convert squad examples to features:  35%|███▍      | 13377/38708 [00:16<00:30, 819.10it/s]

In [None]:
# Training setup: shuffled data loader, optimizer with per-group weight decay,
# linear warmup/decay schedule, and the counters used by the training loop.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=cfg.train_batch_size)

# Total optimizer updates: batches per epoch divided by the accumulation factor,
# times the number of epochs.
t_total = len(train_dataloader) // cfg.gradient_accumulation_steps * cfg.num_train_epochs

# Prepare optimizer and schedule (linear warmup and decay).
# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": cfg.weight_decay,
    },
    {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
# AdamW, get_linear_schedule_with_warmup and set_seed were referenced without being
# imported (NameError). Use the modules this notebook already imports instead.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=cfg.learning_rate, eps=cfg.adam_epsilon)
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=cfg.warmup_steps, num_training_steps=t_total
)

# Train!
# print() does not interpolate %-placeholders, so each message is formatted explicitly.
print("***** Running training *****")
print("  Num examples = %d" % len(train_dataset))
print("  Num Epochs = %d" % cfg.num_train_epochs)
print(
    "  Total train batch size = %d"
    % (cfg.train_batch_size * cfg.gradient_accumulation_steps)
)
print("  Gradient Accumulation steps = %d" % cfg.gradient_accumulation_steps)
print("  Total optimization steps = %d" % t_total)

global_step = 1
tr_loss = 0.0
# Best dev scores seen so far and the epoch that produced them (-1 = none yet).
best_metrics = {'f1': 0, 'exact': 0, 'epoch': -1}
model.zero_grad()
# Added here for reproducibility
transformers.set_seed(cfg.seed)

In [None]:
# Fine-tuning loop: one pass over train_dataloader per epoch with gradient
# accumulation, followed by a dev evaluation; the model is saved whenever the
# dev F1 improves.
# NOTE(review): `evaluate` is not defined in the cells shown here - it must be
# defined before this cell is run.
for now_epoch in trange(int(cfg.num_train_epochs), desc="Epoch"):
    # Set training mode once per epoch (it was previously set on every step);
    # presumably evaluate() at the end of the previous epoch switches to eval mode.
    model.train()
    epoch_iterator = tqdm(train_dataloader, desc="Iteration")
    for step, batch in enumerate(epoch_iterator):
        batch = tuple(t.to(cfg.device) for t in batch)

        # Only tensors the model's forward accepts are passed. The former
        # "device" entry was forwarded as a keyword argument to model(**inputs),
        # which is not a forward() parameter and raised TypeError.
        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "token_type_ids": batch[2],
            "start_positions": batch[3],
            "end_positions": batch[4],
        }

        outputs = model(**inputs)
        loss = outputs[0]

        # Scale the loss so accumulated gradients average over the micro-batches.
        if cfg.gradient_accumulation_steps > 1:
            loss = loss / cfg.gradient_accumulation_steps

        loss.backward()

        tr_loss += loss.item()
        # Apply an optimizer update only every `gradient_accumulation_steps` batches.
        if (step + 1) % cfg.gradient_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)

            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1

    results = evaluate(cfg, model, tokenizer, 'dev')

    # Keep the checkpoint with the best dev F1 seen so far.
    if best_metrics['f1'] < results['f1']:
        best_metrics['f1'] = results['f1']
        best_metrics['exact'] = results['exact']
        best_metrics['epoch'] = now_epoch
        model.save_pretrained(cfg.output_dir)

    for key, value in results.items():
        print("dev eval_{}: {}".format(key, value))

    for key, value in best_metrics.items():
        print("dev best eval_{}: {}".format(key, value))

In [None]:
# Accumulator for the per-feature SquadResult objects appended by the evaluation loop below.
all_results = []

In [None]:
# Run the model over eval_dataloader and collect one SquadResult per feature.
# NOTE(review): `eval_dataloader` and `features` are not defined in the cells
# shown here - they must exist before this cell is run.

# Start from an empty list so re-running this cell does not append duplicates.
all_results = []

# Evaluation mode is loop-invariant, so set it once.
model.eval()

for batch in tqdm(eval_dataloader, desc="Evaluating"):
    batch = tuple(t.to(cfg.device) for t in batch)

    with torch.no_grad():
        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "token_type_ids": batch[2],
        }

        # The fourth tensor of each batch holds indices into `features`.
        feature_indices = batch[3]
        outputs = model(**inputs)

    # Copy the logits off the device once per batch and store plain Python lists,
    # so the results do not keep device tensors alive and later post-processing
    # works on ordinary floats.
    batch_start_logits = outputs.start_logits.detach().cpu().tolist()
    batch_end_logits = outputs.end_logits.detach().cpu().tolist()

    for i, feature_index in enumerate(feature_indices):
        eval_feature = features[feature_index.item()]
        unique_id = int(eval_feature.unique_id)

        start_logits = batch_start_logits[i]
        end_logits = batch_end_logits[i]
        result = SquadResult(unique_id, start_logits, end_logits)

        all_results.append(result)

In [None]:
def print_instance_attributes(obj):
    """Print each entry of ``obj.__dict__`` on its own line as ``name = value``."""
    for pair in obj.__dict__.items():
        # Unpack (name, value) and let print join them with ' = '.
        print(*pair, sep=' = ')

In [None]:
# Debug aid: list the attributes stored on `outputs`, the model output left over
# from the last batch of the evaluation loop.
print_instance_attributes(outputs)

In [None]:
# Shape of the start logits for the last evaluated batch
# (presumably (batch_size, sequence_length) - TODO confirm).
outputs.start_logits.shape

In [None]:
# Turn the collected logits into text predictions.
# NOTE(review): `examples` and `features` are not defined in the cells shown here -
# they must exist before this cell is run.

# Per-mode output file names. They are defined but not passed to the call below
# (None is passed for all three output paths).
output_prediction_file = "predictions_{}.json".format(mode)
output_nbest_file = "nbest_predictions_{}.json".format(mode)
output_null_log_odds_file = "null_log_odds_predictions_{}.json".format(mode)
predictions = compute_predictions_logits(
    examples,
    features,
    all_results,
    cfg.n_best_size,
    cfg.max_answer_length,
    # do_lower_case: follow the tokenizer's own casing instead of hard-coding True;
    # the load log above shows a cased checkpoint (bert-large-cased). Falls back to
    # the previous value (True) if the tokenizer does not expose the attribute.
    getattr(tokenizer, "do_lower_case", True),
    None,  # output_prediction_file
    None,  # output_nbest_file
    None,  # output_null_log_odds_file
    cfg.verbose_logging,
    False,  # version_2_with_negative
    cfg.null_score_diff_threshold,
    tokenizer,
)

In [None]:
# Compute the F1 and exact scores by comparing `predictions` (from the previous
# cell) with `examples`, then print the resulting metrics.
# NOTE(review): `examples` is not defined in the cells shown here.
results = squad_evaluate(examples, predictions)
print("Results: {}".format(results))