In [1]:
import dataclasses
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Dict, Optional

import numpy as np

In [2]:
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    set_seed )

In [22]:
from utils_superglue_record import (
    SuperGlueDataset, 
    superglue_compute_metrics, 
    superglue_output_modes, 
    superglue_tasks_num_labels,
    processors,
    Split
)

In [4]:
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
    )


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(processors.keys())})
    data_dir: str = field(metadata={"help": "Should contain the data files for the task."})
    max_seq_length: int = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    
    def __post_init__(self):
        self.task_name = self.task_name.lower()        

parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
logger = logging.getLogger(__name__)

MODEL_NAME = "bert-base-cased"
DATESTAMP = "20200811"
SUPER_GLUE_DIR = "/home/keyur/medhas/superglue_data/"
TASK_NAME = "ReCoRD"
PER_DEVICE_BATCH_SIZE = 24
EXPERIMENT_DIR="/mnt/data/medhas/glue_experiments/%s/%s"%(MODEL_NAME, DATESTAMP)

custom_sysargv = [
"--model_name_or_path=%s"%MODEL_NAME,
"--task_name=%s"%TASK_NAME,
"--do_train",
"--do_eval",
"--data_dir=%s"%os.path.join(SUPER_GLUE_DIR, TASK_NAME),
"--max_seq_length=512",
"--per_device_train_batch_size=%s"%PER_DEVICE_BATCH_SIZE,
"--learning_rate=1e-5",
"--num_train_epochs=10",
"--output_dir=%s"%os.path.join(EXPERIMENT_DIR, TASK_NAME),
"--logging_dir=%s/logs"%os.path.join(EXPERIMENT_DIR, TASK_NAME),
"--logging_steps=14270",
"--evaluate_during_training",
"--eval_step=14270",
"--save_total_limit=2",
"--save_steps=14270",
"--gradient_accumulation_steps=1",
"--overwrite_output_dir"
]

model_args, data_args, training_args = parser.parse_args_into_dataclasses(args=custom_sysargv)

# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.WARN if training_args.local_rank in [-1, 0] else logging.WARN,
)
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    training_args.local_rank,
    training_args.device,
    training_args.n_gpu,
    bool(training_args.local_rank != -1),
    training_args.fp16,
)
logger.info("Training/evaluation parameters %s", training_args)

set_seed(training_args.seed)
training_args.seed

num_labels = superglue_tasks_num_labels[data_args.task_name]
output_mode = superglue_output_modes[data_args.task_name]
print ("Task:", data_args.task_name, "Labels:", num_labels, ', Output', output_mode)




Task: record Labels: 2 , Output classification


In [5]:
config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else     model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )

In [6]:
tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

In [7]:
model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
)
#model = BertForNLI.from_pretrained(model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir)

- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
train_dataset = SuperGlueDataset(data_args.data_dir, tokenizer=tokenizer, 
                    task=data_args.task_name, max_seq_length=data_args.max_seq_length, 
                    overwrite_cache=data_args.overwrite_cache, mode=Split.train,) if training_args.do_train else None

eval_dataset = SuperGlueDataset(data_args.data_dir, tokenizer=tokenizer, 
                    task=data_args.task_name, max_seq_length=data_args.max_seq_length, 
                    overwrite_cache=data_args.overwrite_cache, mode=Split.dev,) if training_args.do_eval else None

test_dataset = SuperGlueDataset(data_args.data_dir, tokenizer=tokenizer, 
                    task=data_args.task_name, max_seq_length=data_args.max_seq_length, 
                    overwrite_cache=data_args.overwrite_cache, mode=Split.test,) if training_args.do_predict else None

In [9]:
def compute_metrics(p: EvalPrediction) -> Dict:
    guids = list(map(lambda x: x.guid, eval_dataset.features))
    answers = eval_dataset.answers["dev"]
    return superglue_compute_metrics(data_args.task_name, p.predictions, p.label_ids, guids=guids, answers=answers)

In [10]:
# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [11]:
if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=14270.0, style=ProgressStyle(description_…



{'loss': 0.3274291346503582, 'learning_rate': 9e-06, 'epoch': 1.0, 'step': 14270}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=4655.0, style=ProgressStyle(description_…


{'eval_loss': 0.3065841975156718, 'eval_f1': 0.6803904044611042, 'eval_em': 0.6758989310009719, 'eval_em_and_f1': 0.6781446677310381, 'epoch': 1.0, 'step': 14270}





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=14270.0, style=ProgressStyle(description_…

{'loss': 0.23146719710735608, 'learning_rate': 8.000000000000001e-06, 'epoch': 2.0, 'step': 28540}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=4655.0, style=ProgressStyle(description_…


{'eval_loss': 0.3262623070650179, 'eval_f1': 0.6753757115090934, 'eval_em': 0.6699465500485908, 'eval_em_and_f1': 0.6726611307788422, 'epoch': 2.0, 'step': 28540}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=14270.0, style=ProgressStyle(description_…





KeyboardInterrupt: 

In [12]:
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import torch

eval_dataloader = trainer.get_eval_dataloader(eval_dataset)
model = trainer.model
batch_size = eval_dataloader.batch_size
eval_losses: List[float] = []
preds: torch.Tensor = None
label_ids: torch.Tensor = None
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [13]:
from tqdm.auto import tqdm, trange

for inputs in tqdm(eval_dataloader, desc="evaluating"):
    loss, logits, labels = trainer.prediction_step(model, inputs, trainer.prediction_loss_only)
    if loss is not None:
        eval_losses.append(loss)
    if logits is not None:
        preds = logits if preds is None else torch.cat((preds, logits), dim=0)
    if labels is not None:
        label_ids = labels if label_ids is None else torch.cat((label_ids, labels), dim=0)

HBox(children=(FloatProgress(value=0.0, description='evaluating', max=4655.0, style=ProgressStyle(description_…




In [None]:
trainer.prediction_loss_only

In [14]:
if preds is not None:
    preds = preds.cpu().numpy()
if label_ids is not None:
    label_ids = label_ids.cpu().numpy()

In [None]:
trainer.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))

In [15]:
guids = list(map(lambda x: x.guid, eval_dataset.features))
answers = eval_dataset.answers["dev"]
superglue_compute_metrics(data_args.task_name, preds, label_ids, guids=guids, answers=answers)

{'f1': 0.6844468739876902,
 'em': 0.6790573372206026,
 'em_and_f1': 0.6817521056041465}

In [None]:
prev_record_id = ""
prev_question_id = ""
counter = -1
qgroup = []
for key in guids:
    _, record_id, question_id, _ = key.split("-")
    if (prev_record_id == record_id) & (prev_question_id == question_id):
        qgroup.append(counter)
        continue
    else:
        prev_record_id = record_id
        prev_question_id = question_id
        counter += 1
        qgroup.append(counter)


In [None]:
preds[np.array(qgroup)==0]

In [None]:
label_ids[np.array(qgroup)==0]

In [None]:
label_ids.shape

In [None]:
preds1 = np.argmax(preds, axis=1)
guids = list(map(lambda x: x.guid, eval_dataset.features))
answers = eval_dataset.answers['dev']

In [None]:
from collections import Counter, defaultdict
qst2ans = defaultdict(list)

In [None]:
for idx, pred, label in zip(guids, preds, labels):
    qst_idx = (idx[0], idx[1])
    qst2ans[qst_idx].append((idx[2], pred))

In [None]:
import re
import string

def normalize_answer(s):
        """Lower text and remove punctuation, articles and extra whitespace.
        From official ReCoRD eval script """
        def remove_articles(text):
            return re.sub(r"\b(a|an|the)\b", " ", text)
        def white_space_fix(text):
            return " ".join(text.split())
        def remove_punc(text):
            exclude = set(string.punctuation)
            return "".join(ch for ch in text if ch not in exclude)
        def lower(text):
            return text.lower()
        return white_space_fix(remove_articles(remove_punc(lower(s))))

def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """ Compute max metric between prediction and each ground truth.
    From official ReCoRD eval script """
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)


def _record_f1_score(prediction, ground_truth):
    """ Compute normalized token level F1
    From official ReCoRD eval script """
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def _record_em_score(prediction, ground_truth):
    """ Compute normalized exact match
    From official ReCoRD eval script """
    return normalize_answer(prediction) == normalize_answer(ground_truth)


In [None]:
f1s, ems = [], []
for qst, idxs_and_prds in qst2ans.items():
    cands, golds = answers[qst]
    idxs_and_prds.sort(key=lambda x: x[0])
    logits = np.vstack([i[1] for i in idxs_and_prds])
    # take the most probable choice as the prediction
    pred_idx = softmax(logits, axis=1)[:, -1].argmax().item()
    pred = cands[pred_idx]
    # compute metrics
    f1 = metric_max_over_ground_truths(_record_f1_score, pred, golds)
    em = metric_max_over_ground_truths(_record_em_score, pred, golds)
    f1s.append(f1)
    ems.append(em)

In [None]:
avg_f1 = sum(f1s) / len(f1s)
avg_em = sum(ems) / len(ems)
em_and_f1 = (avg_em + avg_f1) / 2

In [None]:
{"f1": avg_f1, "em": avg_em, "em_and_f1": em_and_f1}

In [17]:
guids

[[2, 2, 0],
 [2, 2, 1],
 [2, 2, 2],
 [2, 2, 3],
 [2, 2, 4],
 [2, 2, 5],
 [2, 2, 6],
 [2, 2, 7],
 [2, 2, 8],
 [2, 2, 9],
 [2, 2, 10],
 [3, 3, 0],
 [3, 3, 1],
 [3, 3, 2],
 [3, 3, 3],
 [3, 3, 4],
 [3, 3, 5],
 [3, 3, 6],
 [4, 4, 0],
 [4, 4, 1],
 [4, 4, 2],
 [4, 4, 3],
 [4, 4, 4],
 [4, 4, 5],
 [4, 4, 6],
 [4, 4, 7],
 [4, 4, 8],
 [4, 4, 9],
 [4, 4, 10],
 [4, 4, 11],
 [4, 4, 12],
 [4, 5, 0],
 [4, 5, 1],
 [4, 5, 2],
 [4, 5, 3],
 [4, 5, 4],
 [4, 5, 5],
 [4, 5, 6],
 [4, 5, 7],
 [4, 5, 8],
 [4, 5, 9],
 [4, 5, 10],
 [4, 5, 11],
 [4, 5, 12],
 [4, 6, 0],
 [4, 6, 1],
 [4, 6, 2],
 [4, 6, 3],
 [4, 6, 4],
 [4, 6, 5],
 [4, 6, 6],
 [4, 6, 7],
 [4, 6, 8],
 [4, 6, 9],
 [4, 6, 10],
 [4, 6, 11],
 [4, 6, 12],
 [4, 7, 0],
 [4, 7, 1],
 [4, 7, 2],
 [4, 7, 3],
 [4, 7, 4],
 [4, 7, 5],
 [4, 7, 6],
 [4, 7, 7],
 [4, 7, 8],
 [4, 7, 9],
 [4, 7, 10],
 [4, 7, 11],
 [4, 7, 12],
 [4, 8, 0],
 [4, 8, 1],
 [4, 8, 2],
 [4, 8, 3],
 [4, 8, 4],
 [4, 8, 5],
 [4, 8, 6],
 [4, 8, 7],
 [4, 8, 8],
 [4, 8, 9],
 [4, 8, 10],
 [4, 8, 11],
 

In [18]:
answers

{(2,
  2): (['Manchester United',
   'Barcelona',
   'Neymar',
   'Brazil',
   'Louis van Gaal',
   'Old Trafford',
   'Neymar',
   'Manchester United',
   'Robin van Persie',
   'Radamel Falcao',
   'Javier Hernandez'], ['Manchester United', 'Manchester United']),
 (3,
  3): (['Germany',
   'New Year',
   'Munich',
   'ISIS',
   'Hauptbahnhof',
   'Pasing',
   'Hubertus Andrae'], ['Munich']),
 (4,
  4): (['Palm Beach',
   'Florida',
   'CNN',
   'Donald Trump',
   'White House',
   'Washington',
   'Palm Beach',
   'Winter White House',
   'Florida',
   'East Coast',
   'Mar',
   'Lago',
   'Trump'], ['Mar']),
 (4,
  5): (['Palm Beach',
   'Florida',
   'CNN',
   'Donald Trump',
   'White House',
   'Washington',
   'Palm Beach',
   'Winter White House',
   'Florida',
   'East Coast',
   'Mar',
   'Lago',
   'Trump'], ['Lago']),
 (4,
  6): (['Palm Beach',
   'Florida',
   'CNN',
   'Donald Trump',
   'White House',
   'Washington',
   'Palm Beach',
   'Winter White House',
   'Florida

In [20]:
preds[11:18]

array([[-0.84111613,  0.7519106 ],
       [ 3.8114934 , -4.457633  ],
       [-1.2014596 ,  1.3730025 ],
       [ 4.1645846 , -4.893137  ],
       [ 2.4763541 , -2.872811  ],
       [ 2.8611062 , -3.4158652 ],
       [ 4.179798  , -4.9314504 ]], dtype=float32)

In [21]:
inputs

{'labels': tensor([0, 0, 1, 0, 0, 0, 1, 0], device='cuda:0'),
 'input_ids': tensor([[ 101,  138, 1300,  ...,    0,    0,    0],
         [ 101,  138, 1300,  ...,    0,    0,    0],
         [ 101,  138, 1300,  ...,    0,    0,    0],
         ...,
         [ 101,  138, 1300,  ...,    0,    0,    0],
         [ 101,  138, 1300,  ...,    0,    0,    0],
         [ 101,  138, 1300,  ...,    0,    0,    0]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0')}