In [15]:
import argparse
import glob
import logging
import os
import random
import timeit

import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from transformers import (
    MODEL_FOR_QUESTION_ANSWERING_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    squad_convert_examples_to_features,
)
from transformers.data.metrics.squad_metrics import (
    compute_predictions_log_probs,
    compute_predictions_logits,
    squad_evaluate,
)
from transformers.data.processors.squad import SquadResult, SquadV1Processor, SquadV2Processor

In [16]:
from torch.utils.tensorboard import SummaryWriter

In [17]:
logger = logging.getLogger(__name__)

MODEL_CONFIG_CLASSES = list(MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)


In [18]:
parser = argparse.ArgumentParser()

# Required parameters
parser.add_argument(
    "--model_type",
    default=None,
    type=str,
    required=True,
    help="Model type selected in the list: " + ", ".join(MODEL_TYPES),
)
parser.add_argument(
    "--model_name_or_path",
    default=None,
    type=str,
    required=True,
    help="Path to pretrained model or model identifier from huggingface.co/models",
)
parser.add_argument(
    "--output_dir",
    default=None,
    type=str,
    required=True,
    help="The output directory where the model checkpoints and predictions will be written.",
)

# Other parameters
parser.add_argument(
    "--data_dir",
    default=None,
    type=str,
    help="The input data dir. Should contain the .json files for the task."
    + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
)
parser.add_argument(
    "--train_file",
    default=None,
    type=str,
    help="The input training file. If a data dir is specified, will look for the file there"
    + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
)
parser.add_argument(
    "--predict_file",
    default=None,
    type=str,
    help="The input evaluation file. If a data dir is specified, will look for the file there"
    + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
)
parser.add_argument(
    "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
)
parser.add_argument(
    "--tokenizer_name",
    default="",
    type=str,
    help="Pretrained tokenizer name or path if not the same as model_name",
)
parser.add_argument(
    "--cache_dir",
    default="",
    type=str,
    help="Where do you want to store the pre-trained models downloaded from s3",
)

parser.add_argument(
    "--version_2_with_negative",
    action="store_true",
    help="If true, the SQuAD examples contain some that do not have an answer.",
)
parser.add_argument(
    "--null_score_diff_threshold",
    type=float,
    default=0.0,
    help="If null_score - best_non_null is greater than the threshold predict null.",
)

parser.add_argument(
    "--max_seq_length",
    default=384,
    type=int,
    help="The maximum total input sequence length after WordPiece tokenization. Sequences "
    "longer than this will be truncated, and sequences shorter than this will be padded.",
)
parser.add_argument(
    "--doc_stride",
    default=128,
    type=int,
    help="When splitting up a long document into chunks, how much stride to take between chunks.",
)
parser.add_argument(
    "--max_query_length",
    default=64,
    type=int,
    help="The maximum number of tokens for the question. Questions longer than this will "
    "be truncated to this length.",
)
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
parser.add_argument(
    "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
)
parser.add_argument(
    "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
)

parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument(
    "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
)
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument(
    "--gradient_accumulation_steps",
    type=int,
    default=1,
    help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
    "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
)
parser.add_argument(
    "--max_steps",
    default=-1,
    type=int,
    help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
)
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
parser.add_argument(
    "--n_best_size",
    default=20,
    type=int,
    help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
)
parser.add_argument(
    "--max_answer_length",
    default=30,
    type=int,
    help="The maximum length of an answer that can be generated. This is needed because the start "
    "and end predictions are not conditioned on one another.",
)
parser.add_argument(
    "--verbose_logging",
    action="store_true",
    help="If true, all of the warnings related to data processing will be printed. "
    "A number of warnings are expected for a normal SQuAD evaluation.",
)
parser.add_argument(
    "--lang_id",
    default=0,
    type=int,
    help="language id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)",
)

parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
parser.add_argument(
    "--eval_all_checkpoints",
    action="store_true",
    help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
)
parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
parser.add_argument(
    "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
)
parser.add_argument(
    "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
)
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
parser.add_argument(
    "--fp16",
    action="store_true",
    help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
)
parser.add_argument(
    "--fp16_opt_level",
    type=str,
    default="O1",
    help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
    "See details at https://nvidia.github.io/apex/amp.html",
)
parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
parser.add_argument("--threads", type=int, default=1, help="multiple threads for converting example to features")


_StoreAction(option_strings=['--threads'], dest='threads', nargs=None, const=None, default=1, type=<class 'int'>, choices=None, help='multiple threads for converting example to features', metavar=None)

In [19]:
"""
--model_type bert \
        --model_name_or_path bert-base-uncased \
        --do_train \
        --do_eval \
        --do_lower_case \
        --train_file $DATA_DIR/train-v1.1.json \
        --predict_file $DATA_DIR/dev-v1.1.json \
        --per_gpu_train_batch_size 16 \
        --learning_rate 3e-5 \
        --num_train_epochs 2.0 \
        --max_seq_length 384 \
        --doc_stride 128 \
        --output_dir $EXPERIMENT_DIR/$MODEL_NAME/$DATESTAMP \
        --threads 12
"""

'\n--model_type bert         --model_name_or_path bert-base-uncased         --do_train         --do_eval         --do_lower_case         --train_file $DATA_DIR/train-v1.1.json         --predict_file $DATA_DIR/dev-v1.1.json         --per_gpu_train_batch_size 16         --learning_rate 3e-5         --num_train_epochs 2.0         --max_seq_length 384         --doc_stride 128         --output_dir $EXPERIMENT_DIR/$MODEL_NAME/$DATESTAMP         --threads 12\n'

In [20]:
MODEL_TYPE = "bert"
MODEL_NAME = "bert-base-cased"
DATESTAMP = "20200810"
SQUAD_DIR = "/home/keyur/medhas/squad_data/"
TASK_NAME = "squad"
PER_DEVICE_BATCH_SIZE = 24
EXPERIMENT_DIR="/mnt/data/medhas/squad_experiments/%s/%s"%(MODEL_NAME, DATESTAMP)


custom_sysargv = [
"--model_type=%s"%MODEL_TYPE,
"--model_name_or_path=%s"%MODEL_NAME,
"--do_train",
"--do_eval",
"--do_lower_case",
"--train_file=%s/train-v1.1.json"%SQUAD_DIR,
"--predict_file=%s/dev-v1.1.json"%SQUAD_DIR,
"--max_seq_length=512",
"--per_gpu_train_batch_size=%s"%PER_DEVICE_BATCH_SIZE,
"--learning_rate=3e-5",
"--num_train_epochs=1",
"--max_seq_length=384",
"--doc_stride=128",
"--output_dir=%s"%EXPERIMENT_DIR,
"--logging_steps=565",
"--evaluate_during_training",
"--save_steps=1000",
"--gradient_accumulation_steps=1",
"--overwrite_output_dir",
"--threads=12"
]

In [21]:
args = parser.parse_args(args=custom_sysargv)

In [22]:
args

Namespace(adam_epsilon=1e-08, cache_dir='', config_name='', data_dir=None, do_eval=True, do_lower_case=True, do_train=True, doc_stride=128, eval_all_checkpoints=False, evaluate_during_training=True, fp16=False, fp16_opt_level='O1', gradient_accumulation_steps=1, lang_id=0, learning_rate=3e-05, local_rank=-1, logging_steps=565, max_answer_length=30, max_grad_norm=1.0, max_query_length=64, max_seq_length=384, max_steps=-1, model_name_or_path='bert-base-cased', model_type='bert', n_best_size=20, no_cuda=False, null_score_diff_threshold=0.0, num_train_epochs=1.0, output_dir='/mnt/data/medhas/squad_experiments/bert-base-cased/20200810', overwrite_cache=False, overwrite_output_dir=True, per_gpu_eval_batch_size=8, per_gpu_train_batch_size=24, predict_file='/home/keyur/medhas/squad_data//dev-v1.1.json', save_steps=1000, seed=42, server_ip='', server_port='', threads=12, tokenizer_name='', train_file='/home/keyur/medhas/squad_data//train-v1.1.json', verbose_logging=False, version_2_with_negativ

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
args.device = device

In [24]:
def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

set_seed(args)

In [25]:
args.model_type = args.model_type.lower()
config = AutoConfig.from_pretrained(
    args.config_name if args.config_name else args.model_name_or_path,
    cache_dir=args.cache_dir if args.cache_dir else None,
)

In [26]:
tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

In [27]:
model = AutoModelForQuestionAnswering.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and a

In [28]:
if args.local_rank == 0:
    # Make sure only the first process in distributed training will download model & vocab
    torch.distributed.barrier()

In [29]:
model.to(args.device)

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [30]:
input_dir = args.data_dir if args.data_dir else "."

In [31]:
processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
examples = processor.get_train_examples(args.data_dir, filename=args.train_file)

100%|██████████| 442/442 [00:26<00:00, 16.94it/s]


In [57]:
features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True,
            return_dataset="pt",
            threads=args.threads,
        )

convert squad examples to features: 100%|██████████| 87599/87599 [01:19<00:00, 1101.36it/s]
add example index and unique id: 100%|██████████| 87599/87599 [00:00<00:00, 916704.10it/s]


In [63]:
examples1 = examples[100:110]
features1, dataset1 = squad_convert_examples_to_features(
            examples=examples1,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True,
            return_dataset="pt",
        )

convert squad examples to features: 100%|██████████| 10/10 [00:00<00:00, 55.21it/s]
add example index and unique id: 100%|██████████| 10/10 [00:00<00:00, 21765.98it/s]


In [43]:
for i in range(len(features1)-1):
    if (features1[i].qas_id == features1[i+1].qas_id):
        print(i)

3
5
7
9
11


In [102]:
def get_all_doc_tokens(i):
    example = examples1[i]
    all_doc_tokens = []
    for (i, token) in enumerate(example.doc_tokens):
        sub_tokens = tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            all_doc_tokens.append(sub_token)
    return all_doc_tokens


def print_squad_feature(i):
    print("Question Id: ", features1[i].qas_id)
    print("paragraph_len: ", features1[i].paragraph_len)
    print("tokens: ", len(features1[i].tokens))
    print("start_position-end_position: ", features1[i].start_position, features1[i].end_position)
    print("CLS index: ", features1[i].cls_index)
    print("Complete context length: ", len(get_all_doc_tokens(features1[i].example_index)))
    print("Text: ", " ".join(tokenizer.convert_ids_to_tokens(features1[i].input_ids)))
    print("Complete Context:", " ".join(get_all_doc_tokens(features1[i].example_index)))
    


In [103]:
print_squad_feature(3)

Question Id:  573388ce4776f41900660cc3
paragraph_len:  360
tokens:  384
start_position-end_position:  89 92
CLS index:  0
Complete context length:  368
Text:  [CLS] cat ##hol ##ic people identified with not ##re dam ##e , what religious group did people feel ya ##le represented ? [SEP] the success of its football team made not ##re dam ##e a household name . the success of note dam ##e reflected rising status of i ##ris ##h am ##eric ##ans and cat ##hol ##ics in the 1920s . cat ##hol ##ics rallied up around the team and listen to the games on the radio , especially when it knocked off the schools that symbol ##ized the protest ##ant establishment in am ##eric ##a — ha ##rva ##rd , ya ##le , prince ##ton , and army . yet this role as high - profile flagship institution of cat ##hol ##ici ##sm made it an easy target of anti - cat ##hol ##ici ##sm . the most remarkable episode of violence was the clash between not ##re dam ##e students and the k ##u k ##lux k ##lan in 1924 . na ##ti ##vis

In [104]:
print_squad_feature(4)

Question Id:  573388ce4776f41900660cc3
paragraph_len:  240
tokens:  264
start_position-end_position:  0 0
CLS index:  0
Complete context length:  368
Text:  [CLS] cat ##hol ##ic people identified with not ##re dam ##e , what religious group did people feel ya ##le represented ? [SEP] ##e students and the k ##u k ##lux k ##lan in 1924 . na ##ti ##vis ##m and anti - cat ##hol ##ici ##sm , especially when directed towards immigrants , were corners ##tones of the k ##k ##k ' s rhetoric , and not ##re dam ##e was seen as a symbol of the threat posed by the cat ##hol ##ic church . the k ##lan decided to have a week - long k ##lav ##ern in south bend . clashes with the student body started on march 17 , when students , aware of the anti - cat ##hol ##ic an ##imo ##sity , blocked the k ##lan ##sm ##en from descending from their trains in the south bend station and ripped the k ##k ##k clothes and re ##gal ##ia . on may 19 thousands of students mass ##ed downtown protesting the k ##lav ##ern , 