In [1]:
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

In [2]:
from src.transformers import modeling_with_arc_xlnet

In [3]:
#from src.transformers import modeling_xlnet

In [4]:
import argparse
import glob
import logging
import os
import random

In [5]:
#!pip install spacy

In [6]:
from kg_loader import KG
from graph_utils import GraphEncoder

In [7]:
from src.transformers.tokenization_xlnet import XLNetTokenizer
from src.transformers.tokenization_bert import BertTokenizer
from src.transformers.tokenization_roberta import RobertaTokenizer

In [8]:
from src.transformers.modeling_with_arc_xlnet import XLNetForMultipleChoice
from src.transformers.modeling_roberta import RobertaForMultipleChoice
from src.transformers.modeling_bert import BertForMultipleChoice

In [9]:
from src.transformers.optimization import AdamW, get_linear_schedule_with_warmup

In [10]:
from src.transformers.configuration_xlnet import XLNetConfig
from src.transformers.configuration_bert import BertConfig
from src.transformers.configuration_roberta import RobertaConfig

In [11]:
from src.transformers.file_utils import WEIGHTS_NAME

In [12]:
from utils_arc import convert_examples_to_features, processors

In [13]:
from src.transformers.tokenization_utils import PreTrainedTokenizer

In [14]:
try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

In [15]:
logger = logging.getLogger(__name__)

In [16]:
ALL_MODELS = sum(
    (
        tuple(conf.pretrained_config_archive_map.keys())
        for conf in (BertConfig, RobertaConfig, XLNetConfig)
    ),
    (),
)

In [17]:
MODEL_CLASSES = {
    "bert" : (BertConfig, BertForMultipleChoice, BertTokenizer),
    "XLNet" : (XLNetConfig, XLNetForMultipleChoice, XLNetTokenizer),
    "Roberta" : (RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer),
}

In [18]:
def select_field(features, field):
    return [
        [
            choice[field] 
            for choice in feature.choices_features
        ]
        for feature in features
]

In [19]:
preds = np.array([1,4,2,3,0,3,2,1,2,3,3,2,4,1,2,1,3,2,2,2,1,1,3,3,2,2,1,3,3,2,2,3,1,3]).astype('int64')
labels = np.array([3,2,4,1,1,3,1,2,2,4,1,2,4,1,4,1,4,1,2,3,4,1,4,1,4,1,2,3,2,3,1,2,3,1]).astype('int64')

In [20]:
def simple_accuracy(preds, labels):
    print("labels {} \n".format(labels))
    print("label.shape {} \n".format(labels.shape))
    print("label data type {} \n".format(labels.dtype))
    print("preds shape {} \n".format(preds.shape))
    print("preds data type {} \n".format(preds.dtype))
    print(preds)
    return (preds == labels).mean()

In [21]:
x = simple_accuracy(preds,labels)

labels [3 2 4 1 1 3 1 2 2 4 1 2 4 1 4 1 4 1 2 3 4 1 4 1 4 1 2 3 2 3 1 2 3 1] 

label.shape (34,) 

label data type int64 

preds shape (34,) 

preds data type int64 

[1 4 2 3 0 3 2 1 2 3 3 2 4 1 2 1 3 2 2 2 1 1 3 3 2 2 1 3 3 2 2 3 1 3]


In [22]:
x

0.2647058823529412

In [23]:
#def simple_accuracy(out, labels):
    #outputs = np.argmax(out, axis=0)
    #return np.sum(outputs == labels)

In [24]:
seed = 42
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

In [25]:
def train(train_dataset, model, tokenizer):
    # Train the model
    if local_rank in [-1,0]:
        tb_writer = SummaryWriter()

    train_batch_size = per_gpu_train_batch_size * max(1, n_gpu)
    train_sampler = RandomSampler(train_dataset) if local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler = train_sampler, batch_size = train_batch_size)
    
    num_train_epochs = 3.0
    if max_steps > 0:
        t_total = max_steps
        num_train_epochs = max_steps // (len(train_dataloader) // gradient_accumulation_steps) + 1

    else:
        t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs

    # Prepare optimizer and schedule ( Linear warmpu and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr = learning_rate, eps = adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps = t_total
    )

    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True
        )

    # Train
    logger.info("******* Running Training *********")
    logger.info(" Num Examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d", per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        train_batch_size
        * gradient_accumulation_steps
        * (torch.distributed.get_world_size() if local_rank != -1 else 1),
    )

    logger.info(" Gradient Accumulation steps = %d", gradient_accumulation_steps)
    logger.info(" Total optimization steps =%d",t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    best_dev_acc = 0.0
    best_steps = 0
    model.zero_grad()
    train_iterator = trange(int(num_train_epochs), desc ="Epoch", 
                            disable=local_rank not in [-1, 0])
    set_seed(seed) # Added here for reproductibility
    
    ent = torch.from_numpy(np.stack([kg.kg_embeddings['ent_embeddings'] 
                                      for _ in range(1)],axis=0)).to(device)
    rel = torch.from_numpy(np.stack([kg.kg_embeddings['rel_matrices'] 
                                      for _ in range(1)],axis=0)).to(device)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", 
                              disable=local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2]
                if model_type in ["bert", "xlnet"]
                else None,  # XLM don't use segment_ids
                
                "gpre" : batch[3],
                "ghyp" : batch[4],
                "ent" : ent,
                "rel" : rel,
                "labels": batch[5],
            }

            outputs = model(**inputs)
            loss = outputs[0] # model outputs are always tuple in transformers

            if n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu parallel training
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            if fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            tr_loss += loss.item()
            if(step +1) % gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step() # update learning rate scheduler
                model.zero_grad()
                global_step += 1

                if local_rank in [-1,0] and logging_steps > 0 and global_step % logging_steps == 0:
                    # Log metrics
                    # Log metrics
                    if (
                        local_rank == -1 and evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                        if results["eval_acc"] > best_dev_acc:
                            best_dev_acc = results["eval_acc"]
                            best_steps = global_step
                            if do_test:
                                results_test = evaluate(model, tokenizer, test=True)
                                for key, value in results_test.items():
                                    tb_writer.add_scalar("test_{}".format(key), value, global_step)
                                logger.info(
                                    "test acc: %s, loss: %s, global steps: %s",
                                    str(results_test["eval_acc"]),
                                    str(results_test["eval_loss"]),
                                    str(global_step),
                                )
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / logging_steps, global_step)
                    logger.info(
                        "Average loss: %s at global step: %s",
                        str((tr_loss - logging_loss) / logging_steps),
                        str(global_step),
                    )
                    logging_loss = tr_loss

                if local_rank in [-1, 0] and save_steps > 0 and global_step % save_steps == 0:
                    # Save model checkpoint
                    print(output_dir)
                    output_dir1 = os.path.join(output_dir, "checkpoint-{}".format(global_step))
                    print(output_dir1)
                    if not os.path.exists(output_dir1):
                        os.makedirs(output_dir1)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir1)
                    tokenizer.save_vocabulary(output_dir1)
                    torch.save(output_dir,os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir1)
            if max_steps > 0 and global_step > max_steps:
                epoch_iterator.close()
                break
    if local_rank in [-1,0]:
        tb_writer.close()

    return global_step, tr_loss / global_step, best_steps

In [26]:
def evaluate(model, tokenizer, prefix="", test = False):
    eval_task_names = (task_name,)
    eval_outputs_dirs = (output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(
            eval_task, tokenizer, evaluate=True, test=False)

        if not os.path.exists(eval_output_dir) and local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        eval_batch_size = per_gpu_eval_batch_size * max(1, n_gpu)
        print("eval_batch_size {} \n".format(eval_batch_size))
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=eval_batch_size)

        # multi-gpu evaluate
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        ent = torch.from_numpy(np.stack([kg.kg_embeddings['ent_embeddings'] 
                                          for _ in range(1)],axis=0)).to(device)
        rel = torch.from_numpy(np.stack([kg.kg_embeddings['rel_matrices'] 
                                          for _ in range(1)],axis=0)).to(device)
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2]
                    if model_type in ["bert", "xlnet"]
                    else None,  # XLM don't use segment_ids
                    "gpre" : batch[3],
                    "ghyp" : batch[4],
                    "ent" : ent,
                    "rel" : rel,
                    "labels": batch[5],
                }
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = np.argmax(preds, axis=1)
        acc = simple_accuracy(preds, out_label_ids)
        result = {"eval_acc": acc, "eval_loss": eval_loss}
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, "is_test_" + str(test).lower() + "_eval_results.txt")

        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(str(prefix) + " is test:" + str(test)))
            writer.write("model           =%s\n" % str(model_name_or_path))
            writer.write(
                "total batch size=%d\n"
                % (
                    per_gpu_train_batch_size
                    * gradient_accumulation_steps
                    * (torch.distributed.get_world_size() if local_rank != -1 else 1)
                )
            )
            writer.write("train num epochs=%d\n" % num_train_epochs)
            writer.write("fp16            =%s\n" % fp16)
            writer.write("max seq length  =%d\n" % max_seq_length)
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
    return results


In [27]:
def load_and_cache_examples(task, tokenizer, evaluate=False, test=False):
    if local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    processor = processors[task]()
    # Load data features from cache or dataset file
    if evaluate:
        cached_mode = "dev"
    elif test:
        cached_mode = "test"
    else:
        cached_mode = "train"
    assert not (evaluate and test)
    cached_features_file = os.path.join(
        data_dir,
        "cached_{}_{}_{}_{}".format(
            cached_mode,
            list(filter(None, model_name_or_path.split("/"))).pop(),
            str(max_seq_length),
            str(task),
        ),
    )
    if os.path.exists(cached_features_file) and not overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", data_dir)
        label_list = processor.get_labels()
        if evaluate:
            examples = processor.get_dev_examples(data_dir)
        elif test:
            examples = processor.get_test_examples(data_dir)
        else:
            examples = processor.get_train_examples(data_dir)
        logger.info("Training number: %s", str(len(examples)))
        features = convert_examples_to_features(
                    examples,
                    label_list,
                    max_seq_length,
                    tokenizer,
                    graph_encoder,
                    kg,
                    pad_on_left=bool(model_type in ["xlnet"]),
                    pad_token_segment_id=4 if model_type in ["xlnet"] else 0,
                    
                )
        #print(features[0].choices_features)
        if local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
    print("all_input_ids.shape {} \n".format(all_input_ids.shape))
    all_input_mask = torch.tensor(select_field(features, "attention_mask"), dtype=torch.long)
    print("all_input_mask.shape {} \n".format(all_input_mask.shape))
    all_segment_ids = torch.tensor(select_field(features, "token_type_ids"), dtype =torch.long)
    print("all_segment_ids.shape {} \n".format(all_segment_ids.shape))
    all_prem_ids = torch.tensor(select_field(features,"gpre"), dtype = torch.long)
    print("all_prem_ids.shape {} \n".format(all_prem_ids.shape))
    all_hyp_ids = torch.tensor(select_field(features,"ghyp"),dtype = torch.long)
    print("all_hyp_ids.shape {} \n".format(all_hyp_ids.shape))
    all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long)
    print("all_label_ids.shape {} \n".format(all_label_ids.shape))
    # to do changes here:
    dataset = TensorDataset(all_input_ids,all_input_mask, all_segment_ids, all_prem_ids, all_hyp_ids, all_label_ids,)
        
    return dataset

In [28]:
task_name = 'arc'

In [29]:
data_dir = "data/arc/processed"

In [30]:
model_type = "XLNet"

In [31]:
MODEL_CLASSES.keys()

dict_keys(['bert', 'XLNet', 'Roberta'])

In [32]:
model_name_or_path = 'xlnet-base-cased'
do_lower_case = False # set true when using uncased model

In [33]:
#ALL_MODELS

In [34]:
processors.keys()

dict_keys(['arc', 'race', 'swag'])

In [35]:
task_name = 'arc'

In [36]:
output_dir = "save/"

In [37]:
config_name = ""

In [38]:
do_train = True
do_eval = True
do_test = True

In [39]:
evaluate_during_training = False

In [40]:
per_gpu_train_batch_size = 1

In [41]:
per_gpu_eval_batch_size = 1

In [42]:
gradient_accumulation_steps = 1

In [43]:
learning_rate = 5e-5

In [44]:
weight_decay = 0.0

In [45]:
adam_epsilon = 1e-8

In [46]:
max_grad_norm = 1.0

In [47]:
num_train_epochs = 3.0

In [48]:
#If > 0: set total number of training steps to perform. Override num_train_epochs.
max_steps = -1

In [49]:
warmup_steps = 0

In [50]:
logging_steps = 50

In [51]:
save_steps = 1000

In [52]:
eval_all_checkpoints = True

In [53]:
no_cuda = False

In [54]:
cache_dir = ""

In [55]:
overwrite_output_dir = True

In [56]:
overwrite_cache = True

In [57]:
fp16 = False

In [58]:
fp16_opt_level = False

In [59]:
path_to_kg = "data/conceptnet"

In [60]:
max_ent_pre = 262

In [61]:
max_ent_hyp = 83

In [62]:
local_rank = -1

In [63]:
server_ip = ""
server_port = ""

In [64]:
if (
        os.path.exists(output_dir)
        and os.listdir(output_dir)
        and do_train
        and not overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                output_dir
            )
        )

In [65]:
if server_ip and server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(server_ip, server_port), redirect_output=True)
        ptvsd.wait_for_attach()

In [66]:
if local_rank == -1 or no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
else:
    torch.cuda.set_device(local_rank)
    device = torch.device("cuda", local_rank)
    torch.distributed.init_process_group(backend="nccl")
    n_gpu = 1
device = device

In [67]:
device

device(type='cuda')

In [68]:
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if local_rank in [-1, 0] else logging.WARN,
    )
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    local_rank,
    device,
    n_gpu,
    bool(local_rank != -1),
    fp16,
)



In [69]:
set_seed(seed)

In [70]:
task_name = task_name.lower()

In [71]:
if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
processor = processors[task_name]()
label_list = processor.get_labels()
num_labels = len(label_list)

In [72]:
label_list

['0', '1', '2', '3']

In [73]:
num_labels

4

In [74]:
if local_rank not in [-1, 0]:
    torch.distributed.barrier() # Make sure only the first process in distributed training will download model and vocab

In [75]:
model_type = model_type.lower()
config_class, model_class, tokenizer_class = MODEL_CLASSES['XLNet']

In [76]:
config = config_class.from_pretrained(
    config_name if config_name else model_name_or_path,
    num_labels=num_labels,
    finetuning_task=task_name,
    cache_dir=cache_dir if cache_dir else None,
)

01/07/2020 14:18:04 - INFO - filelock -   Lock 139974592477968 acquired on /home/abhishek/.cache/torch/transformers/c9cc6e53904f7f3679a31ec4af244f4419e25ebc8e71ebf8c558a31cbcf07fc8.ef1824921bc0786e97dc88d55eb17aabf18aac90f24bd34c0650529e7ba27d6f.lock
01/07/2020 14:18:04 - INFO - filelock -   Lock 139974592477968 released on /home/abhishek/.cache/torch/transformers/c9cc6e53904f7f3679a31ec4af244f4419e25ebc8e71ebf8c558a31cbcf07fc8.ef1824921bc0786e97dc88d55eb17aabf18aac90f24bd34c0650529e7ba27d6f.lock
01/07/2020 14:18:04 - INFO - src.transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json from cache at /home/abhishek/.cache/torch/transformers/c9cc6e53904f7f3679a31ec4af244f4419e25ebc8e71ebf8c558a31cbcf07fc8.ef1824921bc0786e97dc88d55eb17aabf18aac90f24bd34c0650529e7ba27d6f
01/07/2020 14:18:04 - INFO - src.transformers.configuration_utils -   Model config {
  "attn_type": "bi",
  "bi_data": false,
  "bos_to

In [77]:
tokenizer_name=""

In [78]:
max_ent_pre = 262
max_ent_hyp = 83

In [79]:
tokenizer = tokenizer_class.from_pretrained(
    tokenizer_name if tokenizer_name else model_name_or_path,
    do_lower_case=do_lower_case,
    cache_dir=cache_dir if cache_dir else None,

)

01/07/2020 14:18:05 - INFO - filelock -   Lock 139974592503056 acquired on /home/abhishek/.cache/torch/transformers/dad589d582573df0293448af5109cb6981ca77239ed314e15ca63b7b8a318ddd.8b10bd978b5d01c21303cc761fc9ecd464419b3bf921864a355ba807cfbfafa8.lock
01/07/2020 14:18:05 - INFO - filelock -   Lock 139974592503056 released on /home/abhishek/.cache/torch/transformers/dad589d582573df0293448af5109cb6981ca77239ed314e15ca63b7b8a318ddd.8b10bd978b5d01c21303cc761fc9ecd464419b3bf921864a355ba807cfbfafa8.lock
01/07/2020 14:18:05 - INFO - src.transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model from cache at /home/abhishek/.cache/torch/transformers/dad589d582573df0293448af5109cb6981ca77239ed314e15ca63b7b8a318ddd.8b10bd978b5d01c21303cc761fc9ecd464419b3bf921864a355ba807cfbfafa8


In [80]:
kg = KG(path_to_kg)
graph_encoder = GraphEncoder(kg)

In [81]:
model = model_class.from_pretrained(
    model_name_or_path,
    kg = kg,
    from_tf=bool(".ckpt" in model_name_or_path),
    config = config,
    cache_dir=cache_dir if cache_dir else None,
    )

01/07/2020 14:18:08 - INFO - filelock -   Lock 139973964894352 acquired on /home/abhishek/.cache/torch/transformers/24197ba0ce5dbfe23924431610704c88e2c0371afa49149360e4c823219ab474.7eac4fe898a021204e63c88c00ea68c60443c57f94b4bc3c02adbde6465745ac.lock
01/07/2020 14:18:08 - INFO - filelock -   Lock 139973964894352 released on /home/abhishek/.cache/torch/transformers/24197ba0ce5dbfe23924431610704c88e2c0371afa49149360e4c823219ab474.7eac4fe898a021204e63c88c00ea68c60443c57f94b4bc3c02adbde6465745ac.lock
01/07/2020 14:18:08 - INFO - src.transformers.modeling_utils -   loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin from cache at /home/abhishek/.cache/torch/transformers/24197ba0ce5dbfe23924431610704c88e2c0371afa49149360e4c823219ab474.7eac4fe898a021204e63c88c00ea68c60443c57f94b4bc3c02adbde6465745ac


num_entities 1100498
num_relations 38


01/07/2020 14:18:17 - INFO - src.transformers.modeling_utils -   Weights of XLNetForMultipleChoice not initialized from pretrained model: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias', 'gmatch.ent_emb.weight', 'gmatch.linear1.weight', 'gmatch.linear1.bias', 'EntAttn.linear.weight', 'EntAttn.linear.bias', 'RelAttn.linear.weight', 'RelAttn.linear.bias', 'RelAttn.linear2.weight', 'RelAttn.linear2.bias']
01/07/2020 14:18:17 - INFO - src.transformers.modeling_utils -   Weights from pretrained model not used in XLNetForMultipleChoice: ['lm_loss.weight', 'lm_loss.bias']


In [82]:
model_class

src.transformers.modeling_with_arc_xlnet.XLNetForMultipleChoice

In [83]:
model.to(device)

XLNetForMultipleChoice(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, ele

In [84]:
logger.info("Training/evaluation parameters %s")
best_step = 0

01/07/2020 14:18:20 - INFO - __main__ -   Training/evaluation parameters %s


In [85]:
max_seq_length = 512

In [86]:
if do_train:
    train_dataset = load_and_cache_examples(task_name,tokenizer, evaluate =False)
    global_step, tr_loss, best_steps = train(train_dataset, model, tokenizer)
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

01/07/2020 14:18:20 - INFO - __main__ -   Creating features from dataset file at data/arc/processed
01/07/2020 14:18:20 - INFO - utils_arc -   LOOKING AT data/arc/processed train
01/07/2020 14:18:20 - INFO - __main__ -   Training number: 3353
01/07/2020 14:18:56 - INFO - __main__ -   Saving features into cached file data/arc/processed/cached_train_xlnet-base-cased_512_arc


all_input_ids.shape torch.Size([3353, 4, 512]) 

all_input_mask.shape torch.Size([3353, 4, 512]) 

all_segment_ids.shape torch.Size([3353, 4, 512]) 

all_prem_ids.shape torch.Size([3353, 4, 262]) 

all_hyp_ids.shape torch.Size([3353, 4, 83]) 

all_label_ids.shape torch.Size([3353]) 



01/07/2020 14:19:06 - INFO - __main__ -   ******* Running Training *********
01/07/2020 14:19:06 - INFO - __main__ -    Num Examples = 3353
01/07/2020 14:19:06 - INFO - __main__ -    Num Epochs = 3
01/07/2020 14:19:06 - INFO - __main__ -    Instantaneous batch size per GPU = 1
01/07/2020 14:19:06 - INFO - __main__ -     Total train batch size (w. parallel, distributed & accumulation) = 4
01/07/2020 14:19:06 - INFO - __main__ -    Gradient Accumulation steps = 1
01/07/2020 14:19:06 - INFO - __main__ -    Total optimization steps =2517
Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 1/839 [00:06<1:35:25,  6.83s/it][A
Iteration:   0%|          | 2/839 [00:07<1:09:40,  4.99s/it][A
Iteration:   0%|          | 3/839 [00:08<51:38,  3.71s/it]  [A
Iteration:   0%|          | 4/839 [00:08<39:01,  2.80s/it][A
Iteration:   1%|          | 5/839 [00:09<30:22,  2.18s/it][A
Iteration:   1%|          | 6/839 [00:10<24:16,  1.75s/it][A
Iteration:   1%|          | 7/839 [0

Iteration:  13%|█▎        | 109/839 [01:24<08:38,  1.41it/s][A
Iteration:  13%|█▎        | 110/839 [01:24<08:37,  1.41it/s][A
Iteration:  13%|█▎        | 111/839 [01:25<08:37,  1.41it/s][A
Iteration:  13%|█▎        | 112/839 [01:26<08:39,  1.40it/s][A
Iteration:  13%|█▎        | 113/839 [01:26<08:36,  1.40it/s][A
Iteration:  14%|█▎        | 114/839 [01:27<08:35,  1.41it/s][A
Iteration:  14%|█▎        | 115/839 [01:28<08:35,  1.40it/s][A
Iteration:  14%|█▍        | 116/839 [01:29<08:37,  1.40it/s][A
Iteration:  14%|█▍        | 117/839 [01:29<08:37,  1.39it/s][A
Iteration:  14%|█▍        | 118/839 [01:30<08:37,  1.39it/s][A
Iteration:  14%|█▍        | 119/839 [01:31<08:36,  1.39it/s][A
Iteration:  14%|█▍        | 120/839 [01:31<08:36,  1.39it/s][A
Iteration:  14%|█▍        | 121/839 [01:32<08:33,  1.40it/s][A
Iteration:  15%|█▍        | 122/839 [01:33<08:33,  1.40it/s][A
Iteration:  15%|█▍        | 123/839 [01:34<08:32,  1.40it/s][A
Iteration:  15%|█▍        | 124/839 [01:

Iteration:  28%|██▊       | 234/839 [02:54<07:21,  1.37it/s][A
Iteration:  28%|██▊       | 235/839 [02:54<07:19,  1.37it/s][A
Iteration:  28%|██▊       | 236/839 [02:55<07:16,  1.38it/s][A
Iteration:  28%|██▊       | 237/839 [02:56<07:14,  1.38it/s][A
Iteration:  28%|██▊       | 238/839 [02:57<07:11,  1.39it/s][A
Iteration:  28%|██▊       | 239/839 [02:57<07:09,  1.40it/s][A
Iteration:  29%|██▊       | 240/839 [02:58<07:08,  1.40it/s][A
Iteration:  29%|██▊       | 241/839 [02:59<07:08,  1.40it/s][A
Iteration:  29%|██▉       | 242/839 [02:59<07:07,  1.40it/s][A
Iteration:  29%|██▉       | 243/839 [03:00<07:06,  1.40it/s][A
Iteration:  29%|██▉       | 244/839 [03:01<07:07,  1.39it/s][A
Iteration:  29%|██▉       | 245/839 [03:02<07:05,  1.39it/s][A
Iteration:  29%|██▉       | 246/839 [03:02<07:05,  1.40it/s][A
Iteration:  29%|██▉       | 247/839 [03:03<07:07,  1.38it/s][A
Iteration:  30%|██▉       | 248/839 [03:04<07:05,  1.39it/s][A
Iteration:  30%|██▉       | 249/839 [03:

Iteration:  43%|████▎     | 357/839 [04:23<05:47,  1.39it/s][A
Iteration:  43%|████▎     | 358/839 [04:23<05:47,  1.38it/s][A
Iteration:  43%|████▎     | 359/839 [04:24<05:46,  1.39it/s][A
Iteration:  43%|████▎     | 360/839 [04:25<05:46,  1.38it/s][A
Iteration:  43%|████▎     | 361/839 [04:26<05:47,  1.38it/s][A
Iteration:  43%|████▎     | 362/839 [04:26<05:45,  1.38it/s][A
Iteration:  43%|████▎     | 363/839 [04:27<05:41,  1.39it/s][A
Iteration:  43%|████▎     | 364/839 [04:28<05:40,  1.40it/s][A
Iteration:  44%|████▎     | 365/839 [04:28<05:38,  1.40it/s][A
Iteration:  44%|████▎     | 366/839 [04:29<05:38,  1.40it/s][A
Iteration:  44%|████▎     | 367/839 [04:30<05:40,  1.39it/s][A
Iteration:  44%|████▍     | 368/839 [04:31<05:44,  1.37it/s][A
Iteration:  44%|████▍     | 369/839 [04:31<05:44,  1.37it/s][A
Iteration:  44%|████▍     | 370/839 [04:32<05:45,  1.36it/s][A
Iteration:  44%|████▍     | 371/839 [04:33<05:43,  1.36it/s][A
Iteration:  44%|████▍     | 372/839 [04:

Iteration:  57%|█████▋    | 482/839 [05:53<04:18,  1.38it/s][A
Iteration:  58%|█████▊    | 483/839 [05:54<04:15,  1.39it/s][A
Iteration:  58%|█████▊    | 484/839 [05:55<04:13,  1.40it/s][A
Iteration:  58%|█████▊    | 485/839 [05:55<04:15,  1.38it/s][A
Iteration:  58%|█████▊    | 486/839 [05:56<04:14,  1.38it/s][A
Iteration:  58%|█████▊    | 487/839 [05:57<04:13,  1.39it/s][A
Iteration:  58%|█████▊    | 488/839 [05:58<04:12,  1.39it/s][A
Iteration:  58%|█████▊    | 489/839 [05:58<04:12,  1.39it/s][A
Iteration:  58%|█████▊    | 490/839 [05:59<04:10,  1.39it/s][A
Iteration:  59%|█████▊    | 491/839 [06:00<04:09,  1.40it/s][A
Iteration:  59%|█████▊    | 492/839 [06:00<04:08,  1.40it/s][A
Iteration:  59%|█████▉    | 493/839 [06:01<04:07,  1.40it/s][A
Iteration:  59%|█████▉    | 494/839 [06:02<04:06,  1.40it/s][A
Iteration:  59%|█████▉    | 495/839 [06:03<04:06,  1.39it/s][A
Iteration:  59%|█████▉    | 496/839 [06:03<04:07,  1.38it/s][A
Iteration:  59%|█████▉    | 497/839 [06:

Iteration:  72%|███████▏  | 605/839 [07:22<02:53,  1.35it/s][A
Iteration:  72%|███████▏  | 606/839 [07:23<02:51,  1.36it/s][A
Iteration:  72%|███████▏  | 607/839 [07:24<02:52,  1.34it/s][A
Iteration:  72%|███████▏  | 608/839 [07:24<02:49,  1.36it/s][A
Iteration:  73%|███████▎  | 609/839 [07:25<02:51,  1.34it/s][A
Iteration:  73%|███████▎  | 610/839 [07:26<02:48,  1.36it/s][A
Iteration:  73%|███████▎  | 611/839 [07:27<02:48,  1.35it/s][A
Iteration:  73%|███████▎  | 612/839 [07:27<02:46,  1.36it/s][A
Iteration:  73%|███████▎  | 613/839 [07:28<02:44,  1.38it/s][A
Iteration:  73%|███████▎  | 614/839 [07:29<02:42,  1.38it/s][A
Iteration:  73%|███████▎  | 615/839 [07:30<02:40,  1.39it/s][A
Iteration:  73%|███████▎  | 616/839 [07:30<02:40,  1.39it/s][A
Iteration:  74%|███████▎  | 617/839 [07:31<02:39,  1.39it/s][A
Iteration:  74%|███████▎  | 618/839 [07:32<02:42,  1.36it/s][A
Iteration:  74%|███████▍  | 619/839 [07:32<02:41,  1.37it/s][A
Iteration:  74%|███████▍  | 620/839 [07:

Iteration:  87%|████████▋ | 730/839 [08:53<01:18,  1.39it/s][A
Iteration:  87%|████████▋ | 731/839 [08:53<01:17,  1.39it/s][A
Iteration:  87%|████████▋ | 732/839 [08:54<01:16,  1.39it/s][A
Iteration:  87%|████████▋ | 733/839 [08:55<01:16,  1.39it/s][A
Iteration:  87%|████████▋ | 734/839 [08:56<01:15,  1.39it/s][A
Iteration:  88%|████████▊ | 735/839 [08:56<01:14,  1.39it/s][A
Iteration:  88%|████████▊ | 736/839 [08:57<01:14,  1.39it/s][A
Iteration:  88%|████████▊ | 737/839 [08:58<01:14,  1.38it/s][A
Iteration:  88%|████████▊ | 738/839 [08:58<01:13,  1.38it/s][A
Iteration:  88%|████████▊ | 739/839 [08:59<01:12,  1.37it/s][A
Iteration:  88%|████████▊ | 740/839 [09:00<01:11,  1.38it/s][A
Iteration:  88%|████████▊ | 741/839 [09:01<01:11,  1.37it/s][A
Iteration:  88%|████████▊ | 742/839 [09:01<01:10,  1.38it/s][A
Iteration:  89%|████████▊ | 743/839 [09:02<01:09,  1.38it/s][A
Iteration:  89%|████████▊ | 744/839 [09:03<01:08,  1.39it/s][A
Iteration:  89%|████████▉ | 745/839 [09:

Iteration:   2%|▏         | 13/839 [00:09<09:54,  1.39it/s][A
Iteration:   2%|▏         | 14/839 [00:10<09:53,  1.39it/s][A
Iteration:   2%|▏         | 15/839 [00:10<09:51,  1.39it/s][A
Iteration:   2%|▏         | 16/839 [00:11<09:49,  1.40it/s][A
Iteration:   2%|▏         | 17/839 [00:12<09:49,  1.39it/s][A
Iteration:   2%|▏         | 18/839 [00:13<09:47,  1.40it/s][A
Iteration:   2%|▏         | 19/839 [00:13<09:46,  1.40it/s][A
Iteration:   2%|▏         | 20/839 [00:14<09:44,  1.40it/s][A
Iteration:   3%|▎         | 21/839 [00:15<09:42,  1.40it/s][A
Iteration:   3%|▎         | 22/839 [00:15<09:41,  1.40it/s][A
Iteration:   3%|▎         | 23/839 [00:16<09:41,  1.40it/s][A
Iteration:   3%|▎         | 24/839 [00:17<09:43,  1.40it/s][A
Iteration:   3%|▎         | 25/839 [00:18<09:40,  1.40it/s][A
Iteration:   3%|▎         | 26/839 [00:18<09:40,  1.40it/s][A
Iteration:   3%|▎         | 27/839 [00:19<09:41,  1.40it/s][A
Iteration:   3%|▎         | 28/839 [00:20<09:42,  1.39i

Iteration:  17%|█▋        | 139/839 [01:41<08:30,  1.37it/s][A
Iteration:  17%|█▋        | 140/839 [01:41<08:30,  1.37it/s][A
Iteration:  17%|█▋        | 141/839 [01:42<08:30,  1.37it/s][A
Iteration:  17%|█▋        | 142/839 [01:43<08:27,  1.37it/s][A
Iteration:  17%|█▋        | 143/839 [01:43<08:25,  1.38it/s][A
Iteration:  17%|█▋        | 144/839 [01:44<08:25,  1.38it/s][A
Iteration:  17%|█▋        | 145/839 [01:45<08:23,  1.38it/s][A
Iteration:  17%|█▋        | 146/839 [01:46<08:20,  1.38it/s][A
Iteration:  18%|█▊        | 147/839 [01:46<08:22,  1.38it/s][A
Iteration:  18%|█▊        | 148/839 [01:47<08:19,  1.38it/s][A
Iteration:  18%|█▊        | 149/839 [01:48<08:16,  1.39it/s][A
Iteration:  18%|█▊        | 150/839 [01:48<08:14,  1.39it/s][A
Iteration:  18%|█▊        | 151/839 [01:49<08:13,  1.39it/s][A
Iteration:  18%|█▊        | 152/839 [01:50<08:15,  1.39it/s][A
Iteration:  18%|█▊        | 153/839 [01:51<08:15,  1.38it/s][A
Iteration:  18%|█▊        | 154/839 [01:

save/
save/checkpoint-1000


01/07/2020 14:31:17 - INFO - src.transformers.modeling_utils -   Model weights saved in save/checkpoint-1000/pytorch_model.bin
01/07/2020 14:31:17 - INFO - __main__ -   Saving model checkpoint to save/checkpoint-1000

Iteration:  19%|█▉        | 161/839 [01:58<14:35,  1.29s/it][A
Iteration:  19%|█▉        | 162/839 [01:59<12:36,  1.12s/it][A
Iteration:  19%|█▉        | 163/839 [02:00<11:18,  1.00s/it][A
Iteration:  20%|█▉        | 164/839 [02:01<10:17,  1.09it/s][A
Iteration:  20%|█▉        | 165/839 [02:01<09:35,  1.17it/s][A
Iteration:  20%|█▉        | 166/839 [02:02<09:07,  1.23it/s][A
Iteration:  20%|█▉        | 167/839 [02:03<08:46,  1.28it/s][A
Iteration:  20%|██        | 168/839 [02:03<08:32,  1.31it/s][A
Iteration:  20%|██        | 169/839 [02:04<08:22,  1.33it/s][A
Iteration:  20%|██        | 170/839 [02:05<08:15,  1.35it/s][A
Iteration:  20%|██        | 171/839 [02:06<08:10,  1.36it/s][A
Iteration:  21%|██        | 172/839 [02:06<08:10,  1.36it/s][A
Iteration:  21

Iteration:  34%|███▎      | 282/839 [03:27<06:48,  1.36it/s][A
Iteration:  34%|███▎      | 283/839 [03:27<06:45,  1.37it/s][A
Iteration:  34%|███▍      | 284/839 [03:28<06:42,  1.38it/s][A
Iteration:  34%|███▍      | 285/839 [03:29<06:41,  1.38it/s][A
Iteration:  34%|███▍      | 286/839 [03:29<06:38,  1.39it/s][A
Iteration:  34%|███▍      | 287/839 [03:30<06:35,  1.39it/s][A
Iteration:  34%|███▍      | 288/839 [03:31<06:34,  1.40it/s][A
Iteration:  34%|███▍      | 289/839 [03:32<06:33,  1.40it/s][A
Iteration:  35%|███▍      | 290/839 [03:32<06:42,  1.36it/s][A
Iteration:  35%|███▍      | 291/839 [03:33<06:40,  1.37it/s][A
Iteration:  35%|███▍      | 292/839 [03:34<06:41,  1.36it/s][A
Iteration:  35%|███▍      | 293/839 [03:35<06:41,  1.36it/s][A
Iteration:  35%|███▌      | 294/839 [03:35<06:37,  1.37it/s][A
Iteration:  35%|███▌      | 295/839 [03:36<06:38,  1.36it/s][A
Iteration:  35%|███▌      | 296/839 [03:37<06:34,  1.38it/s][A
Iteration:  35%|███▌      | 297/839 [03:

Iteration:  49%|████▊     | 407/839 [04:57<05:18,  1.36it/s][A
Iteration:  49%|████▊     | 408/839 [04:58<05:17,  1.36it/s][A
Iteration:  49%|████▊     | 409/839 [04:59<05:13,  1.37it/s][A
Iteration:  49%|████▉     | 410/839 [04:59<05:10,  1.38it/s][A01/07/2020 14:34:19 - INFO - __main__ -   Average loss: 1.4465142679214478 at global step: 1250

Iteration:  49%|████▉     | 411/839 [05:00<05:08,  1.39it/s][A
Iteration:  49%|████▉     | 412/839 [05:01<05:06,  1.39it/s][A
Iteration:  49%|████▉     | 413/839 [05:02<05:06,  1.39it/s][A
Iteration:  49%|████▉     | 414/839 [05:02<05:06,  1.39it/s][A
Iteration:  49%|████▉     | 415/839 [05:03<05:04,  1.39it/s][A
Iteration:  50%|████▉     | 416/839 [05:04<05:02,  1.40it/s][A
Iteration:  50%|████▉     | 417/839 [05:04<05:03,  1.39it/s][A
Iteration:  50%|████▉     | 418/839 [05:05<05:02,  1.39it/s][A
Iteration:  50%|████▉     | 419/839 [05:06<04:59,  1.40it/s][A
Iteration:  50%|█████     | 420/839 [05:07<04:57,  1.41it/s][A
Iteratio

Iteration:  63%|██████▎   | 530/839 [06:27<03:43,  1.39it/s][A
Iteration:  63%|██████▎   | 531/839 [06:27<03:40,  1.40it/s][A
Iteration:  63%|██████▎   | 532/839 [06:28<03:39,  1.40it/s][A
Iteration:  64%|██████▎   | 533/839 [06:29<03:39,  1.39it/s][A
Iteration:  64%|██████▎   | 534/839 [06:29<03:39,  1.39it/s][A
Iteration:  64%|██████▍   | 535/839 [06:30<03:37,  1.40it/s][A
Iteration:  64%|██████▍   | 536/839 [06:31<03:37,  1.39it/s][A
Iteration:  64%|██████▍   | 537/839 [06:32<03:37,  1.39it/s][A
Iteration:  64%|██████▍   | 538/839 [06:32<03:37,  1.39it/s][A
Iteration:  64%|██████▍   | 539/839 [06:33<03:36,  1.39it/s][A
Iteration:  64%|██████▍   | 540/839 [06:34<03:35,  1.39it/s][A
Iteration:  64%|██████▍   | 541/839 [06:34<03:34,  1.39it/s][A
Iteration:  65%|██████▍   | 542/839 [06:35<03:35,  1.38it/s][A
Iteration:  65%|██████▍   | 543/839 [06:36<03:36,  1.37it/s][A
Iteration:  65%|██████▍   | 544/839 [06:37<03:37,  1.36it/s][A
Iteration:  65%|██████▍   | 545/839 [06:

Iteration:  78%|███████▊  | 655/839 [07:58<02:13,  1.38it/s][A
Iteration:  78%|███████▊  | 656/839 [07:58<02:13,  1.38it/s][A
Iteration:  78%|███████▊  | 657/839 [07:59<02:12,  1.38it/s][A
Iteration:  78%|███████▊  | 658/839 [08:00<02:11,  1.38it/s][A
Iteration:  79%|███████▊  | 659/839 [08:01<02:10,  1.38it/s][A
Iteration:  79%|███████▊  | 660/839 [08:01<02:11,  1.36it/s][A01/07/2020 14:37:21 - INFO - __main__ -   Average loss: 1.3854327201843262 at global step: 1500

Iteration:  79%|███████▉  | 661/839 [08:02<02:11,  1.35it/s][A
Iteration:  79%|███████▉  | 662/839 [08:03<02:09,  1.36it/s][A
Iteration:  79%|███████▉  | 663/839 [08:04<02:07,  1.38it/s][A
Iteration:  79%|███████▉  | 664/839 [08:04<02:06,  1.38it/s][A
Iteration:  79%|███████▉  | 665/839 [08:05<02:05,  1.38it/s][A
Iteration:  79%|███████▉  | 666/839 [08:06<02:04,  1.38it/s][A
Iteration:  79%|███████▉  | 667/839 [08:06<02:04,  1.38it/s][A
Iteration:  80%|███████▉  | 668/839 [08:07<02:05,  1.37it/s][A
Iteratio

Iteration:  93%|█████████▎| 778/839 [09:27<00:44,  1.37it/s][A
Iteration:  93%|█████████▎| 779/839 [09:28<00:43,  1.37it/s][A
Iteration:  93%|█████████▎| 780/839 [09:29<00:43,  1.37it/s][A
Iteration:  93%|█████████▎| 781/839 [09:29<00:42,  1.38it/s][A
Iteration:  93%|█████████▎| 782/839 [09:30<00:41,  1.38it/s][A
Iteration:  93%|█████████▎| 783/839 [09:31<00:41,  1.36it/s][A
Iteration:  93%|█████████▎| 784/839 [09:32<00:40,  1.37it/s][A
Iteration:  94%|█████████▎| 785/839 [09:32<00:39,  1.36it/s][A
Iteration:  94%|█████████▎| 786/839 [09:33<00:38,  1.37it/s][A
Iteration:  94%|█████████▍| 787/839 [09:34<00:37,  1.38it/s][A
Iteration:  94%|█████████▍| 788/839 [09:35<00:37,  1.38it/s][A
Iteration:  94%|█████████▍| 789/839 [09:35<00:36,  1.38it/s][A
Iteration:  94%|█████████▍| 790/839 [09:36<00:35,  1.39it/s][A
Iteration:  94%|█████████▍| 791/839 [09:37<00:34,  1.39it/s][A
Iteration:  94%|█████████▍| 792/839 [09:37<00:33,  1.39it/s][A
Iteration:  95%|█████████▍| 793/839 [09:

Iteration:   8%|▊         | 63/839 [00:45<09:18,  1.39it/s][A
Iteration:   8%|▊         | 64/839 [00:46<09:19,  1.38it/s][A
Iteration:   8%|▊         | 65/839 [00:47<09:21,  1.38it/s][A
Iteration:   8%|▊         | 66/839 [00:47<09:26,  1.36it/s][A
Iteration:   8%|▊         | 67/839 [00:48<09:22,  1.37it/s][A
Iteration:   8%|▊         | 68/839 [00:49<09:17,  1.38it/s][A
Iteration:   8%|▊         | 69/839 [00:49<09:14,  1.39it/s][A
Iteration:   8%|▊         | 70/839 [00:50<09:12,  1.39it/s][A
Iteration:   8%|▊         | 71/839 [00:51<09:11,  1.39it/s][A01/07/2020 14:40:22 - INFO - __main__ -   Average loss: 1.4366578674316406 at global step: 1750

Iteration:   9%|▊         | 72/839 [00:52<09:10,  1.39it/s][A
Iteration:   9%|▊         | 73/839 [00:52<09:11,  1.39it/s][A
Iteration:   9%|▉         | 74/839 [00:53<09:08,  1.40it/s][A
Iteration:   9%|▉         | 75/839 [00:54<09:07,  1.39it/s][A
Iteration:   9%|▉         | 76/839 [00:55<09:07,  1.39it/s][A
Iteration:   9%|▉     

Iteration:  22%|██▏       | 187/839 [02:16<07:51,  1.38it/s][A
Iteration:  22%|██▏       | 188/839 [02:16<07:53,  1.37it/s][A
Iteration:  23%|██▎       | 189/839 [02:17<07:52,  1.37it/s][A
Iteration:  23%|██▎       | 190/839 [02:18<07:58,  1.36it/s][A
Iteration:  23%|██▎       | 191/839 [02:19<07:54,  1.36it/s][A
Iteration:  23%|██▎       | 192/839 [02:19<07:52,  1.37it/s][A
Iteration:  23%|██▎       | 193/839 [02:20<07:51,  1.37it/s][A
Iteration:  23%|██▎       | 194/839 [02:21<07:47,  1.38it/s][A
Iteration:  23%|██▎       | 195/839 [02:21<07:45,  1.38it/s][A
Iteration:  23%|██▎       | 196/839 [02:22<07:42,  1.39it/s][A
Iteration:  23%|██▎       | 197/839 [02:23<07:44,  1.38it/s][A
Iteration:  24%|██▎       | 198/839 [02:24<07:42,  1.38it/s][A
Iteration:  24%|██▎       | 199/839 [02:24<07:49,  1.36it/s][A
Iteration:  24%|██▍       | 200/839 [02:25<07:46,  1.37it/s][A
Iteration:  24%|██▍       | 201/839 [02:26<07:42,  1.38it/s][A
Iteration:  24%|██▍       | 202/839 [02:

Iteration:  37%|███▋      | 312/839 [03:47<06:21,  1.38it/s][A
Iteration:  37%|███▋      | 313/839 [03:47<06:21,  1.38it/s][A
Iteration:  37%|███▋      | 314/839 [03:48<06:21,  1.38it/s][A
Iteration:  38%|███▊      | 315/839 [03:49<06:20,  1.38it/s][A
Iteration:  38%|███▊      | 316/839 [03:49<06:17,  1.38it/s][A
Iteration:  38%|███▊      | 317/839 [03:50<06:18,  1.38it/s][A
Iteration:  38%|███▊      | 318/839 [03:51<06:17,  1.38it/s][A
Iteration:  38%|███▊      | 319/839 [03:52<06:18,  1.37it/s][A
Iteration:  38%|███▊      | 320/839 [03:52<06:20,  1.36it/s][A
Iteration:  38%|███▊      | 321/839 [03:53<06:18,  1.37it/s][A01/07/2020 14:43:24 - INFO - __main__ -   Average loss: 1.392681860923767 at global step: 2000
01/07/2020 14:43:24 - INFO - src.transformers.configuration_utils -   Configuration saved in save/checkpoint-2000/config.json


save/
save/checkpoint-2000


01/07/2020 14:43:27 - INFO - src.transformers.modeling_utils -   Model weights saved in save/checkpoint-2000/pytorch_model.bin
01/07/2020 14:43:27 - INFO - __main__ -   Saving model checkpoint to save/checkpoint-2000

Iteration:  38%|███▊      | 322/839 [03:56<11:50,  1.37s/it][A
Iteration:  38%|███▊      | 323/839 [03:57<10:06,  1.18s/it][A
Iteration:  39%|███▊      | 324/839 [03:57<08:54,  1.04s/it][A
Iteration:  39%|███▊      | 325/839 [03:58<08:03,  1.06it/s][A
Iteration:  39%|███▉      | 326/839 [03:59<07:35,  1.13it/s][A
Iteration:  39%|███▉      | 327/839 [04:00<07:09,  1.19it/s][A
Iteration:  39%|███▉      | 328/839 [04:00<06:48,  1.25it/s][A
Iteration:  39%|███▉      | 329/839 [04:01<06:33,  1.29it/s][A
Iteration:  39%|███▉      | 330/839 [04:02<06:24,  1.32it/s][A
Iteration:  39%|███▉      | 331/839 [04:02<06:17,  1.35it/s][A
Iteration:  40%|███▉      | 332/839 [04:03<06:13,  1.36it/s][A
Iteration:  40%|███▉      | 333/839 [04:04<06:09,  1.37it/s][A
Iteration:  40

Iteration:  53%|█████▎    | 443/839 [05:24<04:47,  1.38it/s][A
Iteration:  53%|█████▎    | 444/839 [05:25<04:45,  1.38it/s][A
Iteration:  53%|█████▎    | 445/839 [05:25<04:51,  1.35it/s][A
Iteration:  53%|█████▎    | 446/839 [05:26<04:49,  1.36it/s][A
Iteration:  53%|█████▎    | 447/839 [05:27<04:51,  1.34it/s][A
Iteration:  53%|█████▎    | 448/839 [05:28<04:48,  1.35it/s][A
Iteration:  54%|█████▎    | 449/839 [05:28<04:48,  1.35it/s][A
Iteration:  54%|█████▎    | 450/839 [05:29<04:44,  1.37it/s][A
Iteration:  54%|█████▍    | 451/839 [05:30<04:42,  1.37it/s][A
Iteration:  54%|█████▍    | 452/839 [05:31<04:41,  1.38it/s][A
Iteration:  54%|█████▍    | 453/839 [05:31<04:39,  1.38it/s][A
Iteration:  54%|█████▍    | 454/839 [05:32<04:37,  1.39it/s][A
Iteration:  54%|█████▍    | 455/839 [05:33<04:35,  1.39it/s][A
Iteration:  54%|█████▍    | 456/839 [05:33<04:36,  1.39it/s][A
Iteration:  54%|█████▍    | 457/839 [05:34<04:36,  1.38it/s][A
Iteration:  55%|█████▍    | 458/839 [05:

Iteration:  68%|██████▊   | 568/839 [06:55<03:20,  1.35it/s][A
Iteration:  68%|██████▊   | 569/839 [06:56<03:18,  1.36it/s][A
Iteration:  68%|██████▊   | 570/839 [06:56<03:16,  1.37it/s][A
Iteration:  68%|██████▊   | 571/839 [06:57<03:14,  1.38it/s][A01/07/2020 14:46:29 - INFO - __main__ -   Average loss: 1.4209796690940857 at global step: 2250

Iteration:  68%|██████▊   | 572/839 [06:58<03:15,  1.37it/s][A
Iteration:  68%|██████▊   | 573/839 [06:59<03:13,  1.38it/s][A
Iteration:  68%|██████▊   | 574/839 [06:59<03:10,  1.39it/s][A
Iteration:  69%|██████▊   | 575/839 [07:00<03:09,  1.39it/s][A
Iteration:  69%|██████▊   | 576/839 [07:01<03:08,  1.40it/s][A
Iteration:  69%|██████▉   | 577/839 [07:01<03:10,  1.38it/s][A
Iteration:  69%|██████▉   | 578/839 [07:02<03:10,  1.37it/s][A
Iteration:  69%|██████▉   | 579/839 [07:03<03:09,  1.37it/s][A
Iteration:  69%|██████▉   | 580/839 [07:04<03:08,  1.37it/s][A
Iteration:  69%|██████▉   | 581/839 [07:04<03:09,  1.36it/s][A
Iteratio

Iteration:  82%|████████▏ | 691/839 [08:24<01:46,  1.39it/s][A
Iteration:  82%|████████▏ | 692/839 [08:25<01:46,  1.39it/s][A
Iteration:  83%|████████▎ | 693/839 [08:26<01:46,  1.37it/s][A
Iteration:  83%|████████▎ | 694/839 [08:26<01:45,  1.37it/s][A
Iteration:  83%|████████▎ | 695/839 [08:27<01:44,  1.38it/s][A
Iteration:  83%|████████▎ | 696/839 [08:28<01:43,  1.39it/s][A
Iteration:  83%|████████▎ | 697/839 [08:29<01:42,  1.38it/s][A
Iteration:  83%|████████▎ | 698/839 [08:29<01:42,  1.38it/s][A
Iteration:  83%|████████▎ | 699/839 [08:30<01:41,  1.37it/s][A
Iteration:  83%|████████▎ | 700/839 [08:31<01:41,  1.37it/s][A
Iteration:  84%|████████▎ | 701/839 [08:31<01:41,  1.36it/s][A
Iteration:  84%|████████▎ | 702/839 [08:32<01:42,  1.34it/s][A
Iteration:  84%|████████▍ | 703/839 [08:33<01:41,  1.34it/s][A
Iteration:  84%|████████▍ | 704/839 [08:34<01:40,  1.34it/s][A
Iteration:  84%|████████▍ | 705/839 [08:34<01:39,  1.35it/s][A
Iteration:  84%|████████▍ | 706/839 [08:

Iteration:  97%|█████████▋| 816/839 [09:55<00:17,  1.33it/s][A
Iteration:  97%|█████████▋| 817/839 [09:56<00:16,  1.34it/s][A
Iteration:  97%|█████████▋| 818/839 [09:57<00:15,  1.34it/s][A
Iteration:  98%|█████████▊| 819/839 [09:57<00:15,  1.33it/s][A
Iteration:  98%|█████████▊| 820/839 [09:58<00:14,  1.35it/s][A
Iteration:  98%|█████████▊| 821/839 [09:59<00:13,  1.37it/s][A01/07/2020 14:49:30 - INFO - __main__ -   Average loss: 1.4241965866088868 at global step: 2500

Iteration:  98%|█████████▊| 822/839 [10:00<00:12,  1.38it/s][A
Iteration:  98%|█████████▊| 823/839 [10:00<00:11,  1.39it/s][A
Iteration:  98%|█████████▊| 824/839 [10:01<00:10,  1.39it/s][A
Iteration:  98%|█████████▊| 825/839 [10:02<00:10,  1.39it/s][A
Iteration:  98%|█████████▊| 826/839 [10:02<00:09,  1.39it/s][A
Iteration:  99%|█████████▊| 827/839 [10:03<00:08,  1.39it/s][A
Iteration:  99%|█████████▊| 828/839 [10:04<00:07,  1.39it/s][A
Iteration:  99%|█████████▉| 829/839 [10:05<00:07,  1.40it/s][A
Iteratio

In [88]:
results = {}
if do_eval and local_rank in [-1, 0]:
    if not do_train:
        output_dir = model_name_or_path
    checkpoints = [output_dir]
    if eval_all_checkpoints:
        checkpoints = list(
            os.path.dirname(c) for c in sorted(glob.glob(output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
        )
        logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
    logger.info("Evaluate the following checkpoints: %s", checkpoints)
    for checkpoint in checkpoints:
        global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
        prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

        model = model_class.from_pretrained(checkpoint)
        model.to(device)
        result = evaluate(args, model, tokenizer, prefix=prefix)
        result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
        results.update(result)

01/07/2020 14:50:53 - INFO - __main__ -   Evaluate the following checkpoints: ['save/checkpoint-1000', 'save/checkpoint-2000']
01/07/2020 14:50:53 - INFO - src.transformers.configuration_utils -   loading configuration file save/checkpoint-1000/config.json
01/07/2020 14:50:53 - INFO - src.transformers.configuration_utils -   Model config {
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 0,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "do_sample": false,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_ids": 0,
  "ff_activation": "gelu",
  "finetuning_task": "arc",
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "mem_len": null,
  "n_head": 12,
  "n_layer": 12,
  "num_beams": 1,
  "num_labels": 4,
  "num_return_sequences": 1,
  "output_attentions": false,
 

AttributeError: 'NoneType' object has no attribute 'kg_embeddings'

In [89]:
checkpoints = [output_dir]

In [91]:
checkpoints = list(
            os.path.dirname(c) for c in sorted(glob.glob(output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
        )

In [92]:
checkpoints

['save/checkpoint-1000', 'save/checkpoint-2000']

In [93]:
checkpoint = checkpoints[0]

In [95]:
global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

In [97]:
model = model_class.from_pretrained(checkpoint)

01/07/2020 15:26:34 - INFO - src.transformers.configuration_utils -   loading configuration file save/checkpoint-1000/config.json
01/07/2020 15:26:34 - INFO - src.transformers.configuration_utils -   Model config {
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 0,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "do_sample": false,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_ids": 0,
  "ff_activation": "gelu",
  "finetuning_task": "arc",
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "mem_len": null,
  "n_head": 12,
  "n_layer": 12,
  "num_beams": 1,
  "num_labels": 4,
  "num_return_sequences": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 0,
  "pruned_heads": {},
  "repetition_penalty": 1.0,


AttributeError: 'NoneType' object has no attribute 'kg_embeddings'

In [98]:
model_class

src.transformers.modeling_with_arc_xlnet.XLNetForMultipleChoice