In [1]:
"""
Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
using a masked language modeling (MLM) loss.
"""

from __future__ import absolute_import, division, print_function

import argparse
import glob
import logging
import os
import pickle
import random
import re
import shutil

import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
from torch.utils.data.distributed import DistributedSampler

try:
    from torch.utils.tensorboard import SummaryWriter
except:
    from tensorboardX import SummaryWriter

from tqdm import tqdm, trange

from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                                  BertConfig, BertForMaskedLM, BertTokenizer,
                                  GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
                                  OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
                                  RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
                                  DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)

In [2]:
# Module-level logger used by every helper below.
logger = logging.getLogger(__name__)


# Registry keyed by the `model_type` setting. Each value is a
# (config class, language-model class, tokenizer class) triple that is unpacked
# when the pretrained model is loaded.
MODEL_CLASSES = {
    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
    'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
}

In [3]:
class TextDataset(Dataset):
    """Genre-prefixed, fixed-length token blocks built from a pickled DataFrame.

    The pickle at ``file_path`` is read with ``pandas.read_pickle`` and must expose
    ``script`` and ``genres`` columns. Each script is tokenized and cut into
    consecutive, non-overlapping chunks; every chunk is prefixed with the token ids
    of its genre string (commas replaced by spaces, "Sci-Fi" spelled out, "~"
    appended) so that each example holds exactly ``block_size`` ids. A trailing
    remainder shorter than a full chunk is dropped.

    The encoded examples are cached next to ``file_path`` and reused on later runs.

    Args:
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        args: run configuration dict; ``args['model_name_or_path']`` is part of the
            cache file name and an optional ``args['overwrite_cache']`` forces the
            cache to be rebuilt.
        file_path: path of the pickled DataFrame to encode.
        block_size: total number of token ids per example (genre prefix included).

    Raises:
        AssertionError: if ``file_path`` is not an existing file.
        ValueError: if a genre prefix leaves no room for script tokens in a block.
    """

    def __init__(self, tokenizer, args, file_path='train', block_size=512):
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(directory, args['model_name_or_path'] + '_cached_lm_' + str(block_size) + '_' + filename)

        # Caches written before the file_path fix for non-training files contain
        # training data; delete them or set args['overwrite_cache'] to rebuild.
        use_cache = os.path.exists(cached_features_file) and not args.get('overwrite_cache', False)

        if use_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            # NOTE: pickle.load executes arbitrary code; only load cache files this
            # notebook produced itself.
            with open(cached_features_file, 'rb') as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", file_path)

            self.examples = []

            # Read the file that was actually requested (train OR eval), not a
            # hard-coded training path.
            df = pd.read_pickle(file_path)

            # Tokenize every distinct genre string once; it is reused as a prefix.
            genre_tok = dict()
            for genre in df.genres.unique():
                g = genre.replace(",", " ").replace("Sci-Fi", "Science Fiction") + "~"
                genre_tok[genre] = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(g))

            for script, genre in zip(df.script, df.genres):
                tokenized_genre = genre_tok[genre]
                tokenized_script = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(script))

                # Room left for script tokens once the genre prefix is accounted for.
                temp_block_size = block_size - len(tokenized_genre)
                if temp_block_size <= 0:
                    raise ValueError(
                        "block_size ({}) is too small for the {}-token prefix of genre {!r}".format(
                            block_size, len(tokenized_genre), genre))

                # Only full chunks are kept, so every example has length block_size.
                for i in range(0, len(tokenized_script)-temp_block_size+1, temp_block_size):
                    self.examples.append(tokenized_genre+tokenized_script[i:i+temp_block_size])

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, 'wb') as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.examples)

    def __getitem__(self, item):
        """Return example ``item`` as a tensor of token ids."""
        return torch.tensor(self.examples[item])

In [4]:
def load_and_cache_examples(args, tokenizer, evaluate=False):
    """Build the TextDataset for the evaluation file (``evaluate=True``) or the training file.

    The path comes from ``args['eval_data_file']`` or ``args['train_data_file']`` and
    the block length from ``args['block_size']``.
    """
    if evaluate:
        source_path = args['eval_data_file']
    else:
        source_path = args['train_data_file']
    return TextDataset(tokenizer, args, file_path=source_path, block_size=args['block_size'])


def set_seed(args):
    """Seed the Python, NumPy and PyTorch random generators with ``args['seed']``.

    The CUDA generators are seeded as well when ``args['n_gpu']`` is positive.
    """
    seed = args['seed']
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if args['n_gpu'] > 0:
        torch.cuda.manual_seed_all(seed)


def _rotate_checkpoints(args, checkpoint_prefix, use_mtime=False):
    if not args['save_total_limit']:
        return
    if args['save_total_limit'] <= 0:
        return

    # Check if we should delete older checkpoint(s)
    glob_checkpoints = glob.glob(os.path.join(args['output_dir'], '{}-*'.format(checkpoint_prefix)))
    if len(glob_checkpoints) <= args['save_total_limit']:
        return

    ordering_and_checkpoint_path = []
    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            regex_match = re.match('.*{}-([0-9]+)'.format(checkpoint_prefix), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args['save_total_limit'])
    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
    for checkpoint in checkpoints_to_be_deleted:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
        shutil.rmtree(checkpoint)


def mask_tokens(inputs, tokenizer, args):
    """Build masked-LM inputs and labels: 80% mask token, 10% random token, 10% unchanged.

    ``inputs`` is modified in place and returned together with ``labels``, a copy of
    the original ids in which every position not selected for prediction is -1.
    Positions flagged by ``tokenizer.get_special_tokens_mask`` are never selected;
    all others are selected with probability ``args['mlm_probability']``.
    """
    labels = inputs.clone()
    shape = labels.shape

    # Per-position selection probability, forced to zero on special tokens.
    select_prob = torch.full(shape, args['mlm_probability'])
    special_rows = [
        tokenizer.get_special_tokens_mask(row, already_has_special_tokens=True)
        for row in labels.tolist()
    ]
    select_prob.masked_fill_(torch.tensor(special_rows, dtype=torch.bool), value=0.0)
    selected = torch.bernoulli(select_prob).bool()
    # Loss is computed on the selected positions only.
    labels[~selected] = -1

    # 80% of the selected positions receive the tokenizer's mask token.
    use_mask_token = torch.bernoulli(torch.full(shape, 0.8)).bool() & selected
    inputs[use_mask_token] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # Half of the remaining selected positions (10% overall) receive a random id.
    use_random_token = torch.bernoulli(torch.full(shape, 0.5)).bool() & selected & ~use_mask_token
    replacement_ids = torch.randint(len(tokenizer), shape, dtype=torch.long)
    inputs[use_random_token] = replacement_ids[use_random_token]

    # The last 10% of the selected positions keep their original id.
    return inputs, labels

In [5]:
def train(args, train_dataset, model, tokenizer):
    """Fine-tune ``model`` on ``train_dataset`` and return progress statistics.

    Args:
        args: run configuration dict. ``args['train_batch_size']`` is written here, and
            ``args['num_train_epochs']`` is overwritten when ``args['max_steps']`` is
            positive.
        train_dataset: dataset yielding token-id tensors.
        model: language model; called with ``labels`` (or ``masked_lm_labels`` when
            ``args['mlm']`` is set) and expected to return the loss as first output.
        tokenizer: tokenizer forwarded to ``mask_tokens`` and ``evaluate``.

    Returns:
        Tuple ``(global_step, average_loss)`` where ``global_step`` is the number of
        optimizer steps taken and ``average_loss`` is the accumulated loss divided by
        that number (divided by 1 when no optimizer step was taken).

    Raises:
        ImportError: if ``args['fp16']`` is set and apex is not installed.
    """
    if args['local_rank'] in [-1, 0]:
        tb_writer = SummaryWriter()

    args['train_batch_size'] = args['per_gpu_train_batch_size'] * max(1, args['n_gpu'])
    train_sampler = RandomSampler(train_dataset) if args['local_rank'] == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args['train_batch_size'])

    if args['max_steps'] > 0:
        t_total = args['max_steps']
        # At least one optimizer step per epoch is assumed so that a dataloader
        # shorter than the accumulation window cannot cause a division by zero.
        steps_per_epoch = max(1, len(train_dataloader) // args['gradient_accumulation_steps'])
        args['num_train_epochs'] = args['max_steps'] // steps_per_epoch + 1
    else:
        t_total = len(train_dataloader) // args['gradient_accumulation_steps'] * args['num_train_epochs']

    # Prepare optimizer and schedule (linear warmup and decay)
    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], eps=args['adam_epsilon'])
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args['warmup_steps'], num_training_steps=t_total)
    if args['fp16']:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args['fp16_opt_level'])

    # multi-gpu training (should be after apex fp16 initialization)
    if args['n_gpu'] > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args['local_rank'] != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args['local_rank']],
                                                          output_device=args['local_rank'],
                                                          find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args['num_train_epochs'])
    logger.info("  Instantaneous batch size per GPU = %d", args['per_gpu_train_batch_size'])
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                   args['train_batch_size'] * args['gradient_accumulation_steps'] * (torch.distributed.get_world_size() if args['local_rank'] != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args['gradient_accumulation_steps'])
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args['num_train_epochs']), desc="Epoch", disable=args['local_rank'] not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args['local_rank'] not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # Causal LM: the batch serves as both inputs and labels.
            inputs, labels = mask_tokens(batch, tokenizer, args) if args['mlm'] else (batch, batch)
            inputs = inputs.to(args['device'])
            labels = labels.to(args['device'])
            model.train()
            outputs = model(inputs, masked_lm_labels=labels) if args['mlm'] else model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args['n_gpu'] > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args['gradient_accumulation_steps'] > 1:
                loss = loss / args['gradient_accumulation_steps']

            if args['fp16']:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            # Step the optimizer only once a full accumulation window has been seen.
            if (step + 1) % args['gradient_accumulation_steps'] == 0:
                if args['fp16']:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args['max_grad_norm'])
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args['max_grad_norm'])
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args['local_rank'] in [-1, 0] and args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0:
                    # Log metrics
                    if args['local_rank'] == -1 and args['evaluate_during_training']:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args['logging_steps'], global_step)
                    logging_loss = tr_loss

                if args['local_rank'] in [-1, 0] and args['save_steps'] > 0 and global_step % args['save_steps'] == 0:
                    checkpoint_prefix = 'checkpoint'
                    # Save model checkpoint
                    output_dir = os.path.join(args['output_dir'], '{}-{}'.format(checkpoint_prefix, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

            if args['max_steps'] > 0 and global_step > args['max_steps']:
                epoch_iterator.close()
                break
        if args['max_steps'] > 0 and global_step > args['max_steps']:
            train_iterator.close()
            break

    if args['local_rank'] in [-1, 0]:
        tb_writer.close()

    # global_step stays 0 when the data ran out before the first accumulation
    # window completed; avoid dividing by zero in that case.
    return global_step, tr_loss / max(1, global_step)

In [6]:
def evaluate(args, model, tokenizer, prefix=""):
    """Compute the perplexity of ``model`` on the evaluation file and write it to disk.

    Args:
        args: run configuration dict (same object that ``train`` receives);
            ``args['eval_batch_size']`` is written here.
        model: language model returning the loss as its first output.
        tokenizer: tokenizer used to build the dataset (and for masking when
            ``args['mlm']`` is set).
        prefix: optional sub-directory of ``args['output_dir']`` that receives
            ``eval_results.txt``; also appended to the log headers.

    Returns:
        Dict with a single key ``"perplexity"`` holding a scalar tensor equal to the
        exponential of the mean per-batch loss.
    """
    eval_output_dir = args['output_dir']

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    # The results file is written below <output_dir>/<prefix>, so that is the
    # directory that has to exist.
    results_dir = os.path.join(eval_output_dir, prefix) if prefix else eval_output_dir
    if not os.path.exists(results_dir) and args['local_rank'] in [-1, 0]:
        os.makedirs(results_dir)

    args['eval_batch_size'] = args['per_gpu_eval_batch_size'] * max(1, args['n_gpu'])
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset) if args['local_rank'] == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args['eval_batch_size'])

    # multi-gpu evaluate; train() may hand over a model that is already wrapped
    if args['n_gpu'] > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args['eval_batch_size'])
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = mask_tokens(batch, tokenizer, args) if args['mlm'] else (batch, batch)
        inputs = inputs.to(args['device'])
        labels = labels.to(args['device'])

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels) if args['mlm'] else model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    # Mean loss over batches, then perplexity = exp(mean loss).
    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {
        "perplexity": perplexity
    }

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result

In [7]:
# NOTE(review): `parser` is not used by any of the cells shown here; the run
# configuration is the plain dict below rather than parsed command-line arguments.
parser = argparse.ArgumentParser()
# Run configuration shared by every cell and helper (always accessed as args['key']).
args = {
    # Data locations, output directory and model selection (model_type is a
    # MODEL_CLASSES key).
    "train_data_file": "../data/train.pkl",
    "output_dir": "tempoutput",
    "eval_data_file": "../data/test.pkl",
    "model_type": "gpt2",
    "model_name_or_path": "gpt2",

    # Objective and tokenization. A block_size <= 0 is replaced by the tokenizer's
    # maximum single-sentence length when the tokenizer is loaded.
    "mlm":False,
    "mlm_probability":0.15,
    "config_name":"",
    "tokenizer_name":"",
    "cache_dir":"",
    "block_size":-1,
    "do_train":True,
    "do_eval":False,
    "evaluate_during_training":False,
    "do_lower_case":False,
    
    # Optimization hyper-parameters. max_steps <= 0 means the number of steps is
    # derived from num_train_epochs.
    # NOTE(review): warmup_steps is negative; presumably this behaves like no
    # warmup, confirm against the scheduler implementation.
    "per_gpu_train_batch_size":1,
    "per_gpu_eval_batch_size":4,
    "gradient_accumulation_steps":1,
    "learning_rate":5e-5,
    "weight_decay":0.0,
    "adam_epsilon":1e-8,
    "max_grad_norm":1,
    "num_train_epochs":1.0,
    "max_steps":-1,
    "warmup_steps":-1,
    
    # Logging / checkpoint cadence in optimizer steps; a falsy save_total_limit
    # disables checkpoint rotation.
    "logging_steps":50,
    "save_steps":50,
    "save_total_limit":None,
    "eval_all_checkpoints":True,
    "no_cuda":False,
    "overwrite_output_dir":True,
    "overwrite_cache":False,
    "seed":42,
    
    # Mixed precision through apex amp, distributed rank (-1 = not distributed) and
    # optional remote-debugger address.
    "fp16":True,
    "fp16_opt_level":"O3",
    "local_rank":-1,
    "server_ip":"",
    "server_port":""
}

In [8]:
# Validate the configuration before any expensive work is started.
if args['model_type'] in ["bert", "roberta", "distilbert"] and not args['mlm']:
    raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
                     "flag (masked language modeling).")
if args['eval_data_file'] is None and args['do_eval']:
    raise ValueError("Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
                     "or remove the --do_eval argument.")

# Refuse to train into a non-empty output directory unless overwriting is allowed.
if os.path.exists(args['output_dir']) and os.listdir(args['output_dir']) and args['do_train'] and not args['overwrite_output_dir']:
    raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args['output_dir']))

# Setup distant debugging if needed
if args['server_ip'] and args['server_port']:
    # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
    import ptvsd
    print("Waiting for debugger attach")
    ptvsd.enable_attach(address=(args['server_ip'], args['server_port']), redirect_output=True)
    ptvsd.wait_for_attach()

# Setup CUDA, GPU & distributed training
# Non-distributed (or CPU-forced) runs use every visible GPU; distributed runs
# bind one GPU per process.
if args['local_rank'] == -1 or args['no_cuda']:
    device = torch.device("cuda" if torch.cuda.is_available() and not args['no_cuda'] else "cpu")
    args['n_gpu'] = torch.cuda.device_count()
else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    torch.cuda.set_device(args['local_rank'])
    device = torch.device("cuda", args['local_rank'])
    torch.distributed.init_process_group(backend='nccl')
    args['n_gpu'] = 1
args['device'] = device

# Setup logging
# INFO level on the main process (rank -1 or 0), WARN on all other ranks.
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO if args['local_rank'] in [-1, 0] else logging.WARN)
logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                args['local_rank'], device, args['n_gpu'], bool(args['local_rank'] != -1), args['fp16'])

# Set seed
set_seed(args)



In [9]:
# Load pretrained model and tokenizer
if args['local_rank'] not in [-1, 0]:
    torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training download model & vocab

# Resolve the (config, model, tokenizer) classes for the configured model type.
config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]
# config_name / tokenizer_name / cache_dir fall back to model_name_or_path / None
# when left empty.
config = config_class.from_pretrained(args['config_name'] if args['config_name'] else args['model_name_or_path'],
                                      cache_dir=args['cache_dir'] if args['cache_dir'] else None)
tokenizer = tokenizer_class.from_pretrained(args['tokenizer_name'] if args['tokenizer_name'] else args['model_name_or_path'],
                                            do_lower_case=args['do_lower_case'],
                                            cache_dir=args['cache_dir'] if args['cache_dir'] else None)
# A non-positive block_size means "use the tokenizer's maximum"; any larger value
# is clamped to that maximum.
if args['block_size'] <= 0:
    args['block_size'] = tokenizer.max_len_single_sentence  # Our input block size will be the max possible for the model
args['block_size'] = min(args['block_size'], tokenizer.max_len_single_sentence)
# A '.ckpt' in the model name is taken to mean a TensorFlow checkpoint.
model = model_class.from_pretrained(args['model_name_or_path'],
                                    from_tf=bool('.ckpt' in args['model_name_or_path']),
                                    config=config,
                                    cache_dir=args['cache_dir'] if args['cache_dir'] else None)
# The trailing semicolon suppresses the model repr in the notebook output.
model.to(args['device']);

12/03/2019 21:47:37 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json from cache at /tmp/xdg-cache/torch/transformers/4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.085d5f6a8e7812ea05ff0e6ed0645ab2e75d80387ad55c1ad9806ee70d272f80
12/03/2019 21:47:37 - INFO - transformers.configuration_utils -   Model config {
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "finetuning_task": null,
  "initializer_range": 0.02,
  "is_decoder": false,
  "layer_norm_epsilon": 1e-05,
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 1024,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torchscript": false,
  "use_bf

In [10]:
if args['local_rank'] == 0:
    torch.distributed.barrier()  # End of barrier to make sure only the first process in distributed training download model & vocab

logger.info("Training/evaluation parameters %s", args)

# Training
if args['do_train']:
    if args['local_rank'] not in [-1, 0]:
        torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Tokenizes the training file, or loads the cached features when they exist.
    train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)

12/03/2019 21:47:47 - INFO - __main__ -   Training/evaluation parameters {'train_data_file': '../data/train.pkl', 'output_dir': 'tempoutput', 'eval_data_file': '../data/test.pkl', 'model_type': 'gpt2', 'model_name_or_path': 'gpt2', 'mlm': False, 'mlm_probability': 0.15, 'config_name': '', 'tokenizer_name': '', 'cache_dir': '', 'block_size': 1024, 'do_train': True, 'do_eval': False, 'evaluate_during_training': False, 'do_lower_case': False, 'per_gpu_train_batch_size': 1, 'per_gpu_eval_batch_size': 4, 'gradient_accumulation_steps': 1, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 1.0, 'max_steps': -1, 'warmup_steps': -1, 'logging_steps': 50, 'save_steps': 50, 'save_total_limit': None, 'eval_all_checkpoints': True, 'no_cuda': False, 'overwrite_output_dir': True, 'overwrite_cache': False, 'seed': 42, 'fp16': True, 'fp16_opt_level': 'O3', 'local_rank': -1, 'server_ip': '', 'server_port': '', 'n_gpu': 1, 'device': device(type='cud

In [11]:
# Rank 0 reaches the barrier here, after it has built the dataset in the previous
# cell, which releases the other ranks waiting there.
if args['local_rank'] == 0:
    torch.distributed.barrier()

# NOTE(review): this cell uses train_dataset, which the previous cell only defines
# when args['do_train'] is true.
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

12/03/2019 21:38:06 - INFO - __main__ -   ***** Running training *****
12/03/2019 21:38:06 - INFO - __main__ -     Num examples = 116642
12/03/2019 21:38:06 - INFO - __main__ -     Num Epochs = 1
12/03/2019 21:38:06 - INFO - __main__ -     Instantaneous batch size per GPU = 1
12/03/2019 21:38:06 - INFO - __main__ -     Total train batch size (w. parallel, distributed & accumulation) = 1
12/03/2019 21:38:06 - INFO - __main__ -     Gradient Accumulation steps = 1
12/03/2019 21:38:06 - INFO - __main__ -     Total optimization steps = 116642
Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
Iteration:   0%|          | 0/116642 [00:00<?, ?it/s][A

Selected optimization level O3:  Pure FP16 training.
Defaults for this optimization level are:
enabled                : True
opt_level              : O3
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : False
master_weights         : False
loss_scale             : 1.0
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O3
cast_model_type        : torch.float16
patch_torch_functions  : False
keep_batchnorm_fp32    : False
master_weights         : False
loss_scale             : 1.0



Iteration:   0%|          | 1/116642 [00:00<21:38:51,  1.50it/s][A
Iteration:   0%|          | 2/116642 [00:00<17:14:54,  1.88it/s][A
Iteration:   0%|          | 3/116642 [00:01<13:52:28,  2.34it/s][A
Iteration:   0%|          | 4/116642 [00:01<11:31:25,  2.81it/s][A
Iteration:   0%|          | 5/116642 [00:01<9:55:21,  3.27it/s] [A
Iteration:   0%|          | 6/116642 [00:01<8:43:07,  3.72it/s][A
Iteration:   0%|          | 7/116642 [00:01<8:00:46,  4.04it/s][A
Iteration:   0%|          | 8/116642 [00:02<7:27:38,  4.34it/s][A
Iteration:   0%|          | 9/116642 [00:02<7:01:55,  4.61it/s][A
Iteration:   0%|          | 10/116642 [00:02<6:48:14,  4.76it/s][A
Iteration:   0%|          | 11/116642 [00:02<6:40:15,  4.86it/s][A
Iteration:   0%|          | 12/116642 [00:02<6:31:31,  4.96it/s][A
Iteration:   0%|          | 13/116642 [00:02<6:27:16,  5.02it/s][A
Iteration:   0%|          | 14/116642 [00:03<6:25:28,  5.04it/s][A
Iteration:   0%|          | 15/116642 [00:03<6:25:1

KeyboardInterrupt: 

In [None]:
# Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
# Only the main process (non-distributed run, or distributed rank 0) saves.
if args['do_train'] and (args['local_rank'] == -1 or torch.distributed.get_rank() == 0):
    # Create output directory if needed
    if not os.path.exists(args['output_dir']) and args['local_rank'] in [-1, 0]:
        os.makedirs(args['output_dir'])

    logger.info("Saving model checkpoint to %s", args['output_dir'])
    # Save a trained model, configuration and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`
    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
    model_to_save.save_pretrained(args['output_dir'])
    tokenizer.save_pretrained(args['output_dir'])

    # Good practice: save your training arguments together with the trained model
    torch.save(args, os.path.join(args['output_dir'], 'training_args.bin'))

    # Load a trained model and vocabulary that you have fine-tuned
    model = model_class.from_pretrained(args['output_dir'])
    tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case'])
    model.to(args['device'])


# Evaluation
# `results` collects one "<metric>_<step>" entry per evaluated checkpoint.
results = {}
if args['do_eval'] and args['local_rank'] in [-1, 0]:
    checkpoints = [args['output_dir']]
    if args['eval_all_checkpoints']:
        # Every directory below output_dir that contains a weights file counts as a checkpoint.
        checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args['output_dir'] + '/**/' + WEIGHTS_NAME, recursive=True)))
        logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
    logger.info("Evaluate the following checkpoints: %s", checkpoints)
    for checkpoint in checkpoints:
        # The step suffix is taken from the text after the last '-' in the path; the
        # prefix is the last path component for paths that contain 'checkpoint'.
        global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
        prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""

        model = model_class.from_pretrained(checkpoint)
        model.to(args['device'])
        result = evaluate(args, model, tokenizer, prefix=prefix)
        result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
        results.update(result)