In [66]:
from torchtext import data, datasets
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from pytorch_transformers import AdamW, WarmupLinearSchedule
from seqeval.metrics import classification_report
from tqdm import tqdm, trange
import random
import numpy as np

In [67]:
class SequenceTaggingDataset(data.Dataset):
    @staticmethod
    def sort_key(example):
        for attr in dir(example):
            if not callable(getattr(example, attr)) and \
                    not attr.startswith("__"):
                return len(getattr(example, attr))
        return 0

    def __init__(self, path, fields, encoding="utf-8", separator="\t", **kwargs):
        examples = []
        columns = []

        with open(path, encoding=encoding) as input_file:
            for line in input_file:
                line = line.strip()
                if line.startswith('-DOCSTART-'):
                    continue
                if line == "":
                    if columns:
                        examples.append(data.Example.fromlist(columns, fields))
                    columns = []
                else:
                    for i, column in enumerate(line.split(separator)):
                        if len(columns) < i + 1:
                            columns.append([])
                        columns[i].append(column)

            if columns:
                examples.append(data.Example.fromlist(columns, fields))
        super(SequenceTaggingDataset, self).__init__(examples, fields,
                                                     **kwargs)

In [68]:
WORD = data.Field()
POS1 = data.Field()
POS2 = data.Field()
LABEL = data.Field()
train_ds, valid_ds, test_ds = SequenceTaggingDataset.splits(fields=[('word', WORD), 
                                                                    ('pos1', POS1), 
                                                                    ('pos2', POS2), 
                                                                    ('label', LABEL)],
                                                            path='CoNLL-2003' ,
                                                            separator=" ",
                                                            train="eng.train", 
                                                            validation="eng.testa", 
                                                            test="eng.testb"
                                                           )

In [69]:
import logging
logger = logging.getLogger(__name__)

In [70]:
LABEL.build_vocab(train_ds, valid_ds, test_ds)

In [71]:
label_list = list(LABEL.vocab.freqs)

In [72]:
from pytorch_transformers import BertConfig, BertTokenizer, BertForTokenClassification
model_name = 'bert-base-uncased'
config = BertConfig.from_pretrained(model_name, num_labels=len(label_list)+3)
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)
model = BertForTokenClassification.from_pretrained(model_name, config=config)

In [73]:
tokenizer.tokenize('i have a unaffable 1999-08')

['i', 'have', 'a', 'una', '##ffa', '##ble', '1999', '-', '08']

In [74]:
def convert_examples_to_features(examples, label_list, label_map,
                                 max_seq_length,
                                 tokenizer, 
                                 cls_token_at_end=False,
                                 cls_token='[CLS]',
                                 cls_token_segment_id=1,
                                 sep_token='[SEP]',
                                 sep_token_extra=False,
                                 pad_on_left=False,
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 sequence_a_segment_id=0, 
                                 sequence_b_segment_id=1,
                                 mask_padding_with_zero=True):
    """ Loads a data file into a list of `InputBatch`s
        `cls_token_at_end` define the location of the CLS token:
            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
    """
    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))

#         tokens_a = tokenizer.tokenize(example.text_a)
        tokens_a = []
        labels = []
        for word, label in zip(example.word, example.label):
            _tokens = tokenizer.tokenize(word)
            for i, _token in enumerate(_tokens):
                tokens_a.append(_token)
                if i == 0:
                    labels.append(label)
                else:
                    labels.append("X")
        if len(tokens_a) >= max_seq_length - 1:
            tokens_a = tokens_a[0:(max_seq_length - 1)]
            labels = labels[0:(max_seq_length - 1)]
        
        tokens = tokens_a
#         tokens_b = None
#         if example.text_b:
#             tokens_b = tokenizer.tokenize(example.text_b)
#             # Modifies `tokens_a` and `tokens_b` in place so that the total
#             # length is less than the specified length.
#             # Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa.
#             special_tokens_count = 4 if sep_token_extra else 3
#             _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
#         else:
#             # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
#             special_tokens_count = 3 if sep_token_extra else 2
#             if len(tokens_a) > max_seq_length - special_tokens_count:
#                 tokens_a = tokens_a[:(max_seq_length - special_tokens_count)]

        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
#         special_tokens_count = 3 if sep_token_extra else 2
#         if len(tokens_a) > max_seq_length - special_tokens_count:
#             tokens_a = tokens_a[:(max_seq_length - special_tokens_count)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
#         tokens = tokens_a + [sep_token]
        # add (labels mut be padded and masked)
#         labels = labels + [sep_token]  # add
#         if sep_token_extra:
#             # roberta uses an extra separator b/w pairs of sentences
#             tokens += [sep_token]
        segment_ids = [sequence_a_segment_id] * len(tokens)

#         if tokens_b:
#             tokens += tokens_b + [sep_token]
#             segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)

        if cls_token_at_end:
            tokens = tokens + [cls_token]
            labels = labels + [cls_token]  # add
            segment_ids = segment_ids + [cls_token_segment_id]
        else:
            tokens = [cls_token] + tokens
            labels = [cls_token] + labels  # add
            segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        label_ids = [label_map[label] for label in labels]  # add

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            label_ids = ([pad_token] * padding_length) + label_ids # add
            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            label_ids = label_ids + ([pad_token] * padding_length)  # add
            input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
            
        assert len(input_ids) == max_seq_length
        assert len(label_ids) == max_seq_length # add
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

#         if output_mode == "classification":
#             label_id = label_map[example.label]
#         elif output_mode == "regression":
#             label_id = float(example.label)
#         else:
#             raise KeyError(output_mode)
    
        if ex_index < 5:
            logger.info("*** Example ***")
#             logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s " % " ".join([str(x) for x in label_ids]))

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_ids=label_ids))
    return features

class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_ids):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_ids = label_ids

In [75]:
cls_token='[CLS]'
label_list = label_list + [cls_token] + ['X']
label_map = {label : i + 1 for i, label in enumerate(label_list)}  # id 0 is for [pad]

In [76]:
max_seq_length = 128
cls_token_at_end = False
features = convert_examples_to_features(train_ds, label_list, label_map,
                                        max_seq_length,
                                        tokenizer,
                                        cls_token_at_end = False,
                                        cls_token=tokenizer.cls_token,
                                        cls_token_segment_id=0,
                                        sep_token=tokenizer.sep_token,
                                        sep_token_extra=False,
                                        pad_on_left=False,
                                        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                                        pad_token_segment_id=0,
                                       )

In [77]:
# Convert to Tensors and build dataset
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

In [78]:
def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

class Args(object):
    def __init__(self):
        self.adam_epsilon = 1e-8
        self.gradient_accumulation_steps = 1
        self.learning_rate = 5e-5
        self.local_rank = -1
        self.max_steps = -1
        self.max_grad_norm = 1.0
        self.model_type = 'bert'
        self.num_train_epochs = 3.0
        self.per_gpu_train_batch_size = 8
        self.per_gpu_eval_batch_size = 8
        self.seed = 42
        self.warmup_steps = 0
        self.weight_decay = 0.0
        self.fp16 = False
        
args = Args()
args.n_gpu = torch.cuda.device_count()
args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [79]:
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
#     if args.local_rank in [-1, 0]:
#         tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
#     train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                      'labels':         batch[3]}
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                scheduler.step()  # Update learning rate schedule
                optimizer.step()
                model.zero_grad()
                global_step += 1

#                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
#                     # Log metrics
#                     if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
#                         results = evaluate(args, model, tokenizer)
#                         for key, value in results.items():
#                             tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
#                     tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
#                     tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
#                     logging_loss = tr_loss
                logging_loss = tr_loss
                
#                 if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
#                     # Save model checkpoint
#                     output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
#                     if not os.path.exists(output_dir):
#                         os.makedirs(output_dir)
#                     model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
#                     model_to_save.save_pretrained(output_dir)
#                     torch.save(args, os.path.join(output_dir, 'training_args.bin'))
#                     logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

#     if args.local_rank in [-1, 0]:
#         tb_writer.close()

    return global_step, tr_loss / global_step

In [80]:
logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                args.local_rank, args.device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
model.to(args.device)
global_step, tr_loss = train(args, dataset, model, tokenizer)

Process rank: -1, device: cuda, n_gpu: 3, distributed training: False, 16-bits training: False
Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 1/586 [00:00<03:07,  3.11it/s][A
Iteration:   0%|          | 2/586 [00:00<03:08,  3.10it/s][A
Iteration:   1%|          | 3/586 [00:00<03:07,  3.10it/s][A
Iteration:   1%|          | 4/586 [00:01<03:07,  3.11it/s][A
Iteration:   1%|          | 5/586 [00:01<03:06,  3.11it/s][A
Iteration:   1%|          | 6/586 [00:01<03:07,  3.10it/s][A
Iteration:   1%|          | 7/586 [00:02<03:06,  3.11it/s][A
Iteration:   1%|▏         | 8/586 [00:02<03:05,  3.11it/s][A
Iteration:   2%|▏         | 9/586 [00:02<03:05,  3.12it/s][A
Iteration:   2%|▏         | 10/586 [00:03<03:04,  3.12it/s][A
Iteration:   2%|▏         | 11/586 [00:03<03:04,  3.12it/s][A
Iteration:   2%|▏         | 12/586 [00:03<03:03,  3.12it/s][A
Iteration:   2%|▏         | 13/586 [00:04<03:03,  3.12it/s][A
Iteration:   2%|▏         | 14/586 [00:04<03:03, 

Iteration:  43%|████▎     | 251/586 [01:20<01:47,  3.11it/s][A
Iteration:  43%|████▎     | 252/586 [01:21<01:47,  3.11it/s][A
Iteration:  43%|████▎     | 253/586 [01:21<01:47,  3.11it/s][A
Iteration:  43%|████▎     | 254/586 [01:21<01:46,  3.11it/s][A
Iteration:  44%|████▎     | 255/586 [01:22<01:46,  3.11it/s][A
Iteration:  44%|████▎     | 256/586 [01:22<01:46,  3.10it/s][A
Iteration:  44%|████▍     | 257/586 [01:22<01:46,  3.10it/s][A
Iteration:  44%|████▍     | 258/586 [01:23<01:45,  3.10it/s][A
Iteration:  44%|████▍     | 259/586 [01:23<01:45,  3.10it/s][A
Iteration:  44%|████▍     | 260/586 [01:23<01:44,  3.11it/s][A
Iteration:  45%|████▍     | 261/586 [01:24<01:44,  3.10it/s][A
Iteration:  45%|████▍     | 262/586 [01:24<01:44,  3.09it/s][A
Iteration:  45%|████▍     | 263/586 [01:24<01:44,  3.09it/s][A
Iteration:  45%|████▌     | 264/586 [01:24<01:44,  3.10it/s][A
Iteration:  45%|████▌     | 265/586 [01:25<01:43,  3.10it/s][A
Iteration:  45%|████▌     | 266/586 [01:

Iteration:  87%|████████▋ | 507/586 [02:43<00:26,  3.04it/s][A
Iteration:  87%|████████▋ | 508/586 [02:44<00:25,  3.05it/s][A
Iteration:  87%|████████▋ | 509/586 [02:44<00:25,  3.06it/s][A
Iteration:  87%|████████▋ | 510/586 [02:44<00:24,  3.07it/s][A
Iteration:  87%|████████▋ | 511/586 [02:45<00:24,  3.07it/s][A
Iteration:  87%|████████▋ | 512/586 [02:45<00:24,  3.06it/s][A
Iteration:  88%|████████▊ | 513/586 [02:45<00:23,  3.08it/s][A
Iteration:  88%|████████▊ | 514/586 [02:46<00:23,  3.09it/s][A
Iteration:  88%|████████▊ | 515/586 [02:46<00:22,  3.09it/s][A
Iteration:  88%|████████▊ | 516/586 [02:46<00:22,  3.10it/s][A
Iteration:  88%|████████▊ | 517/586 [02:47<00:22,  3.08it/s][A
Iteration:  88%|████████▊ | 518/586 [02:47<00:22,  3.09it/s][A
Iteration:  89%|████████▊ | 519/586 [02:47<00:21,  3.08it/s][A
Iteration:  89%|████████▊ | 520/586 [02:48<00:21,  3.09it/s][A
Iteration:  89%|████████▉ | 521/586 [02:48<00:20,  3.10it/s][A
Iteration:  89%|████████▉ | 522/586 [02:

Iteration:  30%|███       | 176/586 [00:57<02:12,  3.09it/s][A
Iteration:  30%|███       | 177/586 [00:57<02:11,  3.11it/s][A
Iteration:  30%|███       | 178/586 [00:58<02:11,  3.10it/s][A
Iteration:  31%|███       | 179/586 [00:58<02:11,  3.10it/s][A
Iteration:  31%|███       | 180/586 [00:58<02:11,  3.08it/s][A
Iteration:  31%|███       | 181/586 [00:59<02:11,  3.08it/s][A
Iteration:  31%|███       | 182/586 [00:59<02:10,  3.10it/s][A
Iteration:  31%|███       | 183/586 [00:59<02:10,  3.09it/s][A
Iteration:  31%|███▏      | 184/586 [01:00<02:10,  3.07it/s][A
Iteration:  32%|███▏      | 185/586 [01:00<02:11,  3.06it/s][A
Iteration:  32%|███▏      | 186/586 [01:00<02:11,  3.05it/s][A
Iteration:  32%|███▏      | 187/586 [01:01<02:10,  3.05it/s][A
Iteration:  32%|███▏      | 188/586 [01:01<02:10,  3.06it/s][A
Iteration:  32%|███▏      | 189/586 [01:01<02:08,  3.08it/s][A
Iteration:  32%|███▏      | 190/586 [01:02<02:08,  3.09it/s][A
Iteration:  33%|███▎      | 191/586 [01:

Iteration:  74%|███████▎  | 432/586 [02:21<00:50,  3.04it/s][A
Iteration:  74%|███████▍  | 433/586 [02:22<00:50,  3.05it/s][A
Iteration:  74%|███████▍  | 434/586 [02:22<00:49,  3.05it/s][A
Iteration:  74%|███████▍  | 435/586 [02:22<00:49,  3.06it/s][A
Iteration:  74%|███████▍  | 436/586 [02:23<00:48,  3.08it/s][A
Iteration:  75%|███████▍  | 437/586 [02:23<00:48,  3.06it/s][A
Iteration:  75%|███████▍  | 438/586 [02:23<00:48,  3.05it/s][A
Iteration:  75%|███████▍  | 439/586 [02:24<00:48,  3.04it/s][A
Iteration:  75%|███████▌  | 440/586 [02:24<00:51,  2.81it/s][A
Iteration:  75%|███████▌  | 441/586 [02:24<00:50,  2.88it/s][A
Iteration:  75%|███████▌  | 442/586 [02:25<00:49,  2.91it/s][A
Iteration:  76%|███████▌  | 443/586 [02:25<00:48,  2.93it/s][A
Iteration:  76%|███████▌  | 444/586 [02:25<00:47,  2.98it/s][A
Iteration:  76%|███████▌  | 445/586 [02:26<00:46,  3.01it/s][A
Iteration:  76%|███████▌  | 446/586 [02:26<00:46,  3.02it/s][A
Iteration:  76%|███████▋  | 447/586 [02:

Iteration:  17%|█▋        | 102/586 [00:33<02:39,  3.04it/s][A
Iteration:  18%|█▊        | 103/586 [00:33<02:38,  3.05it/s][A
Iteration:  18%|█▊        | 104/586 [00:34<02:38,  3.05it/s][A
Iteration:  18%|█▊        | 105/586 [00:34<02:36,  3.07it/s][A
Iteration:  18%|█▊        | 106/586 [00:34<02:36,  3.08it/s][A
Iteration:  18%|█▊        | 107/586 [00:35<02:45,  2.89it/s][A
Iteration:  18%|█▊        | 108/586 [00:35<02:42,  2.94it/s][A
Iteration:  19%|█▊        | 109/586 [00:35<02:39,  2.99it/s][A
Iteration:  19%|█▉        | 110/586 [00:36<02:37,  3.01it/s][A
Iteration:  19%|█▉        | 111/586 [00:36<02:37,  3.01it/s][A
Iteration:  19%|█▉        | 112/586 [00:36<02:36,  3.03it/s][A
Iteration:  19%|█▉        | 113/586 [00:37<02:35,  3.04it/s][A
Iteration:  19%|█▉        | 114/586 [00:37<02:35,  3.03it/s][A
Iteration:  20%|█▉        | 115/586 [00:37<02:35,  3.02it/s][A
Iteration:  20%|█▉        | 116/586 [00:38<02:35,  3.03it/s][A
Iteration:  20%|█▉        | 117/586 [00:

Iteration:  61%|██████    | 358/586 [01:57<01:15,  3.02it/s][A
Iteration:  61%|██████▏   | 359/586 [01:57<01:15,  3.02it/s][A
Iteration:  61%|██████▏   | 360/586 [01:58<01:14,  3.04it/s][A
Iteration:  62%|██████▏   | 361/586 [01:58<01:13,  3.05it/s][A
Iteration:  62%|██████▏   | 362/586 [01:58<01:13,  3.05it/s][A
Iteration:  62%|██████▏   | 363/586 [01:59<01:13,  3.05it/s][A
Iteration:  62%|██████▏   | 364/586 [01:59<01:13,  3.04it/s][A
Iteration:  62%|██████▏   | 365/586 [01:59<01:12,  3.05it/s][A
Iteration:  62%|██████▏   | 366/586 [02:00<01:12,  3.04it/s][A
Iteration:  63%|██████▎   | 367/586 [02:00<01:11,  3.05it/s][A
Iteration:  63%|██████▎   | 368/586 [02:00<01:10,  3.07it/s][A
Iteration:  63%|██████▎   | 369/586 [02:01<01:10,  3.08it/s][A
Iteration:  63%|██████▎   | 370/586 [02:01<01:10,  3.06it/s][A
Iteration:  63%|██████▎   | 371/586 [02:01<01:10,  3.06it/s][A
Iteration:  63%|██████▎   | 372/586 [02:02<01:09,  3.06it/s][A
Iteration:  64%|██████▎   | 373/586 [02:

In [82]:
def evaluate(args, eval_dataset, model, tokenizer, prefix=""):
    
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    input_masks = None
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM and RoBERTa don't use segment_ids
                      'labels':         batch[3]}
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        if preds is None:
#             preds = logits.detach().cpu().numpy()
            preds = torch.argmax(logits, dim=2).detach().cpu().numpy()
            out_label_ids = inputs['labels'].detach().cpu().numpy()
            input_masks = inputs['attention_mask'].detach().cpu().numpy()
        else:
#             preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            preds = np.append(preds, torch.argmax(logits, dim=2).detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
            input_masks = np.append(input_masks, inputs['attention_mask'].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
#     preds = np.argmax(preds, axis=1)

    return out_label_ids, preds, input_masks


In [92]:
features = convert_examples_to_features(test_ds, label_list, label_map,
                                        max_seq_length,
                                        tokenizer,
                                        cls_token_at_end = False,
                                        cls_token=tokenizer.cls_token,
                                        cls_token_segment_id=0,
                                        sep_token=tokenizer.sep_token,
                                        sep_token_extra=False,
                                        pad_on_left=False,
                                        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                                        pad_token_segment_id=0,
                                       )

In [93]:
# Convert to Tensors and build dataset
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

In [94]:
true_ids, pred_ids, input_masks = evaluate(args, dataset, model, tokenizer)

Evaluating: 100%|██████████| 144/144 [00:14<00:00,  9.98it/s]


In [139]:
def eval_report(true_ids, pred_ids, input_masks, label_map):
    
    idx2label = {idx:label for label, idx in label_map.items()}
    
    true_labels, pred_labels = [], []
    for true_id, pred_id ,input_mask in zip(true_ids, pred_ids, input_masks):
        true_id = true_id[np.nonzero(input_mask)]
        true_label = [idx2label[i] for i in true_id]
        true_labels.append(true_label)
        
        pred_id = pred_id[np.nonzero(input_mask)]
        pred_label = [idx2label[i] for i in pred_id]
        pred_labels.append(pred_label)
        
#         print(true_labels)
#         print(pred_labels)
        
    print(classification_report(true_labels, pred_labels,digits=4))
        
eval_report(true_ids, pred_ids, input_masks, label_map)
# true_ids.shape, pred_ids.shape

             precision    recall  f1-score   support

        LOC     0.9094    0.9331    0.9211      1689
          X     0.9829    0.9821    0.9825      6267
        PER     0.9781    0.9696    0.9738      2070
       MISC     0.7796    0.7775    0.7785       737
      [CLS]     1.0000    1.0000    1.0000      3453
        ORG     0.8680    0.8917    0.8797      1800

avg / total     0.9560    0.9596    0.9577     16016



In [186]:
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
# Note that DistributedSampler samples randomly
eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

batch = iter(eval_dataloader).next()
input_ids = batch[0][0]
input_mask = batch[1][0]
input_labels = batch[3][0]

In [187]:
model.eval()
batch = tuple(t.to(args.device) for t in batch)
with torch.no_grad():
    inputs = {'input_ids':      batch[0],
              'attention_mask': batch[1],
              'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM and RoBERTa don't use segment_ids
              'labels':         batch[3]}
    outputs = model(**inputs)
loss, logits = outputs[:2]

In [188]:
idx2label = {idx:label for label, idx in label_map.items()}
eval_tokens = tokenizer.convert_ids_to_tokens(input_ids[input_mask.nonzero()].squeeze().tolist())
true_labels = [idx2label[input_label] for input_label in input_labels[input_mask.nonzero()].squeeze().tolist()]

In [189]:
torch.argmax(logits[0], dim=1)

tensor([9, 2, 2, 5, 2, 2, 2, 2, 5, 2, 2, 2, 2, 2, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 9, 2, 9, 9, 9, 9, 9, 9,
        9, 9, 9, 9, 9, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 9, 9, 9, 9, 9,
        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 9, 9, 9], device='cuda:0')

In [190]:
pred_labels = torch.argmax(logits[0], dim=1)[input_mask.nonzero()].squeeze().tolist()
pred_labels = [idx2label[pred_label] for pred_label in pred_labels]

In [193]:
print(eval_tokens)

['[CLS]', 'soccer', '-', 'japan', 'get', 'lucky', 'win', ',', 'china', 'in', 'surprise', 'defeat', '.']


In [192]:
print('TRUE : ', true_labels)
print('PRED : ', pred_labels)

TRUE :  ['[CLS]', 'O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'I-PER', 'O', 'O', 'O', 'O']
PRED :  ['[CLS]', 'O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'O', 'O', 'O']
