In [1]:
import glob
import logging
import os
import json
import time

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from callback.optimizater.adamw import AdamW
from callback.lr_scheduler import get_linear_schedule_with_warmup
from callback.progressbar import ProgressBar
from tools.common import seed_everything, json_to_text
from tools.common import init_logger, logger

from models.transformers import WEIGHTS_NAME, BertConfig, AlbertConfig
from models.bert_for_ner import BertCrfForNer
from models.albert_for_ner import AlbertCrfForNer
from processors.utils_ner import CNerTokenizer, get_entities
from processors.ner_seq import convert_examples_to_features
from processors.ner_seq import ner_processors as processors
from processors.ner_seq import collate_fn
from metrics.ner_metrics import SeqEntityScore
from tools.finetuning_argparse import get_argparse

In [2]:
# Registry keyed by the ``model_type`` string. Each value is the
# (config class, CRF model class, tokenizer class) triple used to load it.
# Per the original note, the 'bert' entry is the one used for
# bert / ernie / bert_wwm / bert_wwm_ext checkpoints.
MODEL_CLASSES = dict(
    bert=(BertConfig, BertCrfForNer, CNerTokenizer),
    albert=(AlbertConfig, AlbertCrfForNer, CNerTokenizer),
)

In [180]:
    # ------------------------------------------------------------------
    # Run configuration.
    #
    # These names mirror the command-line arguments of the training script
    # and are kept as plain notebook variables. Every value is assigned
    # exactly once so the cell gives the same result on each re-run.
    # ------------------------------------------------------------------
    # Single place to change when the project lives somewhere else.
    project_root = '/Users/junix/code/CLUENER2020/pytorch_version'

    adam_epsilon = 1e-08
    adv_epsilon = 1.0
    adv_name = 'word_embeddings'
    cache_dir = ''
    config_name = ''
    crf_learning_rate = 5e-05
    data_dir = project_root + '/datasets/skillner/'
    do_adv = False
    do_eval = True
    do_lower_case = True
    do_predict = False
    do_train = False
    eval_all_checkpoints = False
    eval_max_seq_length = 512
    evaluate_during_training = False
    fp16 = False
    fp16_opt_level = 'O1'
    gradient_accumulation_steps = 1
    learning_rate = 3e-05
    local_rank = -1
    logging_steps = 448
    loss_type = 'ce'
    markup = 'bios'
    max_grad_norm = 1.0
    max_steps = -1
    model_name_or_path = project_root + '/prev_trained_model/roberta_wwm_large_ext'
    model_type = 'bert'
    no_cuda = False
    num_train_epochs = 5.0
    output_dir = project_root + '/outputs/skillner_output/'
    overwrite_cache = False
    overwrite_output_dir = True
    per_gpu_eval_batch_size = 24
    per_gpu_train_batch_size = 24
    predict_checkpoints = 0
    save_steps = 448
    seed = 42
    server_ip = ''
    server_port = ''
    task_name = 'skillner'
    tokenizer_name = ''
    train_max_seq_length = 128
    warmup_proportion = 0.1
    weight_decay = 0.01

    # The checkpoint directory is "<output root><model_type>"; the output
    # root above already ends with a path separator.
    output_dir = output_dir + model_type

    time_ = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
    init_logger(log_file=output_dir + f'/{model_type}-{task_name}-{time_}.log')

    # Setup CUDA, GPU & distributed training
    if local_rank == -1 or no_cuda:
        # Uses the notebook-level `no_cuda` flag. The previous `args.no_cuda`
        # referenced an undefined name and raised NameError as soon as CUDA
        # was actually available.
        device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
        # Report no GPUs when CUDA use has been explicitly disabled.
        n_gpu = 0 if no_cuda else torch.cuda.device_count()
    else:  # Distributed run: pin this process to its own GPU
        # NOTE(review): no torch.distributed.init_process_group call is visible
        # in this cell; confirm it happens elsewhere before using local_rank != -1.
        torch.cuda.set_device(local_rank)
        device = torch.device("cuda", local_rank)
        n_gpu = 1

    print(device, n_gpu)

    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        local_rank, device, n_gpu, bool(local_rank != -1), fp16, )
    # Set seed
    seed_everything(seed)

    # Prepare NER task
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()
    # Two-way mapping between label strings and their integer ids.
    id2label = {i: label for i, label in enumerate(label_list)}
    label2id = {label: i for i, label in enumerate(label_list)}
    num_labels = len(label_list)

    print(id2label)
    print(label2id)

    # Load pretrained model and tokenizer
    if local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
    model_type = model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
    # Config and tokenizer come from the pretrained model directory unless
    # an explicit config_name / tokenizer_name is given.
    config = config_class.from_pretrained(config_name if config_name else model_name_or_path,
                                          num_labels=num_labels, cache_dir=cache_dir if cache_dir else None, )
    tokenizer = tokenizer_class.from_pretrained(tokenizer_name if tokenizer_name else model_name_or_path,
                                                do_lower_case=do_lower_case,
                                                cache_dir=cache_dir if cache_dir else None, )
    print("model_name_or_path>>",model_name_or_path)
    # The model weights are loaded from output_dir rather than from
    # model_name_or_path.
    model = model_class.from_pretrained(output_dir, from_tf=bool(".ckpt" in model_name_or_path),
                                        config=config, cache_dir=cache_dir if cache_dir else None)
    if local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(device)
    logger.info("Training/evaluation parameters")
    # Switch to evaluation mode; this notebook only runs inference.
    model.eval()

    print(tokenizer)

10/19/2021 14:05:57 - INFO - models.transformers.configuration_utils -   loading configuration file /Users/junix/code/CLUENER2020/pytorch_version/prev_trained_model/roberta_wwm_large_ext/config.json
10/19/2021 14:05:57 - INFO - models.transformers.configuration_utils -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "lstm_dropout_prob": 0.5,
  "lstm_embedding_size": 768,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 7,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "pruned_heads": {},
  "torchscript": fal

cpu 0
{0: 'X', 1: 'B-skill', 2: 'I-skill', 3: 'S-skill', 4: 'O', 5: '[START]', 6: '[END]'}
{'X': 0, 'B-skill': 1, 'I-skill': 2, 'S-skill': 3, 'O': 4, '[START]': 5, '[END]': 6}
model_name_or_path>> /Users/junix/code/CLUENER2020/pytorch_version/prev_trained_model/roberta_wwm_large_ext


10/19/2021 14:05:59 - INFO - root -   Training/evaluation parameters


<processors.utils_ner.CNerTokenizer object at 0x7f898c230810>


In [179]:
def convert_to_features(example,
                        max_seq_length,
                        tokenizer,
                        cls_token_at_end=False,
                        cls_token="[CLS]",
                        cls_token_segment_id=1,
                        sep_token="[SEP]",
                        pad_on_left=False,
                        pad_token=0,
                        pad_token_segment_id=0,
                        sequence_a_segment_id=0,
                        mask_padding_with_zero=True, ):
    """Encode one raw text as a single-example batch of model inputs.

    The text is tokenized, truncated so that it fits in ``max_seq_length``
    together with the two special tokens, wrapped with ``sep_token`` and
    ``cls_token``, and converted to ids. The sequence is NOT padded: a
    single example is encoded at its own length.

    Token layout, selected by ``cls_token_at_end``:
        - False (default): cls_token + tokens + sep_token
        - True:            tokens + sep_token + cls_token

    Args:
        example: The text to encode; passed to ``tokenizer.tokenize``.
        max_seq_length: Maximum length of the encoded sequence, including
            the two special tokens. Must be at least 2.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        cls_token_at_end: Put the CLS token after the SEP token instead of
            at the front.
        cls_token: CLS token string.
        cls_token_segment_id: Segment id assigned to the CLS position. The
            default is 1; pass the value your model expects explicitly.
        sep_token: SEP token string.
        pad_on_left, pad_token, pad_token_segment_id: Accepted for signature
            compatibility only. No padding is applied, so they are unused.
        sequence_a_segment_id: Segment id assigned to the text tokens and
            to the SEP token.
        mask_padding_with_zero: When true the attention mask is 1 for every
            token; when false it is 0 for every token.

    Returns:
        dict with:
            - "input_ids", "attention_mask", "token_type_ids":
              ``torch.long`` tensors of shape (1, sequence_length)
            - "input_lens": ``torch.long`` tensor of shape (1,) holding the
              sequence length including special tokens
            - "labels": always ``None``

    Raises:
        ValueError: If ``max_seq_length`` leaves no room for the two
            special tokens.
    """
    # Account for [CLS] and [SEP] with "- 2".
    special_tokens_count = 2
    if max_seq_length < special_tokens_count:
        # A negative slice bound would silently drop trailing tokens and
        # still return a sequence longer than max_seq_length.
        raise ValueError(
            "max_seq_length must be at least %d to hold the special tokens, got %r"
            % (special_tokens_count, max_seq_length))

    # Copy the result so that adding special tokens below never mutates a
    # list that the tokenizer may still own.
    tokens = list(tokenizer.tokenize(example))
    print(tokens)
    # Slicing is a no-op when the text already fits.
    tokens = tokens[:max_seq_length - special_tokens_count]

    # The SEP token shares the segment id of the text tokens.
    tokens.append(sep_token)
    segment_ids = [sequence_a_segment_id] * len(tokens)

    if cls_token_at_end:
        tokens.append(cls_token)
        segment_ids.append(cls_token_segment_id)
    else:
        tokens.insert(0, cls_token)
        segment_ids.insert(0, cls_token_segment_id)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    # No padding is added, so every position gets the same mask value.
    input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
    input_len = len(tokens)

    logger.info("*** Example ***")
    logger.info("tokens: %s", " ".join([str(x) for x in tokens]))
    logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
    logger.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
    logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))

    # Each list is wrapped in an outer list to add a leading batch axis of 1.
    return {
            "input_ids": torch.tensor([input_ids], dtype=torch.long),
            "attention_mask": torch.tensor([input_mask], dtype=torch.long),
            "labels": None,
            "token_type_ids" : torch.tensor([segment_ids], dtype=torch.long),
            'input_lens': torch.tensor([input_len], dtype=torch.long)
        }
    

In [195]:
# Sample sentence to run through the NER model (a single string, despite the plural name).
texts = "掌握Java编程技巧"

In [196]:
# Encode the sample text. All XLNet-specific options hang off one flag,
# so the model-type test is evaluated once instead of four times.
uses_xlnet_layout = model_type in ["xlnet"]
inputs = convert_to_features(
    texts,
    max_seq_length=eval_max_seq_length,
    tokenizer=tokenizer,
    cls_token_at_end=uses_xlnet_layout,
    cls_token=tokenizer.cls_token,
    cls_token_segment_id=2 if uses_xlnet_layout else 0,
    sep_token=tokenizer.sep_token,
    pad_on_left=uses_xlnet_layout,
    pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
    pad_token_segment_id=4 if uses_xlnet_layout else 0,
    sequence_a_segment_id=0,
    mask_padding_with_zero=True,
)

10/19/2021 14:27:51 - INFO - root -   *** Example ***
10/19/2021 14:27:51 - INFO - root -   tokens: [CLS] 掌 握 j a v a 编 程 技 巧 [SEP]
10/19/2021 14:27:51 - INFO - root -   input_ids: 101 2958 2995 152 143 164 143 5356 4923 2825 2341 102
10/19/2021 14:27:51 - INFO - root -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1
10/19/2021 14:27:51 - INFO - root -   segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0


['掌', '握', 'j', 'a', 'v', 'a', '编', '程', '技', '巧']


In [197]:
# Print each prepared model input, with the field name padded/truncated to 15 characters.
for field_name, field_value in inputs.items():
    print(f'{field_name:15.15} => {field_value}')

input_ids       => tensor([[ 101, 2958, 2995,  152,  143,  164,  143, 5356, 4923, 2825, 2341,  102]])
attention_mask  => tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
labels          => None
token_type_ids  => tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
input_lens      => tensor([12])


In [198]:
# Forward pass without gradient tracking, then decode tag ids with the model's CRF layer.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs[0]
    # Keep the decoded tensor under its own name; `tags` is only ever the plain Python list.
    decoded = model.crf.decode(logits, inputs['attention_mask'])
    tags = decoded.squeeze(0).cpu().numpy().tolist()

{'input_ids': tensor([[ 101, 2958, 2995,  152,  143,  164,  143, 5356, 4923, 2825, 2341,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': None, 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'input_lens': tensor([12])}
tensor([[[-2.0797, -2.3432, -2.3849, -2.3116, 11.2052, -2.3360, -2.3596],
         [-2.0579, -1.9937, -2.4639, -2.3908, 11.2299, -2.2850, -2.4132],
         [-2.0484, -2.2041, -2.3940, -2.3411, 11.2064, -2.2918, -2.3900],
         [-1.4074, 10.3601, -3.5683, -1.7622, -2.0755, -1.4275, -1.4460],
         [-2.1695, -2.1640, 11.0063, -1.9331, -2.3384, -1.9977, -2.5267],
         [-2.1978, -2.3501, 11.0439, -1.8484, -2.2360, -1.8547, -2.4301],
         [-2.2374, -2.5396, 11.0483, -1.8300, -2.2744, -1.8381, -2.3970],
         [-2.2369, -2.5577, 11.0476, -1.8914, -2.2479, -1.7714, -2.3513],
         [-2.2325, -2.7306, 11.0206, -1.8322, -2.1705, -1.6777, -2.2657],
         [-2.2263, -2.5670, 11.0455, -1.8614, -2.0625, -1.72

In [199]:
# Display the decoded label ids as nested Python lists.
tags

[[4, 4, 4, 1, 2, 2, 2, 2, 2, 2, 2, 4]]

In [200]:
# Take the first decoded sequence and drop its first and last positions,
# which hold the [CLS] and [SEP] special tokens.
preds = tags[0][1:-1]  # [CLS]XXXX[SEP]
# Group the per-token label ids into entity spans.
# NOTE(review): get_entities is defined elsewhere; it presumably returns
# [type, start, end] entries for the given markup scheme - confirm.
label_entities = get_entities(preds, id2label, markup)
label_entities

[['skill', 2, 9]]

In [201]:
# Display the id -> label mapping, to interpret the predicted label ids.
id2label

{0: 'X',
 1: 'B-skill',
 2: 'I-skill',
 3: 'S-skill',
 4: 'O',
 5: '[START]',
 6: '[END]'}

In [202]:
# Display the predicted label ids with the [CLS]/[SEP] positions removed.
preds

[4, 4, 1, 2, 2, 2, 2, 2, 2, 2]

In [203]:
# Display the extracted entities; each entry is unpacked as [type, start, end] in the next cell.
label_entities

[['skill', 2, 9]]

In [204]:
# Show each predicted entity type next to the text it covers. The end index is inclusive, hence the +1.
# NOTE(review): this indexes `texts` with token positions, which presumably
# relies on one token per character - confirm for the tokenizer in use.
for entity_type, first, last in label_entities:
    print(entity_type, "=>", texts[first:last + 1])

skill => Java编程技巧
