In [1]:
from transformers import AutoTokenizer

# Load the fast tokenizer for the DistilBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    'distilbert-base-uncased',
    use_fast=True,
)

print(tokenizer)

# Trial encoding of two sentences that are already split into words.
sample_sentences = [
    ['Hello', ',', 'this', 'is', 'first', 'sentence', 'split', 'into',
     'words', '.'],
    ['This', 'is', 'second', 'sentence', 'split', 'into', 'words', '.'],
]
tokenizer.batch_encode_plus(sample_sentences, is_split_into_words=True)

PreTrainedTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


{'input_ids': [[101, 7592, 1010, 2023, 2003, 2034, 6251, 3975, 2046, 2616, 1012, 102], [101, 2023, 2003, 2117, 6251, 3975, 2046, 2616, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [2]:
# Task type; according to the original note this may be 'pos', 'chunk' or 'ner'.
# It selects which '<task>_tags' column of the dataset is used as labels.
task = 'ner'

task

'ner'

In [3]:
from datasets import load_dataset, load_from_disk

# Load the dataset from a local copy on disk.
# Alternative kept from the original: dataset = load_dataset(path='conll2003')
dataset = load_from_disk('datas/conll2003')

print('查看数据样例')
train_split = dataset['train']
print(train_split[0])

print('获取label的名字,这个后面计算评价指标的时候要用')
# Label names of the selected task's tag column; needed later for metrics.
tag_column = f'{task}_tags'
label_name = train_split.features[tag_column].feature.names
print(label_name)

dataset

查看数据样例
{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}
获取label的名字,这个后面计算评价指标的时候要用
['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14042
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3251
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3454
    })
})

In [4]:
# The code below is purely illustrative: it shows how the tokens produced by
# the tokenizer are mapped back to the original words and their labels.
print('就算设置了is_split_into_words=True,编码后的数字和原文里的单词也不一定是一一对应的')
print('以下面这个数据为例')
data = dataset['train'][2]
print(data)

print('调用编码')
data_encode = tokenizer.batch_encode_plus([data['tokens']],
                                          is_split_into_words=True)

# NOTE(review): the message below says 7, but the recorded output for this
# sample shows 8 ids (6 word pieces plus the two special tokens) - confirm
# which count the message is meant to report.
print('可以看到原文里只有2个单词,但是编码后是7个词')
print(data_encode)

print('恢复成文本之后长这样')
print(tokenizer.decode(data_encode['input_ids'][0]))

print('因为有这样的情况存在,所以编码后的数字和label不能做到一一对应')
print('但是可以在编码结果上使用word_ids函数得到每个编码对应的原文索引')
print('特殊标识的索引是None,很显然,因为特殊符号不来自于原文的任何位置')
# Compute the token -> word index mapping once and reuse it below.
word_ids = data_encode.word_ids(batch_index=0)
print(word_ids)

print('有了word_ids,就可以找到编码后的数字对应的label,可以做到一一对应')
print('特殊符号的label置为-100,这和模型的预训练情况有关')
# -100 is the default ignore_index of torch.nn.CrossEntropyLoss, so positions
# labelled -100 do not contribute to the loss.
# Use the configured task's tag column (same lookup as the preprocessing
# function) instead of a hard-coded 'ner_tags', and compare to None with `is`.
word_labels = data['%s_tags' % task]
label = [
    -100 if word_id is None else word_labels[word_id]
    for word_id in word_ids
]
print(label)

就算设置了is_split_into_words=True,编码后的数字和原文里的单词也不一定是一一对应的
以下面这个数据为例
{'id': '2', 'tokens': ['BRUSSELS', '1996-08-22'], 'pos_tags': [22, 11], 'chunk_tags': [11, 12], 'ner_tags': [5, 0]}
调用编码
可以看到原文里只有2个单词,但是编码后是7个词
{'input_ids': [[101, 9371, 2727, 1011, 5511, 1011, 2570, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1]]}
恢复成文本之后长这样
[CLS] brussels 1996 - 08 - 22 [SEP]
因为有这样的情况存在,所以编码后的数字和label不能做到一一对应
但是可以在编码结果上使用word_ids函数得到每个编码对应的原文索引
特殊标识的索引是None,很显然,因为特殊符号不来自于原文的任何位置
[None, 0, 1, 1, 1, 1, 1, None]
有了word_ids,就可以找到编码后的数字对应的label,可以做到一一对应
特殊符号的label置为-100,这和模型的预训练情况有关
[-100, 5, 0, 0, 0, 0, 0, -100]


In [5]:
# Batched preprocessing function derived from the illustrative code above.
def tokenize_and_align_labels(data):
    """Tokenize a batch of pre-split sentences and align labels to tokens.

    ``data`` is indexed by column name: ``data['tokens']`` holds the batch of
    word lists and ``data['<task>_tags']`` the per-word label ids, where
    ``<task>`` is the module-level ``task`` setting.

    Every token inherits the label of the word it came from; tokens whose
    word id is ``None`` are labelled ``-100``.

    Returns:
        The tokenizer output with an additional ``'labels'`` entry holding
        one label list per sentence.
    """
    # Tokenize the whole batch at once (with truncation enabled).
    data_encode = tokenizer.batch_encode_plus(data['tokens'],
                                              truncation=True,
                                              is_split_into_words=True)

    tag_key = '%s_tags' % task
    aligned = [
        [
            -100 if word_id is None else data[tag_key][row][word_id]
            for word_id in data_encode.word_ids(batch_index=row)
        ]
        for row in range(len(data['tokens']))
    ]
    data_encode['labels'] = aligned

    return data_encode


# Run the preprocessing in batches of 1000 and drop the raw columns so that
# only the tokenizer output and the aligned labels remain.
raw_columns = ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags']
dataset = dataset.map(tokenize_and_align_labels,
                      batched=True,
                      batch_size=1000,
                      num_proc=1,
                      remove_columns=raw_columns)

print(dataset['train'][0])

dataset

Loading cached processed dataset at datas/conll2003/train/cache-87e556de43d77835.arrow
Loading cached processed dataset at datas/conll2003/validation/cache-c304fc744fc5c076.arrow
Loading cached processed dataset at datas/conll2003/test/cache-46393ddde082627e.arrow


{'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]}


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14042
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3251
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3454
    })
})

In [6]:
import torch
from transformers import DataCollatorForTokenClassification

# Training data loader. The collator turns each list of examples into
# rectangular tensors (see the printed shapes below).
collate_fn = DataCollatorForTokenClassification(tokenizer)
loader = torch.utils.data.DataLoader(
    dataset['train'],
    batch_size=8,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn,
)

# Take a single batch for inspection.
data = next(iter(loader))

for name, tensor in data.items():
    print(name, tensor.shape, tensor[:2])

len(loader)

input_ids torch.Size([8, 27]) tensor([[ 101, 5548, 1017, 2624, 3799, 1014,  102,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0],
        [ 101, 1000, 1996, 2142, 2983, 3504, 2830, 2000, 1996, 2220, 8085, 1997,
         1996, 5036, 1010, 1000, 1996, 4861, 2056, 1012,  102,    0,    0,    0,
            0,    0,    0]])
attention_mask torch.Size([8, 27]) tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
         0, 0, 0]])
labels torch.Size([8, 27]) tensor([[-100,    3,    0,    3,    4,    0, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100],
        [-100,    0,    0,    5,    6,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0

1755

In [7]:
from transformers import AutoModelForTokenClassification, DistilBertModel

#加载模型
#model = AutoModelForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_name))


# Downstream task model: DistilBERT encoder plus a token-classification head.
class Model(torch.nn.Module):
    """DistilBERT encoder followed by a dropout + linear per-token classifier.

    Args:
        num_labels: Number of classes predicted for every token.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.pretrained = DistilBertModel.from_pretrained(
            'distilbert-base-uncased')

        # Take the encoder width from its config rather than hard-coding 768.
        hidden_size = self.pretrained.config.dim
        self.fc = torch.nn.Sequential(torch.nn.Dropout(0.1),
                                      torch.nn.Linear(hidden_size, num_labels))

        # Copy the classifier parameters from the stock token-classification
        # model. It must be built with the same ``num_labels`` as ``self.fc``
        # (not the global label list), otherwise load_state_dict fails with a
        # shape mismatch.
        # NOTE(review): the base checkpoint presumably ships no trained
        # classifier head, so this likely copies a freshly initialised layer -
        # confirm whether this step is needed at all.
        parameters = AutoModelForTokenClassification.from_pretrained(
            'distilbert-base-uncased', num_labels=num_labels)
        self.fc[1].load_state_dict(parameters.classifier.state_dict())

        # CrossEntropyLoss ignores targets equal to -100 by default, which is
        # the value used for special and padding positions.
        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and classifier.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional per-token targets; when given, the loss is
                computed over all positions flattened together.

        Returns:
            dict with ``'loss'`` (``None`` when ``labels`` is ``None``) and
            ``'logits'`` (per-token class scores).
        """
        encoded = self.pretrained(input_ids=input_ids,
                                  attention_mask=attention_mask)
        hidden = encoded.last_hidden_state

        logits = self.fc(hidden)

        loss = None
        if labels is not None:
            # Merge the first two dimensions so every token is one sample.
            loss = self.criterion(logits.flatten(end_dim=1), labels.flatten())

        return {'loss': loss, 'logits': logits}


model = Model(num_labels=len(label_name))

# Parameter count, reported in units of 10,000 parameters.
param_count = sum(p.numel() for p in model.parameters())
print(param_count / 10000)

# Smoke-test a forward pass on the batch currently held in `data`.
out = model(**data)

out['loss'], out['logits'].shape

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_proje

6636.9801


(tensor(2.3528, grad_fn=<NllLossBackward0>), torch.Size([8, 27, 9]))

In [8]:
from datasets import load_metric

#加载评价函数
#metric = load_metric('seqeval')

#要计算必须把label转换成名字形态,有点蛋疼
#metric.compute(predictions=[label_name], references=[label_name])

In [9]:
# Evaluation
def test():
    """Estimate token-level accuracy of the global ``model`` on the test split.

    Evaluates at most 51 shuffled batches of 16 examples. For every sequence
    the padded positions and the first and last remaining tokens (the special
    tokens) are dropped; the printed result is the fraction of the remaining
    tokens whose predicted label equals the target. The batch index is
    printed every 10 batches as a progress indicator.
    """
    model.eval()

    # Test data loader. Shuffling together with the early break below means
    # each call scores a random subset of the test split.
    loader_test = torch.utils.data.DataLoader(
        dataset=dataset['test'],
        batch_size=16,
        collate_fn=DataCollatorForTokenClassification(tokenizer),
        shuffle=True,
        drop_last=True,
    )

    labels = []
    outs = []
    for i, data in enumerate(loader_test):
        # Forward pass without gradient tracking.
        with torch.no_grad():
            out = model(**data)

        out = out['logits'].argmax(dim=2)

        # Walk over the rows actually present in the batch instead of
        # repeating the loader's batch size as a literal.
        for j in range(len(out)):
            # Keep only non-padded positions, then drop the first and last
            # token, which are the special tokens.
            select = data['attention_mask'][j] == 1
            labels.append(data['labels'][j][select][1:-1])
            outs.append(out[j][select][1:-1])

        if i % 10 == 0:
            print(i)

        if i == 50:
            break

    # seqeval-based metrics are disabled here (the metric is never loaded in
    # this notebook). They require label names rather than ids:
    #
    # labels_name = [[label_name[j] for j in i] for i in labels]
    # outs_name = [[label_name[j] for j in i] for i in outs]
    #
    # metric_out = metric.compute(predictions=[outs_name],
    #                             references=[labels_name])
    #
    # for k in [
    #         'overall_precision', 'overall_recall', 'overall_f1',
    #         'overall_accuracy'
    # ]:
    #     print(k, metric_out[k])

    # Token-level accuracy over everything collected.
    labels = torch.cat(labels)
    outs = torch.cat(outs)

    print((labels == outs).sum().item() / len(labels))


# Evaluate once before train() is run to get a baseline accuracy.
test()

0
10
20
30
40
50
0.027704584807648876


In [10]:
from transformers import AdamW
from transformers.optimization import get_scheduler


# Training
def train():
    """Fine-tune the global ``model`` for one pass over ``loader``.

    Uses AdamW (lr 2e-5) with a linear schedule and no warm-up steps, and
    clips the gradient norm to 1.0. Every 50 steps it prints the step index,
    the loss, the token accuracy of the current batch (padding and the
    first/last token of each sequence excluded) and the current learning
    rate. Afterwards the whole model object is saved with ``torch.save``
    under ``models/``.
    """
    import os  # local import: only needed to create the output directory

    # NOTE(review): AdamW imported from transformers is deprecated in newer
    # transformers releases in favour of torch.optim.AdamW - confirm against
    # the installed version before upgrading.
    optimizer = AdamW(model.parameters(), lr=2e-5)
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,
                              num_training_steps=len(loader),
                              optimizer=optimizer)

    model.train()
    for i, data in enumerate(loader):
        out = model(**data)
        loss = out['loss']

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # The optimizer was built from model.parameters(), so this clears
        # every gradient; a separate model.zero_grad() would be redundant.
        optimizer.zero_grad()

        if i % 50 == 0:
            labels = []
            outs = []
            out = out['logits'].argmax(dim=2)
            # Walk over the rows actually present in the batch instead of
            # repeating the loader's batch size as a literal.
            for j in range(len(out)):
                # Keep only non-padded positions, then drop the first and
                # last token, which are the special tokens.
                select = data['attention_mask'][j] == 1
                labels.append(data['labels'][j][select][1:-1])
                outs.append(out[j][select][1:-1])

            # seqeval-based metrics are disabled: per the original note they
            # printed a large number of warnings, so they are skipped. They
            # require label names rather than ids:
            #
            # labels_name = [[label_name[j] for j in i] for i in labels]
            # outs_name = [[label_name[j] for j in i] for i in outs]
            #
            # metric_out = metric.compute(predictions=[outs_name],
            #                             references=[labels_name])
            #
            # for k in [
            #         'overall_precision', 'overall_recall', 'overall_f1',
            #         'overall_accuracy'
            # ]:
            #     print(k, metric_out[k])

            # Token-level accuracy on the current batch.
            labels = torch.cat(labels)
            outs = torch.cat(outs)
            accuracy = (labels == outs).sum().item() / len(labels)

            # Read the learning rate directly from the parameter group
            # instead of materialising the full optimizer state dict.
            lr = optimizer.param_groups[0]['lr']

            print(i, loss.item(), accuracy, lr)

    # Create the target directory first so a finished training run is not
    # lost to a missing-folder error at save time.
    save_path = 'models/6.命名实体识别_%s.model' % task
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    torch.save(model, save_path)


# Run one pass over the training loader; the model is saved at the end.
train()



0 2.3589563369750977 0.0379746835443038 1.998860398860399e-05
50 0.9524945020675659 0.652542372881356 1.941880341880342e-05
100 0.3112027943134308 0.9279279279279279 1.8849002849002852e-05
150 0.30391865968704224 0.925 1.827920227920228e-05
200 0.106633260846138 0.9844961240310077 1.770940170940171e-05
250 0.19684122502803802 0.9523809523809523 1.713960113960114e-05
300 0.18589724600315094 0.9150943396226415 1.6569800569800573e-05
350 0.20196762681007385 0.912621359223301 1.6000000000000003e-05
400 0.09656360000371933 0.9636363636363636 1.5430199430199432e-05
450 0.04225413128733635 0.9852941176470589 1.4860398860398862e-05
500 0.06595474481582642 0.9791666666666666 1.4290598290598293e-05
550 0.1935502141714096 0.959349593495935 1.3720797720797722e-05
600 0.062100253999233246 0.98 1.3150997150997152e-05
650 0.12392968684434891 0.9672131147540983 1.2581196581196581e-05
700 0.04459675773978233 0.9878048780487805 1.2011396011396012e-05
750 0.1767694503068924 0.9523809523809523 1.144159544

In [11]:
# Reload the model object written by train() and evaluate it again.
# NOTE(review): torch.load unpickles the whole model object - only load files
# from a trusted source.
model = torch.load('models/6.命名实体识别_%s.model' % task)
test()

0
10
20
30
40
50
0.9709837743020027
