In [1]:
# Global configuration
# Access token for the Hub, read from a local file. The context manager closes
# the file handle deterministically instead of leaving it to garbage collection.
with open('/root/hub_token.txt') as token_file:
    hub_token = token_file.read().strip()
# Hub repository id; used further down for the processed dataset and the model
repo_id = 'lansinuote/nlp.6.named_entity_recognition'
# When True, the dataset is rebuilt and pushed, and the model is trained and pushed
push_to_hub = True

In [2]:
from transformers import AutoTokenizer

# Load the fast tokenizer that belongs to the DistilBERT checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    'distilbert-base-uncased',
    use_fast=True,
)

print(tokenizer)

# Trial encoding: every inner list is one sentence that is already split into
# words, which is why is_split_into_words=True is passed
tokenizer.batch_encode_plus(
    [
        [
            'Hello', ',', 'this', 'is', 'first', 'sentence', 'split', 'into',
            'words', '.'
        ],
        ['This', 'is', 'second', 'sentence', 'split', 'into', 'words', '.'],
    ],
    is_split_into_words=True,
)

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


{'input_ids': [[101, 7592, 1010, 2023, 2003, 2034, 6251, 3975, 2046, 2616, 1012, 102], [101, 2023, 2003, 2117, 6251, 3975, 2046, 2616, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [3]:
# The already processed dataset can be used directly, so this cell may be skipped.
# The code below only illustrates why labels must be re-aligned after tokenization.
# NOTE(review): in IPython `_` usually refers to the last cell output; the name
# is kept so that the cell's interface stays unchanged.
def _():
    """Demonstrate word-to-token label alignment on one sample record.

    Encodes a two-word record (same keys as the conll2003 rows) with the
    module-level ``tokenizer``, prints every intermediate result, and finally
    prints one label per produced token, using ``-100`` for special tokens.
    Returns nothing.
    """
    print('就算设置了is_split_into_words=True,编码后的数字和原文里的单词也不一定是一一对应的')
    print('以下面这个数据为例')
    data = {
        'id': '2',
        'tokens': ['BRUSSELS', '1996-08-22'],
        'pos_tags': [22, 11],
        'chunk_tags': [11, 12],
        'ner_tags': [5, 0]
    }
    print(data)

    print('调用编码')
    data_encode = tokenizer.batch_encode_plus([data['tokens']],
                                              is_split_into_words=True)

    # Report the real counts instead of a hard-coded number, so the message
    # cannot disagree with the encoding printed right below it.
    n_words = len(data['tokens'])
    n_ids = len(data_encode['input_ids'][0])
    print(f'可以看到原文里只有{n_words}个单词,但是编码后是{n_ids}个词')
    print(data_encode)

    print('恢复成文本之后长这样')
    print(tokenizer.decode(data_encode['input_ids'][0]))

    print('因为有这样的情况存在,所以编码后的数字和label不能做到一一对应')
    print('但是可以在编码结果上使用word_ids函数得到每个编码对应的原文索引')
    print('特殊标识的索引是None,很显然,因为特殊符号不来自于原文的任何位置')
    # For every produced token: index of the source word, or None for a
    # special token. Computed once and reused for the label lookup below.
    word_ids = data_encode.word_ids(batch_index=0)
    print(word_ids)

    print('有了word_ids,就可以找到编码后的数字对应的label,可以做到一一对应')
    print('特殊符号的label置为-100,这和模型的预训练情况有关')
    # -100 is the default ignore_index of torch.nn.CrossEntropyLoss, so these
    # positions are excluded from the loss.
    label = [
        -100 if word_id is None else data['ner_tags'][word_id]
        for word_id in word_ids
    ]
    print(label)


_()

就算设置了is_split_into_words=True,编码后的数字和原文里的单词也不一定是一一对应的
以下面这个数据为例
{'id': '2', 'tokens': ['BRUSSELS', '1996-08-22'], 'pos_tags': [22, 11], 'chunk_tags': [11, 12], 'ner_tags': [5, 0]}
调用编码
可以看到原文里只有2个单词,但是编码后是7个词
{'input_ids': [[101, 9371, 2727, 1011, 5511, 1011, 2570, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1]]}
恢复成文本之后长这样
[CLS] brussels 1996 - 08 - 22 [SEP]
因为有这样的情况存在,所以编码后的数字和label不能做到一一对应
但是可以在编码结果上使用word_ids函数得到每个编码对应的原文索引
特殊标识的索引是None,很显然,因为特殊符号不来自于原文的任何位置
[None, 0, 1, 1, 1, 1, 1, None]
有了word_ids,就可以找到编码后的数字对应的label,可以做到一一对应
特殊符号的label置为-100,这和模型的预训练情况有关
[-100, 5, 0, 0, 0, 0, 0, -100]


In [4]:
from datasets import load_dataset


def get_dataset():
    """Build the tokenized, label-aligned dataset from conll2003.

    Loads ``conll2003``, prints the dataset summary and its first training
    row, then maps every split through ``tokenize_and_align_labels`` using the
    module-level ``tokenizer``. The five original columns are removed.

    Returns:
        The result of ``dataset.map`` (tokenizer outputs plus ``labels``).
    """
    raw_dataset = load_dataset(path='conll2003')

    print('查看数据样例')
    print(raw_dataset, raw_dataset['train'][0])

    def tokenize_and_align_labels(data):
        """Tokenize a batch of pre-split sentences and attach per-token labels."""
        encoded = tokenizer.batch_encode_plus(
            data['tokens'],
            truncation=True,
            is_split_into_words=True,
        )

        def labels_for(row):
            # Special tokens have no source word (word id None) and get -100;
            # every other token takes the NER tag of the word it came from.
            return [
                -100 if word_id is None else data['ner_tags'][row][word_id]
                for word_id in encoded.word_ids(batch_index=row)
            ]

        encoded['labels'] = [
            labels_for(row) for row in range(len(data['tokens']))
        ]
        return encoded

    original_columns = ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags']
    return raw_dataset.map(
        tokenize_and_align_labels,
        batched=True,
        batch_size=1000,
        num_proc=1,
        remove_columns=original_columns,
    )


# Optionally rebuild the processed dataset and publish it under repo_id
if push_to_hub:
    get_dataset().push_to_hub(repo_id=repo_id, token=hub_token)

# Use the already processed dataset straight from the Hub
dataset = load_dataset(path=repo_id)

# Show the split summary and the first processed training row
print(dataset, dataset['train'][0])

Found cached dataset conll2003 (/root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

查看数据样例
DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
}) {'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}


  0%|          | 0/15 [00:00<?, ?ba/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-5642b953c0a4ff28.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-652229bad444d43f.arrow
Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split validation to the Hub.
Resuming upload of the dataset shards.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.
Resuming upload of the dataset shards.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading readme:   0%|          | 0.00/586 [00:00<?, ?B/s]

Using custom data configuration lansinuote--nlp.6.named_entity_recognition-a0ecf5001aea9e74


Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/lansinuote___parquet/lansinuote--nlp.6.named_entity_recognition-a0ecf5001aea9e74/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/194k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/774k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/184k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/lansinuote___parquet/lansinuote--nlp.6.named_entity_recognition-a0ecf5001aea9e74/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
}) {'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]}


In [5]:
import torch
from transformers import DataCollatorForTokenClassification

# Training data loader. The collator pads every field of a batch to a common
# length (see the shapes printed below).
loader = torch.utils.data.DataLoader(
    dataset=dataset['train'],
    batch_size=8,
    collate_fn=DataCollatorForTokenClassification(tokenizer),
    shuffle=True,
    drop_last=True,
)

# Fetch a single batch for inspection. next(iter(...)) fails immediately on an
# empty loader instead of silently leaving `data` undefined, and it does not
# leave an unused loop index behind.
data = next(iter(loader))

# Show name, shape and the first two rows of every tensor in the batch
for k, v in data.items():
    print(k, v.shape, v[:2])

# Number of batches per epoch (displayed as the cell result)
len(loader)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids torch.Size([8, 35]) tensor([[ 101, 5865, 1018, 3190, 1017,  102,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
        [ 101, 6278, 1022, 5395, 1020,  102,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0]])
attention_mask torch.Size([8, 35]) tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
labels torch.Size([8, 35]) tensor([[-100,    3,    0,    3,    0, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100,

1755

In [6]:
from transformers import AutoModelForTokenClassification, DistilBertModel, PreTrainedModel, PretrainedConfig

# Stock alternative for loading a ready-made model (kept for reference, unused):
#model = AutoModelForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_name))

# Hand-written downstream-task model
class Model(PreTrainedModel):
    """DistilBERT encoder followed by a dropout + linear per-token classifier.

    ``forward`` returns a dict holding the per-token ``logits`` and the
    cross-entropy ``loss``; the loss is ``None`` when no labels are passed.
    """
    config_class = PretrainedConfig

    def __init__(self, config):
        """Build the encoder, the classification head and the loss."""
        super().__init__(config)

        # Encoder loaded from the public checkpoint
        self.pretrained = DistilBertModel.from_pretrained(
            'distilbert-base-uncased')

        # Classification head: 768 input features -> 9 classes.
        # Original author's note: 9 = len(dataset['train'].features['ner_tags'].feature.names)
        dropout = torch.nn.Dropout(0.1)
        classifier = torch.nn.Linear(768, 9)
        self.fc = torch.nn.Sequential(dropout, classifier)

        # Copy the classifier weights of the stock token-classification model
        # into our own linear layer.
        # NOTE(review): whether that head carries trained weights in this
        # checkpoint is not visible from here - confirm.
        reference = AutoModelForTokenClassification.from_pretrained(
            'distilbert-base-uncased', num_labels=9)
        classifier.load_state_dict(reference.classifier.state_dict())

        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and the head; compute the loss if labels are given.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: mask passed to the encoder.
            labels: optional per-token class ids.

        Returns:
            ``{'loss': loss_or_None, 'logits': logits}``.
        """
        encoder_out = self.pretrained(input_ids=input_ids,
                                      attention_mask=attention_mask)
        logits = self.fc(encoder_out.last_hidden_state)

        if labels is None:
            loss = None
        else:
            # Merge the first two axes of the logits and flatten the labels so
            # that the criterion sees one row per token.
            per_token_logits = logits.flatten(end_dim=1)
            loss = self.criterion(per_token_logits, labels.flatten())

        return {'loss': loss, 'logits': logits}


# Instantiate the model with a default config
model = Model(PretrainedConfig())

# Parameter count, reported in units of 10,000
print(sum(param.numel() for param in model.parameters()) / 10000)

# Smoke test: one forward pass on the sample batch fetched earlier
out = model(**data)

# Display the loss and the logits shape as the cell result
(out['loss'], out['logits'].shape)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_

6636.9801


(tensor(2.2111, grad_fn=<NllLossBackward0>), torch.Size([8, 35, 9]))

In [7]:
# Evaluation
def test():
    """Print the token-level accuracy on a sample of the test split.

    Uses the module-level ``model``, ``dataset`` and ``tokenizer``. Evaluates
    the first 51 shuffled batches of 16 sentences. For every sentence the
    padding positions and the first/last (special) token are dropped before
    predictions are compared with the labels. Prints a progress counter every
    10 batches and finally the fraction of correct tokens. Returns nothing.
    """
    model.eval()
    # Run the batches on whatever device the model currently lives on, so the
    # function also works when the model was left on the GPU.
    device = next(model.parameters()).device

    # Test data loader
    loader_test = torch.utils.data.DataLoader(
        dataset=dataset['test'],
        batch_size=16,
        collate_fn=DataCollatorForTokenClassification(tokenizer),
        shuffle=True,
        drop_last=True,
    )

    labels = []
    outs = []
    for i, data in enumerate(loader_test):
        data = {k: v.to(device) for k, v in data.items()}

        # Inference only, no gradients needed
        with torch.no_grad():
            out = model(**data)

        # Predicted class id per token
        out = out['logits'].argmax(dim=2)

        # Iterate over the rows actually present in the batch instead of a
        # hard-coded count that has to mirror batch_size.
        for mask_row, label_row, pred_row in zip(data['attention_mask'],
                                                 data['labels'], out):
            # Keep only non-padding positions, then strip the leading and
            # trailing special tokens, which need no prediction.
            select = mask_row == 1
            labels.append(label_row[select][1:-1])
            outs.append(pred_row[select][1:-1])

        if i % 10 == 0:
            print(i)

        # Stop after batch index 50 to keep the evaluation short
        if i == 50:
            break

    # Accuracy over all collected tokens
    labels = torch.cat(labels)
    outs = torch.cat(outs)

    print((labels == outs).sum().item() / len(labels))


test()

0
10
20
30
40
50
0.060054857802800635


In [8]:
from transformers import AdamW
from transformers.optimization import get_scheduler


# Training
def train():
    """Fine-tune the module-level ``model`` for one pass over ``loader``.

    Uses AdamW (lr 2e-5) with a linear schedule without warm-up, clips the
    gradient norm to 1.0, and every 50 steps prints the step index, the loss,
    the token accuracy of the current batch and the current learning rate.
    The model is moved to CUDA when available and back to the CPU at the end.
    Returns nothing.
    """
    # NOTE(review): transformers' AdamW is reportedly deprecated in favour of
    # torch.optim.AdamW - confirm before upgrading the library.
    optimizer = AdamW(model.parameters(), lr=2e-5)
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,
                              num_training_steps=len(loader),
                              optimizer=optimizer)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.train()
    model.to(device)
    for i, data in enumerate(loader):
        data = {k: v.to(device) for k, v in data.items()}

        out = model(**data)
        loss = out['loss']

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # The optimizer was built from model.parameters(), so this clears
        # every gradient of the model; a second model.zero_grad() is redundant.
        optimizer.zero_grad()

        if i % 50 == 0:
            labels = []
            outs = []
            preds = out['logits'].argmax(dim=2)
            # Iterate over the rows actually present in the batch instead of
            # a hard-coded count that has to mirror the loader's batch_size.
            for mask_row, label_row, pred_row in zip(data['attention_mask'],
                                                     data['labels'], preds):
                # Keep only non-padding positions, then strip the leading and
                # trailing special tokens, which need no prediction.
                select = mask_row == 1
                labels.append(label_row[select][1:-1])
                outs.append(pred_row[select][1:-1])

            # Accuracy on the current batch
            labels = torch.cat(labels)
            outs = torch.cat(outs)
            accuracy = (labels == outs).sum().item() / len(labels)

            # Read the learning rate directly; no need to build a full
            # state_dict copy for a single value.
            lr = optimizer.param_groups[0]['lr']

            print(i, loss.item(), accuracy, lr)

    model.to('cpu')


if push_to_hub:
    train()
    model.push_to_hub(repo_id=repo_id, use_auth_token=hub_token)



0 2.1965272426605225 0.12121212121212122 1.998860398860399e-05
50 0.48114097118377686 0.8389830508474576 1.941880341880342e-05
100 0.4562705457210541 0.8601398601398601 1.8849002849002852e-05
150 0.2223113477230072 0.9166666666666666 1.827920227920228e-05
200 0.0960574522614479 0.9793103448275862 1.770940170940171e-05
250 0.12752580642700195 0.9477611940298507 1.713960113960114e-05
300 0.14465288817882538 0.9716981132075472 1.6569800569800573e-05
350 0.0917036160826683 0.9774436090225563 1.6000000000000003e-05
400 0.03730904683470726 0.9937106918238994 1.5430199430199432e-05
450 0.117700956761837 0.9852941176470589 1.4860398860398862e-05
500 0.10514380782842636 0.9803921568627451 1.4290598290598293e-05
550 0.1947314739227295 0.9603960396039604 1.3720797720797722e-05
600 0.15881414711475372 0.9622641509433962 1.3150997150997152e-05
650 0.184824138879776 0.9182389937106918 1.2581196581196581e-05
700 0.09858043491840363 0.9647887323943662 1.2011396011396012e-05
750 0.005401435773819685 1.

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/266M [00:00<?, ?B/s]

In [9]:
# Use the already trained model directly: load the checkpoint stored under repo_id
model = Model.from_pretrained(repo_id)

# Re-run the evaluation; test() reads the module-level `model` rebound above
test()

Downloading (…)lve/main/config.json:   0%|          | 0.00/105 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/266M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_

0
10
20
30
40
50
0.973296013314169
