In [1]:
# Global switch read by later cells: when True they rebuild the tokenized
# dataset and run fine-tuning. The actual Hub upload calls in those cells are
# commented out, so nothing is pushed regardless of this value.
push_to_hub = True

In [2]:
from transformers import AutoTokenizer

# Load the fast tokenizer that belongs to the DistilBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    'distilbert-base-uncased',
    use_fast=True,
)

print(tokenizer)

# Trial encoding of two sentences that are already split into words; the
# resulting encoding is the last expression, so the notebook displays it.
tokenizer.batch_encode_plus(
    [
        ['Hello', ',', 'this', 'is', 'first', 'sentence', 'split', 'into',
         'words', '.'],
        ['This', 'is', 'second', 'sentence', 'split', 'into', 'words', '.'],
    ],
    is_split_into_words=True,
)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

PreTrainedTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


{'input_ids': [[101, 7592, 1010, 2023, 2003, 2034, 6251, 3975, 2046, 2616, 1012, 102], [101, 2023, 2003, 2117, 6251, 3975, 2046, 2616, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [5]:
from datasets import load_dataset


def get_dataset():
    """Load CoNLL-2003 and convert it into tokenized, label-aligned features.

    Prints the raw dataset and its first training row, then maps every split
    through the module-level ``tokenizer``. The original columns are dropped,
    and the encoded fields plus a ``labels`` column remain.

    Returns:
        The mapped dataset object returned by ``dataset.map``.
    """
    dataset = load_dataset(path='conll2003')

    print('查看数据样例')
    print(dataset, dataset['train'][0])

    def tokenize_and_align_labels(data):
        """Tokenize one batch of pre-split sentences and align the NER tags.

        Positions without a source word (``word_id is None``) get the label
        -100; every other token position receives the tag of the word it
        came from.
        """
        data_encode = tokenizer.batch_encode_plus(data['tokens'],
                                                  is_split_into_words=True,
                                                  truncation=True)

        tags_per_row = data['ner_tags']
        data_encode['labels'] = [
            [
                -100 if word_id is None else tags_per_row[row][word_id]
                for word_id in data_encode.word_ids(batch_index=row)
            ]
            for row in range(len(data['tokens']))
        ]

        return data_encode

    # Columns of the raw dataset that are no longer needed after encoding.
    raw_columns = ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags']

    dataset = dataset.map(
        tokenize_and_align_labels,
        remove_columns=raw_columns,
        batched=True,
        batch_size=1000,
        num_proc=1,
    )

    return dataset


# Directory holding the on-disk copy of the tokenized dataset.
dataset_dir = 'c:\\temp\\231017000028\\src\\dataset\\'

if push_to_hub:
    # Rebuild the tokenized dataset and store a local copy of it.
    dataset = get_dataset()
    #dataset.push_to_hub(repo_id=repo_id, token=hub_token)
    dataset.save_to_disk(dataset_dict_path=dataset_dir)
else:
    # Without this branch `dataset` would be undefined on a fresh kernel and
    # the cell would fail with NameError; reuse the copy saved by an earlier
    # run instead. It is not saved again, since it was just read from there.
    from datasets import load_from_disk

    dataset = load_from_disk(dataset_dir)

# Alternative: use the already-processed dataset published on the Hub
#dataset = load_dataset(path=repo_id)
print(dataset, dataset['train'][0])

Reusing dataset conll2003 (C:\Users\Administrator\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

查看数据样例
DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
}) {'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}


  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
}) {'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]}


In [6]:
import torch
from transformers import DataCollatorForTokenClassification

# Training data loader: shuffled batches of 8, padded per batch by the
# token-classification collator; the incomplete final batch is dropped.
loader = torch.utils.data.DataLoader(
    dataset['train'],
    collate_fn=DataCollatorForTokenClassification(tokenizer),
    batch_size=8,
    drop_last=True,
    shuffle=True,
)

# Grab the first batch only, to inspect what the loader yields.
i, data = next(enumerate(loader))

for k, v in data.items():
    print(k, v.shape, v[:2])

# Number of batches per epoch (displayed as the cell result).
len(loader)

input_ids torch.Size([8, 54]) tensor([[  101, 14824,  1005,  1055,  7794,  7920,  2003,  2025,  2092,  2438,
          2000,  5463,  2021,  1037,  4471,  2013,  2014,  2097,  2022,  3191,
          2041,  2011,  1996, 19938,  1005,  1055,  2882,  1011,  2684, 17508,
          6229,  2386,  2076,  1996,  3116,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0],
        [  101,  7367,  4244,  1010,  5479,  1011,  2039,  2000, 22160,  2197,
          2095,  1010,  2003, 13916,  2000,  2448,  2046,  3587,  1011,  4396,
          2446,  2019,  3489,  9594,  2121,  1999,  1996,  4284,  1011,  4399,
          2007,  2959,  6534,  9530,  5428,  2696, 10337,  2030,  5964,  1011,
         13916,  4386,  3410, 12110, 16273,  2559,  2066,  2014,  2087,  3497,
         16797,  7892,  1012,   102]])
attention_mask torch.Size([8, 54]) tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

1755

In [7]:
from transformers import AutoModelForTokenClassification, DistilBertModel, PreTrainedModel, PretrainedConfig

#加载模型
#model = AutoModelForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_name))

#定义下游任务模型
class Model(PreTrainedModel):
    """DistilBERT encoder followed by a dropout + linear per-token classifier.

    ``forward`` returns a dict with ``logits`` and ``loss``; ``loss`` is
    ``None`` when no labels are supplied.
    """

    config_class = PretrainedConfig

    def __init__(self, config):
        """Build the encoder, the classification head and the loss."""
        super().__init__(config)

        # Encoder body, loaded from the checkpoint.
        self.pretrained = DistilBertModel.from_pretrained(
            'distilbert-base-uncased')

        # Classification head: dropout, then a 768 -> 9 projection.
        # 9 = len(dataset['train'].features['ner_tags'].feature.names)
        self.fc = torch.nn.Sequential(
            torch.nn.Dropout(0.1),
            torch.nn.Linear(768, 9),
        )

        # Initialise the linear layer from the `classifier` module of the
        # stock token-classification model built from the same checkpoint.
        reference = AutoModelForTokenClassification.from_pretrained(
            'distilbert-base-uncased', num_labels=9)
        classifier_state = reference.classifier.state_dict()
        self.fc[1].load_state_dict(classifier_state)

        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and head; compute the loss if labels are given.

        Args:
            input_ids: token ids passed through to the encoder.
            attention_mask: attention mask passed through to the encoder.
            labels: optional target labels; flattened together with the
                logits' first two dimensions before the loss is computed.

        Returns:
            ``{'loss': loss_or_None, 'logits': logits}``.
        """
        encoded = self.pretrained(input_ids=input_ids,
                                  attention_mask=attention_mask)
        logits = self.fc(encoded.last_hidden_state)

        if labels is None:
            loss = None
        else:
            # Merge the first two dimensions so each position is one sample.
            flat_logits = logits.flatten(end_dim=1)
            flat_labels = labels.flatten()
            loss = self.criterion(flat_logits, flat_labels)

        return {'loss': loss, 'logits': logits}


model = Model(PretrainedConfig())

# Total parameter count, reported in units of 10,000.
print(sum(map(torch.numel, model.parameters())) / 10000)

# Sanity check: one forward pass over the sample batch taken from `loader`.
out = model(**data)

out['loss'], out['logits'].shape

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_trans

6636.9801


(tensor(2.3947, grad_fn=<NllLossBackward0>), torch.Size([8, 54, 9]))

In [10]:
#测试
def test(batch_size=16, last_step=50):
    """Print the token-level accuracy of the global ``model`` on test data.

    A shuffled loader over ``dataset['test']`` is built on every call, so the
    batches evaluated differ from call to call. Padding positions and the
    first and last non-padded token of each sentence are excluded from the
    accuracy.

    Args:
        batch_size: number of sentences per evaluation batch.
        last_step: index of the last batch to evaluate (inclusive); the loop
            stops after it, so at most ``last_step + 1`` batches are used.

    Prints the batch index every 10 batches and finally the accuracy.
    """
    model.eval()

    # Evaluate on whatever device the model currently lives on, so the call
    # also works when the model has not been moved back to the CPU.
    device = next(model.parameters()).device

    # Data loader over the test split.
    loader_test = torch.utils.data.DataLoader(
        dataset=dataset['test'],
        batch_size=batch_size,
        collate_fn=DataCollatorForTokenClassification(tokenizer),
        shuffle=True,
        drop_last=True,
    )

    labels = []
    outs = []
    for i, data in enumerate(loader_test):
        data = {k: v.to(device) for k, v in data.items()}

        # Forward pass without gradient tracking.
        with torch.no_grad():
            out = model(**data)

        out = out['logits'].argmax(dim=2)

        # Iterate over the rows actually present in this batch instead of a
        # hard-coded count that must be kept in sync with the loader.
        for j in range(out.shape[0]):
            # Keep only non-padded positions via the attention mask, then
            # drop the special tokens at both ends of the sentence.
            select = data['attention_mask'][j] == 1
            labels.append(data['labels'][j][select][1:-1])
            outs.append(out[j][select][1:-1])

        if i % 10 == 0:
            print(i)

        if i == last_step:
            break

    # Accuracy over all collected token positions.
    labels = torch.cat(labels)
    outs = torch.cat(outs)

    print((labels == outs).sum().item() / len(labels))


# Run the evaluation on the current state of `model`.
test()

0
10
20
30
40
50
0.9699435028248587


In [9]:
from transformers import AdamW
from transformers.optimization import get_scheduler


#训练
def train():
    """Fine-tune the global ``model`` for one pass over the global ``loader``.

    Uses AdamW (lr 2e-5) with a linear schedule and no warm-up, clips the
    gradient norm to 1.0, and trains on CUDA when available. Every 50 steps
    it prints the step index, the loss, the token accuracy of the current
    batch (padding and the first/last non-padded token excluded) and the
    learning rate. The model is moved back to the CPU when the function
    exits, including when training is interrupted by an exception.
    """
    # NOTE(review): `AdamW` is the name imported from `transformers`, where it
    # is deprecated in favour of `torch.optim.AdamW`; the two have different
    # defaults, so migrate deliberately rather than by swapping the import.
    optimizer = AdamW(model.parameters(), lr=2e-5)
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,
                              num_training_steps=len(loader),
                              optimizer=optimizer)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.train()
    model.to(device)
    try:
        for i, data in enumerate(loader):
            data = {k: v.to(device) for k, v in data.items()}

            out = model(**data)
            loss = out['loss']

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            # The optimizer was built from model.parameters(), so this clears
            # every gradient; a separate model.zero_grad() is not needed.
            optimizer.zero_grad()

            if i % 50 == 0:
                labels = []
                outs = []
                preds = out['logits'].argmax(dim=2)

                # Use the real number of rows in the batch instead of a
                # hard-coded count tied to the loader's batch size.
                for j in range(preds.shape[0]):
                    # Keep only non-padded positions via the attention mask,
                    # then drop the special tokens at both ends.
                    select = data['attention_mask'][j] == 1
                    labels.append(data['labels'][j][select][1:-1])
                    outs.append(preds[j][select][1:-1])

                # Accuracy on the current batch.
                labels = torch.cat(labels)
                outs = torch.cat(outs)
                accuracy = (labels == outs).sum().item() / len(labels)

                lr = optimizer.param_groups[0]['lr']

                print(i, loss.item(), accuracy, lr)
    finally:
        # Always release the GPU copy, even if the loop was interrupted.
        model.to('cpu')


# Fine-tune only when the global switch is on; the Hub upload of the model
# below is left commented out.
if push_to_hub:
    train()
    #model.push_to_hub(repo_id=repo_id, use_auth_token=hub_token)



0 2.399038553237915 0.024390243902439025 1.998860398860399e-05
50 0.5870899558067322 0.8269230769230769 1.941880341880342e-05
100 0.51480633020401 0.8372093023255814 1.8849002849002852e-05
150 0.13835808634757996 0.9583333333333334 1.827920227920228e-05
200 0.13117648661136627 0.9669421487603306 1.770940170940171e-05
250 0.23385891318321228 0.94 1.713960113960114e-05
300 0.1472039669752121 0.9512195121951219 1.6569800569800573e-05
350 0.19199655950069427 0.9505494505494505 1.6000000000000003e-05
400 0.0964977890253067 0.9759036144578314 1.5430199430199432e-05
450 0.5718852877616882 0.8689655172413793 1.4860398860398862e-05
500 0.14928022027015686 0.9669421487603306 1.4290598290598293e-05
550 0.04728260636329651 0.9913793103448276 1.3720797720797722e-05
600 0.06350374221801758 0.9777777777777777 1.3150997150997152e-05
650 0.17678901553153992 0.9712230215827338 1.2581196581196581e-05
700 0.12549585103988647 0.96875 1.2011396011396012e-05
750 0.06563395261764526 0.9820627802690582 1.14415

In [11]:
# Run the evaluation again; the test loader shuffles, so the batches sampled
# (and therefore the printed accuracy) can differ between calls.
test()

0
10
20
30
40
50
0.9717212202182142
