In [1]:
from transformers import AutoTokenizer

# Load the tokenizer for the 'distilbert-base-uncased' checkpoint.
# use_fast=True requests the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased',
                                          use_fast=True)

print(tokenizer)

# Trial encoding: run two sample sentences through the tokenizer to inspect
# the structure it returns (shown as the cell's output).
tokenizer.batch_encode_plus([
    'hide new secretions from the parental units',
    'contains no wit , only labored gags'
])

PreTrainedTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


{'input_ids': [[101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102], [101, 3397, 2053, 15966, 1010, 2069, 4450, 2098, 18201, 2015, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [2]:
from datasets import load_dataset, load_from_disk

# Load the data from a local on-disk copy.
# Alternative kept for reference (loads GLUE/CoLA by name instead):
#dataset = load_dataset(path='glue', name='cola')
dataset = load_from_disk('datas/glue/cola')


# Tokenize; the redundant columns are dropped by the map() call that uses f.
def f(examples):
    """Tokenize one batch of examples.

    Args:
        examples: mapping whose 'sentence' entry holds the texts of the
            batch (this is what a batched ``dataset.map`` passes in).

    Returns:
        Whatever ``tokenizer.batch_encode_plus`` returns for those texts
        with truncation enabled.
    """
    sentences = examples['sentence']
    encoded = tokenizer.batch_encode_plus(sentences, truncation=True)
    return encoded


# Apply f to every split in batches of 1000 using 4 worker processes, and
# drop the 'sentence' and 'idx' columns from the result.
dataset = dataset.map(function=f,
                      batched=True,
                      batch_size=1000,
                      num_proc=4,
                      remove_columns=['sentence', 'idx'])

# Show one processed training example.
print(dataset['train'][0])

# Last expression: display the processed dataset summary.
dataset

 

Loading cached processed dataset at datas/glue/cola/train/cache-6def1c5f8a327c94.arrow


 

Loading cached processed dataset at datas/glue/cola/train/cache-31d27e0837d27207.arrow


 

Loading cached processed dataset at datas/glue/cola/train/cache-2f99156118cc2a5d.arrow


 

Loading cached processed dataset at datas/glue/cola/train/cache-89989a31ab1075b6.arrow


 

Loading cached processed dataset at datas/glue/cola/validation/cache-6b7c3e2e8bff6006.arrow


 

Loading cached processed dataset at datas/glue/cola/validation/cache-dd12b7baf47b5578.arrow


 

Loading cached processed dataset at datas/glue/cola/validation/cache-857d56c2dc1f0ef3.arrow


 

Loading cached processed dataset at datas/glue/cola/validation/cache-c2a54676cfb020f7.arrow


 

Loading cached processed dataset at datas/glue/cola/test/cache-fe1d9a8362ec0a58.arrow


 

Loading cached processed dataset at datas/glue/cola/test/cache-301de2aa72a796c4.arrow


 

Loading cached processed dataset at datas/glue/cola/test/cache-ba8c1110f769f8ff.arrow


 

Loading cached processed dataset at datas/glue/cola/test/cache-8e279bffbe110091.arrow


{'label': 1, 'input_ids': [101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1063
    })
})

In [3]:
import torch
from transformers.data.data_collator import DataCollatorWithPadding

# Training data loader: shuffled batches of 8, incomplete final batch dropped.
# DataCollatorWithPadding(tokenizer) collates the variable-length encodings
# into padded tensors (see the zero-padded rows in the output).
loader = torch.utils.data.DataLoader(
    dataset=dataset['train'],
    batch_size=8,
    collate_fn=DataCollatorWithPadding(tokenizer),
    shuffle=True,
    drop_last=True,
)

# Pull only the first batch; `data` is reused later as a sample input.
for i, data in enumerate(loader):
    break

# Print each field's name, shape and first three rows.
for k, v in data.items():
    print(k, v.shape, v[:3])

# Last expression: number of batches per pass over the loader.
len(loader)

input_ids torch.Size([8, 17]) tensor([[  101,  1996,  2062,  2008,  2017,  4521,  1010,  1996,  2625,  2008,
          2017,  2215,  1012,   102,     0,     0,     0],
        [  101,  2029,  4905,  2097,  2507,  1996,  5494,  6685,  8440,  1005,
          1056,  2042,  2787,  2011,  2068,  1012,   102],
        [  101,  1045,  2052,  3153,  2007, 10334,  1012,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0]])
attention_mask torch.Size([8, 17]) tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
labels torch.Size([8]) tensor([1, 1, 1])


1068

In [4]:
from transformers import AutoModelForSequenceClassification, DistilBertModel

# Load the model (ready-made alternative to the hand-written Model class below)
#model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)


# Downstream task model
class Model(torch.nn.Module):
    """DistilBERT backbone followed by a small two-class classification head.

    Attributes:
        pretrained: ``DistilBertModel`` loaded from 'distilbert-base-uncased'.
        fc: ``Linear(768, 768) -> ReLU -> Dropout(0.2) -> Linear(768, 2)``;
            both linear layers start from the ``pre_classifier`` /
            ``classifier`` weights of the matching sequence-classification
            model.
        criterion: cross-entropy loss used when labels are supplied.
    """

    def __init__(self):
        super().__init__()
        self.pretrained = DistilBertModel.from_pretrained(
            'distilbert-base-uncased')

        # Build the head layer by layer, in the same order as they appear
        # inside the Sequential container.
        dense = torch.nn.Linear(768, 768)
        activation = torch.nn.ReLU()
        dropout = torch.nn.Dropout(p=0.2)
        projection = torch.nn.Linear(768, 2)
        self.fc = torch.nn.Sequential(dense, activation, dropout, projection)

        # Copy the head parameters of the reference classification model
        # into the two linear layers.
        reference = AutoModelForSequenceClassification.from_pretrained(
            'distilbert-base-uncased', num_labels=2)
        dense.load_state_dict(reference.pre_classifier.state_dict())
        projection.load_state_dict(reference.classifier.state_dict())

        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Classify a batch.

        Args:
            input_ids: token ids passed to the backbone.
            attention_mask: attention mask passed to the backbone.
            labels: optional targets; when given, the cross-entropy loss
                against the logits is computed.

        Returns:
            dict with 'loss' (``None`` when ``labels`` is ``None``) and
            'logits' (the head output).
        """
        encoded = self.pretrained(input_ids=input_ids,
                                  attention_mask=attention_mask)
        # Feed the hidden state at the first sequence position to the head.
        first_position = encoded.last_hidden_state[:, 0]
        logits = self.fc(first_position)

        loss = self.criterion(logits, labels) if labels is not None else None

        return {'loss': loss, 'logits': logits}


# Instantiate the downstream model defined above.
model = Model()

# Count the parameters (printed in units of 10,000).
print(sum(i.numel() for i in model.parameters()) / 10000)

# Smoke test: run the sample batch `data` taken from the training loader.
out = model(**data)

# Last expression: display the loss and the shape of the logits.
out['loss'], out['logits'].shape

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_tr

6695.501


(tensor(0.6933, grad_fn=<NllLossBackward0>), torch.Size([8, 2]))

In [5]:
from datasets import load_metric

# Load the evaluation metric for GLUE/CoLA.
metric = load_metric(path='glue', config_name='cola')

# Trial computation on four hand-written predictions/references.
metric.compute(predictions=[0, 1, 1, 0], references=[0, 1, 1, 1])

Using the latest cached version of the module from /root/.cache/huggingface/modules/datasets_modules/metrics/glue/91f3cfc5498873918ecf119dbf806fb10815786c84f41b85a5d3c47c1519b343 (last modified on Mon Jul 11 13:08:31 2022) since it couldn't be found locally at glue, or remotely on the Hugging Face Hub.


{'matthews_correlation': 0.5773502691896258}

In [6]:
#测试
def test():
    model.eval()

    #数据加载器
    loader_test = torch.utils.data.DataLoader(
        dataset=dataset['validation'],
        batch_size=16,
        collate_fn=DataCollatorWithPadding(tokenizer),
        shuffle=True,
        drop_last=True,
    )

    outs = []
    labels = []
    for i, data in enumerate(loader_test):
        #计算
        with torch.no_grad():
            out = model(**data)

        outs.append(out['logits'].argmax(dim=1))
        labels.append(data['labels'])

        if i % 10 == 0:
            print(i)

        if i == 50:
            break

    outs = torch.cat(outs)
    labels = torch.cat(labels)

    accuracy = (outs == labels).sum().item() / len(labels)
    metric_out = metric.compute(predictions=outs, references=labels)

    print(accuracy, metric_out)


# Evaluate the model as built above, before train() has been run.
test()

0
10
20
30
40
50
0.6446078431372549 {'matthews_correlation': 0.013769908528691366}


In [7]:
from transformers import AdamW
from transformers.optimization import get_scheduler


# Training
def train():
    """Fine-tune the global ``model`` for one pass over ``loader`` and save it.

    Uses the notebook globals ``model``, ``loader`` and ``metric``. Each step
    runs forward/backward, clips the gradient norm to 1.0, then steps the
    AdamW optimizer (lr=2e-5) and a linear scheduler without warm-up that
    spans ``len(loader)`` steps.

    Every 50 steps it prints: step index, loss, batch accuracy, the metric
    computed on that single batch, and the current learning rate.

    When the pass is finished, the whole model object is written with
    ``torch.save`` to 'models/5.分类.model'; the target directory is created
    if it does not exist yet.
    """
    import os  # local import: only needed to prepare the output directory

    optimizer = AdamW(model.parameters(), lr=2e-5)
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,
                              num_training_steps=len(loader),
                              optimizer=optimizer)

    model.train()
    for i, data in enumerate(loader):
        out = model(**data)
        loss = out['loss']

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # The optimizer was built from model.parameters(), so clearing its
        # gradients clears every gradient of the model.
        optimizer.zero_grad()

        if i % 50 == 0:
            preds = out['logits'].argmax(dim=1)
            targets = data['labels']

            # Divide by the actual batch size rather than a hard-coded 8.
            accuracy = (targets == preds).sum().item() / len(targets)
            metric_out = metric.compute(predictions=preds,
                                        references=targets)

            # Read the current learning rate without copying the full
            # optimizer state.
            lr = optimizer.param_groups[0]['lr']

            print(i, loss.item(), accuracy, metric_out, lr)

    save_path = 'models/5.分类.model'
    # torch.save does not create missing directories.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    torch.save(model, save_path)


# Run the fine-tuning pass; progress is printed every 50 steps.
train()



0 0.6523979902267456 0.75 {'matthews_correlation': 0.3333333333333333} 1.9981273408239703e-05


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


50 0.6507331728935242 0.625 {'matthews_correlation': 0.0} 1.9044943820224723e-05


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


100 0.7072427272796631 0.5 {'matthews_correlation': 0.0} 1.810861423220974e-05


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


150 0.4458800256252289 0.75 {'matthews_correlation': 0.0} 1.7172284644194758e-05
200 0.3876158595085144 0.75 {'matthews_correlation': -0.14285714285714285} 1.6235955056179777e-05
250 0.49931180477142334 0.625 {'matthews_correlation': 0.14907119849998599} 1.5299625468164797e-05
300 0.3381071388721466 0.875 {'matthews_correlation': 0.7453559924999299} 1.4363295880149814e-05
350 0.5246998071670532 0.75 {'matthews_correlation': -0.14285714285714285} 1.3426966292134834e-05
400 0.3265834152698517 0.75 {'matthews_correlation': 0.4879500364742666} 1.2490636704119851e-05
450 0.6134941577911377 0.625 {'matthews_correlation': 0.14907119849998599} 1.155430711610487e-05
500 0.6472000479698181 0.75 {'matthews_correlation': 0.5} 1.0617977528089888e-05
550 0.5875465273857117 0.5 {'matthews_correlation': 0.0} 9.681647940074908e-06
600 0.5096354484558105 0.875 {'matthews_correlation': 0.7745966692414834} 8.745318352059925e-06
650 0.3630077540874481 0.75 {'matthews_correlation': -0.14285714285714285} 7.8

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


700 0.43435221910476685 0.75 {'matthews_correlation': 0.0} 6.872659176029963e-06
750 0.295941561460495 0.875 {'matthews_correlation': 0.7745966692414834} 5.936329588014982e-06
800 0.4042346477508545 0.75 {'matthews_correlation': 0.4879500364742666} 5e-06
850 0.46937647461891174 0.75 {'matthews_correlation': -0.14285714285714285} 4.063670411985019e-06
900 0.8482968807220459 0.5 {'matthews_correlation': -0.29277002188455997} 3.1273408239700374e-06
950 0.4539779722690582 0.875 {'matthews_correlation': 0.7453559924999299} 2.1910112359550564e-06


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


1000 0.1318509876728058 1.0 {'matthews_correlation': 0.0} 1.2546816479400751e-06
1050 0.3866274952888489 0.75 {'matthews_correlation': -0.14285714285714285} 3.183520599250937e-07


In [8]:
# Reload the model object that train() wrote with torch.save, then evaluate it.
# The file holds the whole pickled module, so the Model class must already be
# defined in this session; only load files you produced yourself.
# NOTE(review): newer PyTorch releases reportedly default torch.load to
# weights_only=True, which would presumably require weights_only=False here —
# confirm against the installed version.
model = torch.load('models/5.分类.model')
test()

0
10
20
30
40
50
0.7769607843137255 {'matthews_correlation': 0.4382819245206996}
