# [Bert-base-Chinese 微调](https://blog.csdn.net/qq_43668800/article/details/131921617)

In [1]:
import torch
from torch.optim import AdamW
from datasets import load_dataset
from transformers import BertModel, BertTokenizer

## 模型

In [2]:
# 优先使用 GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('device=', device)

device= cuda


In [3]:
# 加载预训练模型
pretrained = BertModel.from_pretrained('bert-base-chinese')
# 需要移动到cuda上
pretrained.to(device)

# 不训练,不需要计算梯度
for param in pretrained.parameters():
    param.requires_grad_(False)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# 定义下游任务模型
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            out = pretrained(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)

        out = self.fc(out.last_hidden_state[:, 0])
        out = out.softmax(dim=1)
        return out


model = Model()
# 同样要移动到cuda
model.to(device)

Model(
  (fc): Linear(in_features=768, out_features=2, bias=True)
)

## 数据集

In [5]:
import random

# 定义数据集
class Dataset(torch.utils.data.Dataset):
    def __init__(self, split):
        dataset = load_dataset(path='lansinuote/ChnSentiCorp', split=split)

        def f(data):
            return len(data['text']) > 40
        self.dataset = dataset.filter(f)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        text = self.dataset[i]['text']
        # 切分一句话为前半句和后半句
        sentence1 = text[:20]
        sentence2 = text[20:40]
        label = 0
        # 有一半的概率把后半句替换为一句无关的话
        if random.randint(0, 1) == 0:
            j = random.randint(0, len(self.dataset) - 1)
            sentence2 = self.dataset[j]['text'][20:40]
            label = 1
        return sentence1, sentence2, label


dataset = Dataset('train')

Using custom data configuration lansinuote--ChnSentiCorp-4d058ef86e3db8d5
Reusing dataset parquet (/root/.cache/huggingface/datasets/lansinuote___parquet/lansinuote--ChnSentiCorp-4d058ef86e3db8d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/lansinuote___parquet/lansinuote--ChnSentiCorp-4d058ef86e3db8d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-3f28c458f01cdeac.arrow


In [6]:
# 加载字典和分词工具
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

In [7]:
def collate_fn(data):
    sents = [i[:2] for i in data]
    labels = [i[2] for i in data]

    # 编码
    data = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sents,
                                   truncation=True,
                                   padding='max_length',
                                   max_length=45,
                                   return_tensors='pt',
                                   return_length=True,
                                   add_special_tokens=True)
    # input_ids:编码之后的数字
    # attention_mask:是补零的位置是0,其他位置是1
    # token_type_ids:第一个句子和特殊符号的位置是0,第二个句子的位置是1
    input_ids = data['input_ids'].to(device)
    attention_mask = data['attention_mask'].to(device)
    token_type_ids = data['token_type_ids'].to(device)
    labels = torch.LongTensor(labels).to(device)
    # print(data['length'], data['length'].max())
    return input_ids, attention_mask, token_type_ids, labels


# 数据加载器
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=8,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)
for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    break

print(len(loader))
print(tokenizer.decode(input_ids[0]))
print(input_ids.shape, attention_mask.shape, token_type_ids.shape, labels)

1000
[CLS] 整 体 做 工 尚 可 ， 屏 幕 大 有 面 子 ， 硬 盘 大 够 肚 量 [SEP], 外 面 飞 机 的 声 音 很 大, 隔 音 效 果 很 差, 房 间 [SEP] [PAD] [PAD]
torch.Size([8, 45]) torch.Size([8, 45]) torch.Size([8, 45]) tensor([1, 0, 0, 0, 0, 1, 0, 1], device='cuda:0')


## 训练

In [8]:
import time, datetime

start_time = datetime.datetime.now()
print('Start time:', start_time)

# 训练
optimizer = AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()

model.train()
for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
    out = model(input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)
    loss = criterion(out, labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    if i % 5 == 0:
        out = out.argmax(dim=1)
        accuracy = (out == labels).sum().item() / len(labels)
        print(i, loss.item(), accuracy)
    if i == 300:
        break


end_time = datetime.datetime.now()
print('End time:', end_time)

consume_time = end_time - start_time
print('Consume time of second:', consume_time.seconds)

Start time: 2024-03-17 12:48:00.722390
0 0.7457993030548096 0.25
5 0.7447078227996826 0.5
10 0.6512042284011841 0.5
15 0.5302249193191528 0.75
20 0.4732957184314728 1.0
25 0.5366321206092834 0.75
30 0.5576537847518921 0.75
35 0.3859046697616577 1.0
40 0.366267591714859 1.0
45 0.3605179488658905 1.0
50 0.4770599901676178 0.875
55 0.5751177072525024 0.75
60 0.4722777307033539 1.0
65 0.4826355278491974 0.75
70 0.4004383981227875 1.0
75 0.5198293924331665 0.75
80 0.41101157665252686 0.875
85 0.559878945350647 0.625
90 0.49069011211395264 0.875
95 0.5671958923339844 0.75
100 0.39849206805229187 0.875
105 0.45369070768356323 0.875
110 0.4357307553291321 0.875
115 0.4981495141983032 0.875
120 0.3690829873085022 1.0
125 0.5909870862960815 0.625
130 0.4175727963447571 0.875
135 0.34160590171813965 1.0
140 0.3885098993778229 1.0
145 0.5496517419815063 0.75
150 0.5299004316329956 0.75
155 0.3453321158885956 1.0
160 0.5472413897514343 0.75
165 0.4392426311969757 0.875
170 0.5538970828056335 0.75
1

## 测试

In [9]:
# 测试
def test():
    model.eval()
    correct = 0
    total = 0

    loader_test = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_test):
        if i%5 == 0 and total != 0:
            print(i, correct / total)
        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)
        pred = out.argmax(dim=1)
        correct += (pred == labels).sum().item()
        total += len(labels)
    print(correct / total)


test()

Using custom data configuration lansinuote--ChnSentiCorp-4d058ef86e3db8d5
Reusing dataset parquet (/root/.cache/huggingface/datasets/lansinuote___parquet/lansinuote--ChnSentiCorp-4d058ef86e3db8d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/lansinuote___parquet/lansinuote--ChnSentiCorp-4d058ef86e3db8d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-b2e870d062ef5600.arrow


5 0.89375
10 0.8875
15 0.8916666666666667
20 0.871875
25 0.8725
0.8697916666666666


## 保存模型

In [10]:
import torch

model_path = '/root/workspace/model/model_bert-base-chinese/sentence_correlation.pt'
# torch.save(model.state_dict(), model_path)
torch.save(model.state_dict(), model_path)
print('模型已保存至:', model_path)

模型已保存至: /root/workspace/model/model_bert-base-chinese/sentence_correlation.pt


## 加载模型

In [11]:
my_model = Model()
my_model.load_state_dict(torch.load(model_path))

<All keys matched successfully>

In [12]:
def predict(the_model, two_sent_arr):
    the_model.eval()
    # 优先使用 GPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('device=', device)
    the_model.to(device)
    
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    torch.no_grad()
    x_encode = tokenizer.batch_encode_plus(
        # 传入的所有句子，有成对句子
        batch_text_or_text_pairs=two_sent_arr,
        # 长度大于设置是否截断
        truncation=True,
        # 一律补齐，如果长度不够
        padding='max_length',
        add_special_tokens=True,
        max_length=45,
        # 可取值tf,pt,np,（tensorflow,pytorch,numpy）默认返回list
        return_tensors="pt",
        # 返回token_type_ids,第一句与特殊符号是0，第二句是1
        return_token_type_ids=True,
        # 返回attention_mask，填充是0，其他是1
        return_attention_mask=True,
        # 返回special_tokens_mask特殊符号标识，特殊是1，其他是0
        return_special_tokens_mask=True,
        # 返回长度,这里的长度是真实长度，而非设置的长度30了
        return_length=True
    )

    y = the_model(input_ids=x_encode['input_ids'].to(device),
                  attention_mask=x_encode['attention_mask'].to(device),
                  token_type_ids=x_encode['token_type_ids'].to(device))
    # print(y)
    res = y.argmax(dim=1)
    return res.cpu().numpy().tolist()


two_sents = [['这家店不错，总体环境很好，服务也很热情', '好评，希望生意越来越好'], 
             ['这家店环境太差，服务冷淡', '再也不还来'], 
             ['这家店不错，总体环境很好', '可以再考虑再次光临'],
             ['这家店不错，总体环境很好', '我要投诉']]
predict(my_model, two_sents)

device= cuda


[0, 1, 0, 1]