In [1]:
import torch

from transformers import AutoTokenizer

#加载tokenizer
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')

tokenizer

BertTokenizerFast(name_or_path='google-bert/bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [2]:
from datasets import load_dataset

#加载数据集
dataset = load_dataset(path='lansinuote/ChnSentiCorp')

dataset, dataset['train'][0]

(DatasetDict({
     train: Dataset({
         features: ['text', 'label'],
         num_rows: 9600
     })
     validation: Dataset({
         features: ['text', 'label'],
         num_rows: 1200
     })
     test: Dataset({
         features: ['text', 'label'],
         num_rows: 1200
     })
 }),
 {'text': '选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',
  'label': 1})

In [3]:
#定义数据集遍历工具
def collate_fn(data):
    text = [i['text'] for i in data]
    label = [i['label'] for i in data]

    #文字编码
    data = tokenizer(text,
                     padding=True,
                     truncation=True,
                     max_length=500,
                     return_tensors='pt',
                     return_token_type_ids=False)

    #设置label
    data['label'] = torch.LongTensor(label)

    return data


loader = torch.utils.data.DataLoader(dataset['train'],
                                     batch_size=8,
                                     shuffle=True,
                                     drop_last=True,
                                     collate_fn=collate_fn)

data = next(iter(loader))

for k, v in data.items():
    print(k, v.shape)

len(loader)

input_ids torch.Size([8, 185])
attention_mask torch.Size([8, 185])
label torch.Size([8])


1200

In [4]:
#定义模型
class Model(torch.nn.Module):

    def __init__(self):
        super().__init__()

        #加载预训练模型
        from transformers import AutoModel
        self.pretrained = AutoModel.from_pretrained(
            'google-bert/bert-base-chinese')

        self.fc = torch.nn.Linear(in_features=768, out_features=2)

    def forward(self, input_ids, attention_mask, label=None):
        #使用预训练模型抽取数据特征
        with torch.no_grad():
            last_hidden_state = self.pretrained(
                input_ids=input_ids,
                attention_mask=attention_mask).last_hidden_state

        #只取第0个词的特征做分类,这和bert模型的训练方式有关,此处不展开
        last_hidden_state = last_hidden_state[:, 0]

        #对抽取的特征只取第一个字的结果做分类即可
        out = self.fc(last_hidden_state).softmax(dim=1)

        #计算loss
        loss = None
        if label is not None:
            loss = torch.nn.functional.cross_entropy(out, label)

        return loss, out


model = Model()

model(**data)

(tensor(0.6372, grad_fn=<NllLossBackward0>),
 tensor([[0.7080, 0.2920],
         [0.6366, 0.3634],
         [0.5499, 0.4501],
         [0.5178, 0.4822],
         [0.5974, 0.4026],
         [0.6423, 0.3577],
         [0.5603, 0.4397],
         [0.5908, 0.4092]], grad_fn=<SoftmaxBackward0>))

In [5]:
#执行训练
def train():
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    for i, data in enumerate(loader):
        loss, out = model(**data)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if i % 10 == 0:
            out = out.argmax(dim=1)
            acc = (out == data.label).sum().item() / len(data.label)
            print(i, len(loader), loss.item(), acc)

        if i == 300:
            break


train()

0 1200 0.6553741693496704 0.75
10 1200 0.6900417804718018 0.5
20 1200 0.6871982216835022 0.375
30 1200 0.7322638630867004 0.375
40 1200 0.6802827715873718 0.625
50 1200 0.6234102249145508 0.75
60 1200 0.665240466594696 0.75
70 1200 0.6254176497459412 0.875
80 1200 0.6244698762893677 0.875
90 1200 0.6492530107498169 0.875
100 1200 0.6473492383956909 0.5
110 1200 0.7369059324264526 0.5
120 1200 0.5942272543907166 0.75
130 1200 0.612605094909668 0.875
140 1200 0.5870425701141357 1.0
150 1200 0.5847880840301514 0.75
160 1200 0.5123168230056763 1.0
170 1200 0.6124182939529419 0.625
180 1200 0.5733353495597839 1.0
190 1200 0.5715835094451904 0.75
200 1200 0.5075149536132812 1.0
210 1200 0.5954742431640625 0.875
220 1200 0.7148230075836182 0.5
230 1200 0.4510529339313507 1.0
240 1200 0.6033474802970886 0.75
250 1200 0.5930602550506592 0.625
260 1200 0.511476993560791 1.0
270 1200 0.4931146800518036 1.0
280 1200 0.520031750202179 0.75
290 1200 0.550284206867218 0.875
300 1200 0.565074324607849

In [6]:
#执行测试
def test():
    loader_test = torch.utils.data.DataLoader(dataset['test'],
                                              batch_size=8,
                                              shuffle=True,
                                              drop_last=True,
                                              collate_fn=collate_fn)

    correct = 0
    total = 0
    for i, data in enumerate(loader_test):
        with torch.no_grad():
            _, out = model(**data)

        out = out.argmax(dim=1)
        correct += (out == data.label).sum().item()
        total += len(data.label)

        print(i, len(loader_test), correct / total)

        if i == 5:
            break

    return correct / total


test()

0 150 1.0
1 150 0.9375
2 150 0.875
3 150 0.90625
4 150 0.875
5 150 0.875


0.875