# 情感分类

In [13]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

from datasets import load_dataset

## 1. 定义数据集

In [2]:
class MyDataset(Dataset):
    def __init__(self, split: str):
        super(MyDataset, self).__init__()

        self.dataset = load_dataset('seamew/ChnSentiCorp', split=split)
        print(self.dataset)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, item):
        text = self.dataset['text'][item]
        label = self.dataset['label'][item]

        return text, label


dataset = MyDataset('test')
print(dataset.__len__())
# print(dataset.__getitem__(0))

Using custom data configuration default
Reusing dataset chn_senti_corp (C:\Users\Jejune\.cache\huggingface\datasets\seamew___chn_senti_corp\default\0.0.0\1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85)


Dataset({
    features: ['text', 'label'],
    num_rows: 1200
})
1200


## 2. 定义tokenizer

In [3]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

tokenizer

PreTrainedTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

# 3. 定义批处理函数

In [4]:
def collate_fn(data):
    texts = [i[0] for i in data]
    labels = [i[1] for i in data]

    # 编码
    data = tokenizer.batch_encode_plus(
        batch_text_or_text_pairs=texts,
        truncation=True,
        padding='max_length',
        max_length=500,
        return_tensors='pt',
        return_length=True
    )
    # 编码后的数字
    input_ids = data['input_ids']
    # pad位置是0，其他位置是1
    attention_mask = data['attention_mask']
    # token_type_ids: 第一句和特殊符号是0，其余是1
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)
    return input_ids, attention_mask, token_type_ids, labels


# 保存数据
input_ids, attention_mask, token_type_ids, labels = collate_fn(dataset)
torch.save(input_ids, 'data/input_ids.pt')
torch.save(attention_mask, 'data/attention_mask.pt')
torch.save(token_type_ids, 'data/token_type_ids.pt')
torch.save(labels, 'data/labels.pt')

#数据加载器
loader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=16,
    collate_fn=collate_fn,
    shuffle=True,
    drop_last=True
)


In [6]:
for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    break

print(len(loader))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels

75


(torch.Size([16, 500]),
 torch.Size([16, 500]),
 torch.Size([16, 500]),
 tensor([1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1]))

## 4.定义Bert预训练模型

In [7]:
from transformers import BertModel

pretrained = BertModel.from_pretrained('bert-base-chinese')

#不训练,不需要计算梯度
for param in pretrained.parameters():
    param.requires_grad_(False)

# 模型试算
out = pretrained(
    input_ids=input_ids,
    attention_mask=attention_mask,
    token_type_ids=token_type_ids
)
out.last_hidden_state.shape

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([16, 500, 768])

## 5.下游任务

In [14]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc = nn.Linear(in_features=768, out_features=2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            out = pretrained(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids
            )
        out = self.fc(out.last_hidden_state[:, 0])
        out = F.softmax(out,dim=1)
        return out

model = Net()

model(input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids).shape

torch.Size([16, 2])

In [18]:
from NNUtils.torchwu import torchwu, cuda

# model_wu = torchwu.Model(model)
device = cuda.try_gpu()
# optim = torch.optim.AdamW(model_wu.parameters(), lr=5e-4)
# loss = torch.nn.CrossEntropyLoss()
# model_wu.compile(loss_func=loss,optimizer=optim,device=device)
# model_wu.fit(5, df)



In [19]:
from transformers import AdamW

model.to(device)
#训练
optimizer = AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()

model.train()
for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    token_type_ids = token_type_ids.to(device)
    labels = labels.to(device)
    out = model(input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

    loss = criterion(out, labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if i % 5 == 0:
        out = out.argmax(dim=1)
        accuracy = (out == labels).sum().item() / len(labels)

        print(i, loss.item(), accuracy)

    if i == 300:
        break



RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper__index_select)