# 数据封装

In [5]:
import pandas as pd
import torch
# Load the raw training CSV and discard every row that has a missing value.
df = pd.read_csv("dataset/train/weibo_train_data.csv").dropna()

# Run on the first CUDA GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# 划分数据集
from sklearn.model_selection import train_test_split

# 90% of the rows go to training and the remaining 10% are held out for
# evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df["review"].values,
    df["label"].values,
    train_size=0.9,
    random_state=100,
)
from torch.utils.data import Dataset, DataLoader
import torch

class MyDataset(Dataset):
    """Map-style dataset pairing each sample in ``X`` with its label in ``y``.

    Both containers are stored as given and indexed directly, so any objects
    supporting ``[]`` work. The dataset length is taken from ``X``.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of samples (the length of ``X``)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at position ``idx``."""
        sample = self.X[idx]
        label = self.y[idx]
        return sample, label


# Wrap both splits so DataLoader can index and batch them.
train_dataset = MyDataset(X_train, y_train)
test_dataset = MyDataset(X_test, y_test)

batch_size = 32
# Reshuffle the training samples at the start of every epoch. With
# shuffle=False each epoch replayed exactly the same sequence of batches.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation order has no effect on the accuracy, so it stays fixed.
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# 模型

In [None]:
from torch import nn
from transformers import BertModel, BertTokenizer
from transformers import AdamW
from tqdm import tqdm

num_class = 2  # number of target classes, i.e. the width of the classifier output


class BertClassificationModel(nn.Module):
    """Sentence classifier: frozen ``bert-base-chinese`` encoder plus a linear head.

    The tokenizer is stored on the module, so ``forward`` accepts raw strings.
    Every BERT parameter has ``requires_grad`` switched off; only ``self.fc``
    is trainable.

    NOTE(review): the frozen encoder still follows ``model.train()`` /
    ``model.eval()``, so its dropout layers are active during training --
    confirm this is intended.

    Args:
        hidden_size: Width of BERT's final hidden state and therefore the
            input size of the linear head (768 for the base model).
    """

    def __init__(self, hidden_size=768):
        super().__init__()
        model_name = 'bert-base-chinese'
        # Load the tokenizer and the pretrained encoder that belong together.
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
        self.bert = BertModel.from_pretrained(pretrained_model_name_or_path=model_name)

        # Freeze the encoder so that only the linear head receives gradients.
        for p in self.bert.parameters():
            p.requires_grad = False
        self.fc = nn.Linear(hidden_size, num_class)

    def forward(self, batch_sentences):
        """Tokenize a batch of raw sentences and return their class logits.

        Args:
            batch_sentences: The batch of text strings to classify, in any
                form the tokenizer accepts (e.g. a list of ``str``).

        Returns:
            Tensor of shape ``[batch_size, num_class]`` with unnormalised
            class scores, located on the same device as the model.
        """
        # Follow the device the module actually lives on rather than a global,
        # so the model works wherever it was moved with ``.to(...)``.
        target_device = self.fc.weight.device

        # Pad to the longest sentence in the batch and cut anything beyond
        # 512 tokens; ask the tokenizer for tensors directly.
        encoded = self.tokenizer(batch_sentences,
                                 truncation=True,
                                 padding=True,
                                 max_length=512,
                                 add_special_tokens=True,
                                 return_tensors='pt')
        input_ids = encoded['input_ids'].to(target_device)
        attention_mask = encoded['attention_mask'].to(target_device)

        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # bert_out[0] is the last hidden state: [batch_size, sequence_length, hidden_size].
        last_hidden_state = bert_out[0]
        # The vector at position 0 (the [CLS] token) represents the whole sentence.
        bert_cls_hidden_state = last_hidden_state[:, 0, :]
        return self.fc(bert_cls_hidden_state)

import os

# Resume from the saved checkpoint when it exists; otherwise start from the
# pretrained encoder with a freshly initialised classification head.
checkpoint_path = 'model/model_10.pth'
if os.path.exists(checkpoint_path):
    # The checkpoint holds a whole pickled module, so no instance has to be
    # built first. map_location remaps tensors that were saved on a GPU so the
    # file also loads on a CPU-only machine.
    # NOTE(review): PyTorch >= 2.6 defaults torch.load to weights_only=True,
    # which rejects whole-module pickles; pass weights_only=False on those
    # versions (only for checkpoints you trust -- loading executes pickle code).
    model = torch.load(checkpoint_path, map_location=device).to(device)
else:
    model = BertClassificationModel().to(device)

learning_rate = 1e-4
optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_func = nn.CrossEntropyLoss().to(device)

In [None]:
def train(epoch):
    """Train the module-level ``model`` and evaluate/save it after every epoch.

    Uses the module-level ``model``, ``train_loader``, ``test_loader``,
    ``loss_func`` and ``optimizer``. For each epoch it runs one pass over
    ``train_loader`` (printing loss and batch accuracy every 500 batches),
    measures accuracy on ``test_loader``, and pickles the whole model to
    ``model/model_<epoch number>.pth``.

    Args:
        epoch: Number of epochs to run.
    """
    import os

    # torch.save does not create missing directories; make sure the target
    # exists up front instead of failing after a full epoch of work.
    os.makedirs("model", exist_ok=True)

    for i in range(epoch):
        print("-------第 {} 轮训练开始-------".format(i+1))
        model.train()
        for idx, (data, labels) in enumerate(tqdm(train_loader)):
            out = model(data)  # [batch_size, num_class]

            # Compute the loss where the logits already are instead of copying
            # them to the CPU every step. CrossEntropyLoss expects int64 class
            # indices; the cast is a no-op when the labels already are int64.
            labels = labels.to(device=out.device, dtype=torch.long)
            loss = loss_func(out, labels)

            # Back-propagate, update the weights, then clear the gradients for
            # the next batch.
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            if idx % 500 == 0:  # batches 0, 500, 1000, ...
                predictions = out.argmax(dim=-1)
                acc = (predictions == labels).sum().item() / len(labels)
                # Metrics of this single batch only.
                print('训练集batch{}的，损失为{}，准确率为{}'.format(idx, loss.item(), acc))

        # Measure accuracy on the held-out set after every epoch.
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for data, labels in tqdm(test_loader):
                predictions = model(data).argmax(dim=1)  # [batch_size]
                correct += (predictions == labels.to(predictions.device)).sum().item()
                total += len(labels)

        # An empty test loader would otherwise raise ZeroDivisionError.
        accuracy = correct / total if total else float("nan")
        print('第{}轮次的测试集准确率为{}'.format(i+1, accuracy))

        torch.save(model, "model/model_{}.pth".format(i+1))
        print("模型已保存")



# Number of full passes over the training set.
epoch = 10
train(epoch=epoch)