In [1]:
import numpy as np
import pandas as pd
import csv
import torch.nn as nn
import torch
import transformers
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer,BertConfig,AdamW,BertModel
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
#读取数据
Data_path = "./JDcontents_vivo_Cleaned_data.csv"
Totle_data = pd.read_csv(Data_path)
 
train_dataset,temp_data =  train_test_split(Totle_data,test_size=0.2)
validate_dataset, test_dataset = train_test_split(temp_data,test_size=0.5)
print(len(train_dataset))
print(len(validate_dataset))
print(len(test_dataset))
 
#设置保存路径
train_data_path="./Train.csv"
dev_data_path = "./Dev.csv" 
test_data_path="./Test.csv"
 
#index参数设置为False表示不保存行索引,header设置为False表示不保存列索引
train_dataset.to_csv(train_data_path,index=False,header=True)
validate_dataset.to_csv(dev_data_path ,index=False,header=True)
test_dataset.to_csv(test_data_path,index=False,header=True)

5942
743
743


In [3]:
data = pd.read_csv(train_data_path)
data

Unnamed: 0,label,comment_processed
0,4,一直 蓝厂 手机 妈妈 四年 该换 正好 刚出 配置 更好 更新 果断 入手 买 礼盒装 几...
1,4,款 手机 颜色 太漂亮 玻璃 后盖绝 绝子 颜控 表示 喜欢 拍照 好看 电池 耐用
2,0,刚刚 买来 一个月 不到 电话 接听 不到 一天 几十个 电话 进来 手机 重要 功能 问题...
3,4,运行 速度 真的 不要 太爽 使用 起来 不会 卡顿 之前 明显 不够 外形 外观 真的 很...
4,0,辣鸡 客服 刚买 手机 降价 补 差价
...,...,...
5937,0,最差 手机 说 手机 原来 手机 网络 差 要死
5938,4,拍照 效果 刚回来 试 一下 拍照 自带 美妆 挺不错 外形 外观 外观 好看 感觉 挺值 价
5939,4,外形 外观 外观 很漂亮 喜欢 屏幕 音效 生音 很大 拍照 效果 杠杠 运行 速度 很快 ...
5940,2,屏幕 音效 整体 效果 挺 轻薄 颜色 挺 特别 性能 没用 暂时 知道 买 最高 配置 应...


In [4]:
class BertClassificationModel(nn.Module):
    def __init__(self):
        super(BertClassificationModel, self).__init__()   
        #加载预训练模型
        pretrained_weights="Bert/bert-base-chinese/"
        self.bert = BertModel.from_pretrained(pretrained_weights)
        for param in self.bert.parameters():
            param.requires_grad = True
        #定义线性函数      
        self.dense = nn.Linear(768, 2)  #bert默认的隐藏单元数是768， 输出单元是2，表示二分类
        
    def forward(self, input_ids,token_type_ids,attention_mask):
        #得到bert_output
        bert_output = self.bert(input_ids=input_ids,token_type_ids=token_type_ids, attention_mask=attention_mask)
        #获得预训练模型的输出
        bert_cls_hidden_state = bert_output[1]
        #将768维的向量输入到线性层映射为二维向量
        linear_output = self.dense(bert_cls_hidden_state)
        return  linear_output

In [5]:
def encoder(max_len,vocab_path,text_list):
    #将text_list embedding成bert模型可用的输入形式
    #加载分词模型
    bert = BertModel.from_pretrained("Bert/bert-base-chinese/")
    tokenizer = BertTokenizer.from_pretrained(vocab_path)
    
    tokenizer = tokenizer(
        text_list,
        padding = True,
        truncation = True,
        max_length = max_len,
        return_tensors='pt'  # 返回的类型为pytorch tensor
        )
    input_ids = tokenizer['input_ids']
    token_type_ids = tokenizer['token_type_ids']
    attention_mask = tokenizer['attention_mask']
    return input_ids,token_type_ids,attention_mask

In [6]:
def load_data(path):
    csvFileObj = open(path,encoding='UTF-8')
    readerObj = csv.reader(csvFileObj)
    text_list = []
    labels = []
    for row in readerObj:
        #跳过表头
        if readerObj.line_num == 1:
            continue
        #label在什么位置就改成对应的index
        label = int(row[0])
        text = row[1]
        text_list.append(text)
        labels.append(label)
    #调用encoder函数，获得预训练模型的三种输入形式
    input_ids,token_type_ids,attention_mask = encoder(max_len=150,vocab_path="Bert/vocab.txt",text_list=text_list)
    labels = torch.tensor(labels)
    #将encoder的返回值以及label封装为Tensor的形式
    data = TensorDataset(input_ids,token_type_ids,attention_mask,labels)
    return data

In [7]:
#设定batch_size
batch_size = 16
#引入数据路径
train_data_path="Train.csv"
dev_data_path="Dev.csv"
test_data_path="Test.csv"
#调用load_data函数，将数据加载为Tensor形式
train_data = load_data(train_data_path)
dev_data = load_data(dev_data_path)
test_data = load_data(test_data_path)
#将训练数据和测试数据进行DataLoader实例化
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dataset=dev_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)  



In [8]:
def dev(model,dev_loader):
    #将模型放到服务器上
    model.to(device)
#设定模式为验证模式
    model.eval()
#设定不会有梯度的改变仅作验证
    with torch.no_grad():
        correct = 0
        total = 0
        for step, (input_ids,token_type_ids,attention_mask,labels) in tqdm(enumerate(dev_loader),desc='Dev Itreation:'):                
            input_ids,token_type_ids,attention_mask,labels=input_ids.to(device),token_type_ids.to(device),attention_mask.to(device),labels.to(device)
            out_put = model(input_ids,token_type_ids,attention_mask)
            _, predict = torch.max(out_put.data, 1)
            correct += (predict==labels).sum().item()
            total += labels.size(0)
        res = correct / total
        return res

In [9]:
def train(model,train_loader,dev_loader) :
    #将model放到服务器上
    model.to(device)
    #设定模型的模式为训练模式
    model.train()
    #定义模型的损失函数
    criterion = nn.CrossEntropyLoss()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    #设置模型参数的权重衰减
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    #学习率的设置
    optimizer_params = {'lr': 1e-5, 'eps': 1e-6, 'correct_bias': False}
    #使用AdamW 主流优化器
    optimizer = AdamW(optimizer_grouped_parameters, **optimizer_params)
    #学习率调整器，检测准确率的状态，然后衰减学习率
    scheduler = ReduceLROnPlateau(optimizer,mode='max',factor=0.5,min_lr=1e-7, patience=5,verbose= True, threshold=0.0001, eps=1e-08)
    t_total = len(train_loader)
    #设定训练轮次
    total_epochs = 2
    bestAcc = 0
    correct = 0
    total = 0
    print('Training and verification begin!')
    for epoch in range(total_epochs): 
        for step, (input_ids,token_type_ids,attention_mask,labels) in enumerate(train_loader):
#从实例化的DataLoader中取出数据，并通过 .to(device)将数据部署到服务器上    input_ids,token_type_ids,attention_mask,labels=input_ids.to(device),token_type_ids.to(device),attention_mask.to(device),labels.to(device)
            #梯度清零
            optimizer.zero_grad()
            #将数据输入到模型中获得输出
            out_put =  model(input_ids,token_type_ids,attention_mask)
            #计算损失
            loss = criterion(out_put, labels)
            _, predict = torch.max(out_put.data, 1)
            correct += (predict == labels).sum().item()
            total += labels.size(0)
            loss.backward()
            optimizer.step()
             #每两步进行一次打印
            if (step + 1) % 2 == 0:
                train_acc = correct / total
                print("Train Epoch[{}/{}],step[{}/{}],tra_acc{:.6f} %,loss:{:.6f}".format(epoch + 1, total_epochs, step + 1, len(train_loader),train_acc*100,loss.item()))
            #每五十次进行一次验证
            if (step + 1) % 50 == 0:
                train_acc = correct / total
                #调用验证函数dev对模型进行验证，并将有效果提升的模型进行保存
                acc = dev(model, dev_loader)
                if bestAcc < acc:
                    bestAcc = acc
                    #模型保存路径
                    path = 'savedmodel/span_bert_hide_model1.pkl'
                    torch.save(model, path)
                print("DEV Epoch[{}/{}],step[{}/{}],tra_acc{:.6f} %,bestAcc{:.6f}%,dev_acc{:.6f} %,loss:{:.6f}".format(epoch + 1, total_epochs, step + 1, len(train_loader),train_acc*100,bestAcc*100,acc*100,loss.item()))
        scheduler.step(bestAcc)

In [10]:
# device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#实例化模型
model = BertClassificationModel()
#调用训练函数进行训练与验证
train(model,train_loader,dev_loader)



Training and verification begin!


IndexError: Target 4 is out of bounds.

In [None]:
def predict(model,test_loader):
    model.to(device)
    model.eval()
    predicts = []
    predict_probs = []
    with torch.no_grad():
        correct = 0
        total = 0
        for step, (input_ids,token_type_ids,attention_mask,labels) in enumerate(test_loader): 
            input_ids,token_type_ids,attention_mask,labels=input_ids.to(device),token_type_ids.to(device),attention_mask.to(device),labels.to(device)
            out_put = model(input_ids,token_type_ids,attention_mask)
           
            _, predict = torch.max(out_put.data, 1)
 
            pre_numpy = predict.cpu().numpy().tolist()
            predicts.extend(pre_numpy)
            probs = F.softmax(out_put).detach().cpu().numpy().tolist()
            predict_probs.extend(probs)
 
            correct += (predict==labels).sum().item()
            total += labels.size(0)
        res = correct / total
        print('predict_Accuracy : {} %'.format(100 * res))
        #返回预测结果和预测的概率
        return predicts,predict_probs

In [None]:
#引进训练好的模型进行测试
path = 'savedmodel/span_bert_hide_model.pkl'
Trained_model = torch.load(path)
#predicts是预测的（0   1），predict_probs是概率值
predicts,predict_probs = predict(Trained_model,dev_loader)

In [None]:
P = sklearn.metrics.precision_score(y_true, y_pred, average=’binary’,sample_weight=None)
R = sklearn.metrics.recall_score(y_true, y_pred, average=’binary’,sample_weight=None)
F1 = sklearn.metrics.f1_score(y_true, y_pred,average=’binary’,sample_weight=None)