In [2]:
import torch
import time
import numpy as np
import pandas as pd
import torch.nn as nn
from transformers import BertModel
from transformers import BertTokenizer,AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from gensim.models import KeyedVectors
from keras.preprocessing.sequence import pad_sequences
import re
import jieba

In [4]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

In [5]:
encoded_input = tokenizer(["Rotary Transformer，简称RoFormer，是我们自研的语言模型之一","，主要是为Transformer结构设计了新的旋转式位置编码（Rotary Position Embedding，RoPE）","。RoPE具有良好的理论性质，且是目前唯一一种可以应用到线性Attention的绝对位置编码，目前来看实验结果也颇为不错。"],max_length=107)
print(encoded_input)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'input_ids': [[101, 100, 100, 8024, 5042, 4917, 100, 8024, 3221, 2769, 812, 5632, 4777, 4638, 6427, 6241, 3563, 1798, 722, 671, 102], [101, 8024, 712, 6206, 3221, 711, 100, 5310, 3354, 6392, 6369, 749, 3173, 4638, 3181, 6760, 2466, 855, 5390, 5356, 4772, 8020, 100, 100, 100, 8024, 100, 8021, 102], [101, 511, 100, 1072, 3300, 5679, 1962, 4638, 4415, 6389, 2595, 6574, 8024, 684, 3221, 4680, 1184, 1546, 671, 671, 4905, 1377, 809, 2418, 4500, 1168, 5296, 2595, 100, 4638, 5318, 2190, 855, 5390, 5356, 4772, 8024, 4680, 1184, 3341, 4692, 2141, 7741, 5310, 3362, 738, 7567, 711, 679, 7231, 511, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [6]:
train_texts_orig = []
neg=[]
pos=[]
with open('neg.txt','r',encoding='utf-8') as reader:
    for row in reader:
        neg.append(row)
data_neg=pd.DataFrame(np.array(neg),columns=['comment'])
data_neg['label']=0
print(data_neg)
with open('pos.txt','r',encoding='utf-8') as reader:
    for row in reader:
        pos.append(row)
data_pos=pd.DataFrame(np.array(pos),columns=['comment'])
data_pos['label']=1
print(data_pos)

                                                 comment  label
0      [衰][衰][衰]回复@ciciwithyou:没事啦，来得及的，搞个小抽奖，准备一两百套衣...      0
1      【#超级笑星# 精彩剧照】[哈哈]我们超级可爱的小球童在现场人气也很高哦~[嘻嘻]不仅有@主...      0
2      1、和移动的“梦网”服务的联系太紧密了，到处都是“移动梦网”的快捷键。2、不支持自动开机，不...      0
3      关注[哈哈]记者就是记者，眼神就是好@北京菠菜 太恐怖了 抱考拉摸袋鼠，又完成了2个心愿，好...      0
4      见过悲催的，没见过这么悲催的。大晚上等公车半个多小时才来；快到家了才想起来没带钥匙。只好乖乖...      0
...                                                  ...    ...
42959    以前刚有了孩子看过一本卡尔威特的教育，感觉要比这本书好多了，我翻了个大概，就扔到一边去了。\n      0
42960             谁知道燕麦粥用什么办法能煮成像酒店那样啊？最近想喝可是做出来很失败[晕]\n      0
42961  充话费得的...现在父母的想象力远远超过我们的父母啊?我小时候说是路边捡的，害我每次被妈妈骂...      0
42962           发货找货包装和快递蛋逼这事比大便干燥还难受憋屈！[泪]@代购JOaNLOLo\n      0
42963   现在风越来越大了…[抓狂] 泪！ 北京出租车是北京城市的耻辱！宰客，拒载，乱象丛生！！[怒]\n      0

[42964 rows x 2 columns]
                                                 comment  label
0      买书前在网上看完了开放的部分，心里念念不忘，每看到书名都能引起联想，放不下的一本书，一般网络...      1
1      penny是骰子，一场麻将下来，人人都有至少抓一把的机会最爱《盛夏光年》@Chloe的呼吸 ...      1
2             

In [7]:
data=pd.concat([data_pos[:100],data_neg[:100]], axis=0).reset_index(drop=True)
print(data)

                                               comment  label
0    买书前在网上看完了开放的部分，心里念念不忘，每看到书名都能引起联想，放不下的一本书，一般网络...      1
1    penny是骰子，一场麻将下来，人人都有至少抓一把的机会最爱《盛夏光年》@Chloe的呼吸 ...      1
2                                 幸福如此简单[爱你][爱你][爱你]\n      1
3    谢谢莫莫 我就怕你戴会小，刚刚好我就放心啦...美美的过年吧...蜜蜜小姐新年快乐思密达.....      1
4    回复@MMaiH:这个是花一晚上的钱，住两晚上！[哈哈] 回复@磨西春天酒吧:好的哈[哈哈]...      1
..                                                 ...    ...
195  我怀疑脑子不正常，说到后面电话都要起来了[抓狂] [抓狂][抓狂] 某人不知道在哪儿看的偏方...      0
196                        领导那边比基尼，洛阳现在Moncle羽绒服吖[晕]\n      0
197  突如其来的疼痛！Hold不住了我！[抓狂][泪][生病] 我在:http:俺44-56[晕]...      0
198  困死了，，早自习进行中……[泪][泪][泪]诶，为什么大一的孩子那么惨……[左哼哼][右哼哼...      0
199                                      又坚强，又心酸 [泪]\n      0

[200 rows x 2 columns]


In [8]:
X = data.comment.values  # comment
y = data.label.values  # label自己给的0 1 2
X_train, X_test, y_train, y_test =train_test_split(X, y, test_size=0.1)

In [9]:
def preprocessing_for_bert(data):
    # 空列表来储存信息
    input_ids = []
    attention_masks = []

    # 每个句子循环一次
    for sent in data:
        encoded_sent = tokenizer.encode_plus(
            text=sent,  # 预处理语句
            add_special_tokens=True,  # 加 [CLS] 和 [SEP]
            max_length=MAX_LEN,  # 截断或者填充的最大长度
            padding='max_length',  # 填充为最大长度，这里的padding在之间可以直接用pad_to_max但是版本更新之后弃用了，老版本什么都没有，可以尝试用extend方法
            return_attention_mask=True  # 返回 attention mask
        )

        # 把输出加到列表里面
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # 把list转换为tensor
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

In [10]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [11]:
encoded_comment = [tokenizer.encode(sent, add_special_tokens=True) for sent in data.comment.values]


In [12]:
MAX_LEN = max([len(sent) for sent in encoded_comment])
train_inputs, train_masks = preprocessing_for_bert(X_train)
test_inputs, test_masks = preprocessing_for_bert(X_test)

In [13]:
train_labels = torch.tensor(y_train)
test_labels = torch.tensor(y_test)

In [14]:
batch_size = 32

# 给训练集创建 DataLoader
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
# print(train_dataloader)

# 给验证集创建 DataLoader
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [23]:
class BertClassifier(nn.Module):
    def __init__(self, ):
        super(BertClassifier, self).__init__()
        # 输入维度(hidden size of Bert)默认768，分类器隐藏维度，输出维度(label)
        D_in, H, D_out = 768, 100, 2

        # 实体化Bert模型
        self.bert = BertModel.from_pretrained('bert-base-chinese')

        # 实体化一个单层前馈分类器，说白了就是最后要输出的时候搞个全连接层
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),  # 全连接
            nn.ReLU(),  # 激活函数
            nn.Linear(H, D_out)  # 全连接
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        # 为分类任务提取标记[CLS]的最后隐藏状态，因为要连接传到全连接层去
        print(outputs)
        last_hidden_state_cls = outputs[0][:, 0, :]
        # 全连接，计算，输出label
        logits = self.classifier(last_hidden_state_cls)
        print(logits)
        return logits


In [16]:
def initialize_model(epochs=2):
    """
    初始化我们的bert，优化器还有学习率，epochs就是训练次数
    """
    # 初始化我们的Bert分类器
    bert_classifier = BertClassifier()
    # 用GPU运算
    bert_classifier.to(device)
    # 创建优化器
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,  # 默认学习率
                      eps=1e-8  # 默认精度
                      )
    # 训练的总步数
    total_steps = len(train_dataloader) * epochs
    # 学习率预热
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,  # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler


In [17]:
loss_fn = nn.CrossEntropyLoss()  # 交叉熵


In [18]:
def train(model, train_dataloader, test_dataloader=None, epochs=2, evaluation=False):
    # 开始训练循环
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # 表头
        print(f"{'Epoch':^7} | {'每40个Batch':^9} | {'训练集 Loss':^12} | {'测试集 Loss':^10} | {'测试集准确率':^9} | {'时间':^9}")
        print("-" * 80)

        # 测量每个epoch经过的时间
        t0_epoch, t0_batch = time.time(), time.time()

        # 在每个epoch开始时重置跟踪变量
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # 把model放到训练模式
        model.train()

        # 分batch训练
        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            # 把batch加载到GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
            #print(b_labels.shape)
            # 归零导数
            model.zero_grad()
            # 真正的训练
            logits = model(b_input_ids, b_attn_mask)
            #print(logits.shape)
            # 计算loss并且累加

            loss = loss_fn(logits, b_labels)

            batch_loss += loss.item()
            total_loss += loss.item()
            # 反向传播
            loss.backward()
            # 归一化，防止梯度爆炸
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # 更新参数和学习率
            optimizer.step()
            scheduler.step()

            # Print每40个batch的loss和time
            if (step % 40 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # 计算40个batch的时间
                time_elapsed = time.time() - t0_batch

                # Print训练结果
                print(
                    f"{epoch_i + 1:^7} | {step:^10} | {batch_loss / batch_counts:^14.6f} | {'-':^12} | {'-':^13} | {time_elapsed:^9.2f}")

                # 重置batch参数
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # 计算平均loss 这个是训练集的loss
        avg_train_loss = total_loss / len(train_dataloader)

        print("-" * 80)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation:  # 这个evalution是我们自己给的，用来判断是否需要我们汇总评估
            # 每个epoch之后评估一下性能
            # 在我们的验证集/测试集上.
            test_loss, test_accuracy = evaluate(model, test_dataloader)
            # Print 整个训练集的耗时
            time_elapsed = time.time() - t0_epoch

            print(
                f"{epoch_i + 1:^7} | {'-':^10} | {avg_train_loss:^14.6f} | {test_loss:^12.6f} | {test_accuracy:^12.2f}% | {time_elapsed:^9.2f}")
            print("-" * 80)
        print("\n")


In [19]:
def evaluate(model, test_dataloader):
    """
    在每个epoch后验证集上评估model性能
    """
    # model放入评估模式
    model.eval()

    # 准确率和误差
    test_accuracy = []
    test_loss = []

    # 验证集上的每个batch
    for batch in test_dataloader:
        # 放到GPU上
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # 计算结果，不计算梯度
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)  # 放到model里面去跑，返回验证集的ouput就是一行三列的
            # label向量可能性，这个时候还没有归一化所以还不能说是可能性，反正归一化之后最大的就是了

        # 计算误差
        loss = loss_fn(logits, b_labels.long())
        test_loss.append(loss.item())

        # get预测结果，这里就是求每行最大的索引咯，然后用flatten打平成一维
        preds = torch.argmax(logits, dim=1).flatten()  # 返回一行中最大值的序号

        # 计算准确率，这个就是俩比较，返回相同的个数, .cpu().numpy()就是把tensor从显卡上取出来然后转化为numpy类型的举证好用方法
        # 最后mean因为直接bool形了，也就是如果预测和label一样那就返回1，正好是正确的个数，求平均就是准确率了
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        test_accuracy.append(accuracy)

    # 计算整体的平均正确率和loss
    val_loss = np.mean(test_loss)
    val_accuracy = np.mean(test_accuracy)

    return val_loss, val_accuracy


In [20]:
bert_classifier, optimizer, scheduler = initialize_model(epochs=2)
# print("Start training and validation:\n")
print("Start training and testing:\n")
train(bert_classifier, train_dataloader, test_dataloader, epochs=2, evaluation=True)  # 这个是有评估的


net = BertClassifier()
print("Total number of paramerters in networks is {}  ".format(sum(x.numel() for x in net.parameters())))

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Start training and testing:

 Epoch  | 每40个Batch |   训练集 Loss   |  测试集 Loss  |  测试集准确率   |    时间    
--------------------------------------------------------------------------------
   1    |     5      |    0.609848    |      -       |       -       |  622.96  
--------------------------------------------------------------------------------
   1    |     -      |    0.609848    |   0.424712   |    90.00    % |  650.13  
--------------------------------------------------------------------------------


 Epoch  | 每40个Batch |   训练集 Loss   |  测试集 Loss  |  测试集准确率   |    时间    
--------------------------------------------------------------------------------
   2    |     5      |    0.288218    |      -       |       -       |  577.71  
--------------------------------------------------------------------------------
   2    |     -      |    0.288218    |   0.400553   |    80.00    % |  605.35  
--------------------------------------------------------------------------------


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Total number of paramerters in networks is 102344750  


In [24]:
input_ids,attention_mask=preprocessing_for_bert("Rotary Transformer，简称RoFormer，是我们自研的语言模型之一")
bert_classifier.forward(input_ids,attention_mask)


tensor([[ 0.4093, -0.4306],
        [ 0.5259, -0.4262],
        [ 0.1492, -0.2210],
        [ 0.2906, -0.3334],
        [ 0.3153, -0.4795],
        [ 0.3258, -0.2974],
        [ 0.1630,  0.0341],
        [ 0.4093, -0.4306],
        [ 0.3153, -0.4795],
        [ 0.2906, -0.3334],
        [ 0.2170, -0.1598],
        [ 0.2492, -0.2989],
        [ 0.1096, -0.2520],
        [ 0.5259, -0.4262],
        [ 0.3153, -0.4795],
        [ 0.0818, -0.1715],
        [ 0.2909, -0.4498],
        [ 0.3153, -0.4795],
        [ 0.2359, -0.1729],
        [ 0.0022, -0.0641],
        [ 0.1837, -0.1641],
        [ 0.4093, -0.4306],
        [ 0.5259, -0.4262],
        [ 0.4093, -0.4306],
        [ 0.5259, -0.4262],
        [ 0.3153, -0.4795],
        [ 0.0818, -0.1715],
        [ 0.2909, -0.4498],
        [ 0.3153, -0.4795],
        [ 0.2359, -0.1729],
        [ 0.4051, -0.3189],
        [ 0.1613, -0.1567],
        [ 0.1889, -0.3006],
        [ 0.2185, -0.0952],
        [ 0.1476, -0.0853],
        [ 0.3015, -0