In [23]:
import torch
import logging
import os

# Logging setup: the root logger emits INFO and above to ./log/log.log and to the console.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")

# Re-running this cell must not stack another pair of handlers on the root
# logger (every extra pair duplicates each log line), so remove the handlers
# installed by a previous run before adding fresh ones.
for _stale_handler in [h for h in logger.handlers if h.get_name() in ('notebook_file', 'notebook_console')]:
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs('./log/', exist_ok=True)
fh = logging.FileHandler('./log/log.log', mode='a', encoding='utf-8')
fh.set_name('notebook_file')
fh.setFormatter(formatter)
console = logging.StreamHandler()
console.set_name('notebook_console')
console.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s: %(message)s"))
console.setLevel(logging.INFO)
logger.addHandler(fh)
logger.addHandler(console)

# Hyper-parameters and paths
dataset = 'data/THUCnews'
# One class name per line; the context manager closes the file handle promptly.
with open(os.path.join(dataset, 'class.txt'), 'r', encoding='utf8') as class_file:
    labels_name = [w.strip() for w in class_file]
pad_size = 32  # every encoded sequence is padded/truncated to this many tokens
device = torch.device('cpu')  # cpu
# device = torch.device('cuda')  # switch to this line to run on a GPU
batch_size = 4
bert_path = 'bert-base-chinese'
num_classes = len(labels_name)
weight_decay = 0.02
num_epochs = 30
learning_rate = 5e-5
bert_lr_ratio = 0.2  # multiplier applied to learning_rate for the BERT parameter groups
dropout = 0.1
patience = 6  # early-stopping tolerance, in epochs without improvement
save_path = 'saved_dict/bert.ckpt'  # where the best model state_dict is written


In [24]:
from tqdm import tqdm
import os


# Read one dataset split from disk
def data_loader(file_path):
    """Load ``text<TAB>label`` rows from a UTF-8 text file.

    Blank lines are skipped. Each remaining line is split on its LAST tab, so
    a text that itself contains tab characters no longer breaks parsing.

    Args:
        file_path: path of the file to read.

    Returns:
        A list of ``(content, label)`` tuples; both elements are strings.

    Raises:
        ValueError: if a non-blank line contains no tab separator.
    """
    contents = []
    with open(file_path, 'r', encoding='UTF-8') as f:
        for line_no, line in enumerate(tqdm(f), start=1):
            lin = line.strip()
            if not lin:
                continue
            # rpartition keeps everything before the final tab as the text.
            content, sep, label = lin.rpartition('\t')
            if not sep:
                raise ValueError(
                    '{}:{}: expected "<text>\\t<label>", got {!r}'.format(file_path, line_no, lin))
            contents.append((content, label))
    return contents


# Load the train / dev / test splits (in that order) from the dataset directory.
train_data, dev_data, test_data = (
    data_loader(os.path.join(dataset, split_file))
    for split_file in ('train.txt', 'dev.txt', 'test.txt')
)

# Quick sanity check: a few raw test rows and the class names.
print(test_data[:5])
print(labels_name)

28000it [00:00, 925216.55it/s]
7000it [00:00, 957041.79it/s]
7000it [00:00, 1034280.76it/s]

[('中国春节推涨全球黄金市场', '13'), ('韩网游企业纷纷进军Facebook游戏领域', '9'), ('跟降热盘 新里西斯莱公馆精装3-4居19000起', '4'), ('组图：《网球王子2》上海开机 张德培加盟演出', '1'), ('资金爆炒 郑糖开始宽幅振荡', '13')]
['体育', '娱乐', '家居', '彩票', '房产', '教育', '时尚', '时政', '星座', '游戏', '社会', '科技', '股票', '财经']





In [25]:
from transformers import BertTokenizer, BertModel,AutoModel,AutoTokenizer

# Encode raw (text, label) pairs into fixed-length model inputs
def encode_data(tokenizer, data):
    """Tokenise every example to exactly ``pad_size`` token ids.

    Uses the current ``padding='max_length'`` / ``truncation=True`` tokenizer
    arguments instead of the deprecated ``pad_to_max_length`` and
    ``truncation_strategy`` keywords.

    Args:
        tokenizer: object exposing ``encode(...)``; its ``pad_token_id``
            attribute (0 when missing or ``None``) identifies padding.
        data: sized iterable of ``(text, label)`` pairs, where ``label`` is
            convertible with ``int()``.

    Returns:
        A list of ``(input_ids, attention_mask, label)`` tuples, where
        ``attention_mask`` is 1 for every non-padding position and 0 for
        padding, and ``label`` is an ``int``.
    """
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0  # the value the mask was originally hard-coded against

    data_encoded = []
    for text, label in tqdm(data, total=len(data)):
        inputs = tokenizer.encode(text=text, max_length=pad_size, truncation=True,
                                  add_special_tokens=True, padding='max_length')
        attention_mask = [0 if token_id == pad_id else 1 for token_id in inputs]
        data_encoded.append((inputs, attention_mask, int(label)))
    return data_encoded


# Tokenizer loaded from the same checkpoint name as the model (`bert_path`).
tokenizer = BertTokenizer.from_pretrained(bert_path)

# Replace each split's raw (text, label) pairs with encoded triples, in
# train / dev / test order.
# NOTE(review): the names are rebound in place, so this cell cannot be re-run
# without first re-running the cell that loads the raw splits.
train_data, dev_data, test_data = (
    encode_data(tokenizer, raw_split)
    for raw_split in (train_data, dev_data, test_data)
)

print(train_data[0])

100%|██████████| 28000/28000 [00:03<00:00, 7685.28it/s]
100%|██████████| 7000/7000 [00:00<00:00, 7636.90it/s]
100%|██████████| 7000/7000 [00:01<00:00, 5916.89it/s]

([101, 8170, 7770, 5440, 2562, 2703, 1856, 2845, 8038, 6237, 6438, 2218, 689, 3297, 1391, 7676, 4638, 758, 1920, 683, 689, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 5)





In [26]:
from torch.utils.data import TensorDataset, DataLoader
import torch
# Wrap the encoded splits in datasets and batch iterators.
# zip(*split) transposes the list of (input_ids, mask, label) triples into
# three parallel sequences, each of which becomes one LongTensor.
train_dataset = TensorDataset(*[torch.LongTensor(x).to(device) for x in zip(*train_data)])
dev_dataset = TensorDataset(*[torch.LongTensor(x).to(device) for x in zip(*dev_data)])
test_dataset = TensorDataset(*[torch.LongTensor(x).to(device) for x in zip(*test_data)])
# Training batches are reshuffled every epoch so the optimiser does not see
# the examples in file order on each pass; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=False)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False, drop_last=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=False)
# Peek at the first test batch as a sanity check.
test_iter = iter(test_loader)
print(next(test_iter))

[tensor([[ 101,  704, 1744, 3217, 5688, 2972, 3885, 1059, 4413, 7942, 7032, 2356,
         1767,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [ 101, 7506, 5381, 3952,  821,  689, 5290, 5290, 6822, 1092,  100, 3952,
         2767, 7566, 1818,  102,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [ 101, 6656, 7360, 4178, 4669, 3173, 7027, 6205, 3172, 5812, 1062, 7667,
         5125, 6163,  124,  118,  125, 2233, 8985, 8129, 6629,  102,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [ 101, 5299, 1745, 8038,  517, 5381, 4413, 4374, 2094,  123,  518,  677,
         3862, 2458, 3322, 2476, 2548, 1824, 1217, 4673, 4028, 1139,  102,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0],
  

In [27]:
import torch.nn as nn

# Network definition
class Model(nn.Module):
    """Pre-trained BERT encoder followed by one linear classification layer."""

    def __init__(self, bert_path, hidden_size, num_classes):
        """Build the network.

        Args:
            bert_path: name or path handed to ``BertModel.from_pretrained``.
            hidden_size: input width of the linear classifier.
            num_classes: number of output logits.
        """
        super().__init__()
        # Load the pre-trained encoder and keep all of its weights trainable
        # so they are fine-tuned together with the classifier.
        self.bert = BertModel.from_pretrained(bert_path)
        self.bert.requires_grad_(True)
        # Linear classification layer.
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the classifier output for a batch.

        Args:
            x: indexable whose element 0 is the token-id tensor and element 1
                is the padding mask of the same size (0 marks padding, e.g.
                ``[1, 1, 1, 1, 0, 0]``).
        """
        token_ids, padding_mask = x[0], x[1]
        # With return_dict=False the encoder returns a tuple; only its second
        # element is fed to the classifier.
        _first_output, pooled_output = self.bert(token_ids, attention_mask=padding_mask, return_dict=False)
        return self.fc(pooled_output)


# Instantiate the model and move it to the configured device.
# 768 is the width passed to the linear layer as `hidden_size`.
model = Model(bert_path, hidden_size=768, num_classes=num_classes)
model = model.to(device)
print(model.parameters)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<bound method Module.parameters of Model(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,),

In [28]:
from transformers import AdamW

def get_optimizer(model):
    """Build an AdamW optimizer with four parameter groups.

    Parameters are partitioned by name:

    * parameters under the ``bert.`` prefix use ``learning_rate * bert_lr_ratio``,
      all other parameters use ``learning_rate``;
    * parameters whose name contains ``bias``, ``LayerNorm.bias`` or
      ``LayerNorm.weight`` get ``weight_decay=0.0``, the rest inherit the
      optimizer-level ``weight_decay``.

    The earlier implementation compared ``id()`` values of ``(name, param)``
    tuples and of temporary tensor slices against ``id()`` values of the
    parameters, which never matched, so every parameter ended up in the last
    group with the full learning rate and weight decay.

    Args:
        model: module whose ``named_parameters()`` are optimised; the encoder
            is expected to be registered under the attribute name ``bert``.

    Returns:
        The configured ``AdamW`` optimizer. Groups, when non-empty, are in the
        order: BERT with decay, BERT without decay, non-BERT without decay,
        non-BERT with decay.
    """
    no_decay = ('bias', 'LayerNorm.bias', 'LayerNorm.weight')

    bert_base_params = []
    bert_no_weight_decay_params = []
    base_no_weight_decay_params = []
    base_params = []
    for name, param in model.named_parameters():
        is_bert = name.startswith('bert.')
        skip_decay = any(nwd in name for nwd in no_decay)
        if is_bert and skip_decay:
            bert_no_weight_decay_params.append(param)
        elif is_bert:
            bert_base_params.append(param)
        elif skip_decay:
            base_no_weight_decay_params.append(param)
        else:
            base_params.append(param)

    params = [{"params": bert_base_params, "lr": learning_rate * bert_lr_ratio},
              {"params": bert_no_weight_decay_params, "lr": learning_rate * bert_lr_ratio,
               "weight_decay": 0.0},
              {"params": base_no_weight_decay_params, "lr": learning_rate, "weight_decay": 0.0},
              {"params": base_params, "lr": learning_rate}]
    # Drop groups that received no parameters so the optimizer only tracks real work.
    params = [group for group in params if group["params"]]

    # Configure the AdamW optimizer; per-group settings override these defaults.
    optimizer = AdamW(params, lr=learning_rate, weight_decay=weight_decay)

    return optimizer


# Build the optimizer for the model instantiated above.
optimizer = get_optimizer(model)




In [29]:
import numpy as np
import torch.nn.functional as F
from sklearn import metrics
from sklearn.metrics import f1_score


def evaluate(model, data_iter):
    """Score ``model`` on every batch of ``data_iter`` without gradient tracking.

    Each batch is a sequence whose first two elements are passed to the model
    as a tuple and whose last element holds the integer labels.

    Args:
        model: callable module; it is switched to eval mode and left there.
        data_iter: sized iterable of batches (e.g. a ``DataLoader``).

    Returns:
        ``(accuracy, macro_f1, mean_loss, report, confusion)`` where
        ``mean_loss`` is a Python float, ``report`` is the text returned by
        ``classification_report`` and ``confusion`` is the confusion matrix.
    """
    model.eval()
    loss_total = 0.0
    # Collect per-batch arrays and concatenate once; np.append inside the loop
    # re-copies everything accumulated so far on every batch.
    label_batches = []
    predict_batches = []
    with torch.no_grad():
        # Iterate batches directly: unpacking `step, batch` without
        # enumerate() raised ValueError on the three-tensor batches.
        for batch in tqdm(data_iter):
            batch = [x.to(device) for x in batch]
            outputs = model((batch[0], batch[1]))
            loss = F.cross_entropy(outputs, batch[-1])
            loss_total += loss.item()  # accumulate a float, not a tensor
            label_batches.append(batch[-1].data.cpu().numpy())
            predict_batches.append(torch.max(outputs.data, 1)[1].cpu().numpy())

    labels_all = np.concatenate(label_batches) if label_batches else np.array([], dtype=int)
    predict_all = np.concatenate(predict_batches) if predict_batches else np.array([], dtype=int)

    # Pin the label set so `target_names` lines up even if a class is absent
    # from this split.
    class_ids = list(range(len(labels_name)))
    acc = metrics.accuracy_score(labels_all, predict_all)
    f1 = f1_score(labels_all, predict_all, average='macro')
    report = metrics.classification_report(labels_all, predict_all, labels=class_ids,
                                           target_names=labels_name, digits=4)
    confusion = metrics.confusion_matrix(labels_all, predict_all, labels=class_ids)

    # The epsilon keeps the division defined for an empty iterator.
    return acc, f1, loss_total / (len(data_iter) + 1e-10), report, confusion


def test(model, test_iter):
    """Reload the checkpoint at ``save_path`` and log test-set metrics.

    Args:
        model: module whose ``state_dict`` layout matches the saved checkpoint.
        test_iter: sized iterable of test batches, passed to ``evaluate``.
    """
    # map_location lets a checkpoint saved on one device (e.g. a GPU) be
    # restored when `device` is a different one (e.g. the CPU).
    model.load_state_dict(torch.load(save_path, map_location=device))
    model.eval()
    test_acc, test_f1, test_loss, test_report, test_confusion = evaluate(model, test_iter)
    msg = 'Test Loss: {0:>5.2},  Test Acc: {1:>6.2%}, Test F1:{2:>6.2%}'
    logger.info(msg.format(test_loss, test_acc, test_f1))
    logger.info("Precision, Recall and F1-Score...")
    logger.info(test_report)
    logger.info("Confusion Matrix...")
    logger.info(test_confusion)


def train(model, train_iter, dev_iter, test_iter, optimizer):
    """Fine-tune ``model`` with early stopping, then report test metrics.

    After every epoch the model is scored on ``dev_iter``. Whenever the dev
    macro-F1 improves, the ``state_dict`` is written to ``save_path``.
    Training stops once more than ``patience`` epochs pass without
    improvement, or after ``num_epochs`` epochs. Finally ``test`` reloads the
    saved checkpoint and logs test-set metrics.

    Args:
        model: module called as ``model((input_ids, mask))``.
        train_iter: sized iterable of training batches.
        dev_iter: sized iterable of validation batches.
        test_iter: sized iterable of test batches.
        optimizer: optimizer stepping the model's parameters.
    """
    dev_best_f1 = float('-inf')
    last_improve_epoch = 0

    # torch.save does not create missing directories, so make sure the
    # checkpoint folder exists before the first save.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    for epoch in range(num_epochs):
        logger.info('Epoch [{}/{}]'.format(epoch + 1, num_epochs))

        # evaluate() leaves the model in eval mode, so re-enable training
        # mode at the start of every epoch.
        model.train()

        # Per-epoch bookkeeping
        label_batches = []
        predict_batches = []
        train_loss_list = []
        t = tqdm(train_iter, leave=False, total=len(train_iter), desc='Training')
        for step, batch in enumerate(t):
            batch = [x.to(device) for x in batch]
            model.zero_grad()
            outputs = model((batch[0], batch[1]))
            loss = F.cross_entropy(outputs, batch[-1])
            train_loss_list.append(loss.item())
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # gradient clipping
            optimizer.step()

            # True and predicted labels for this batch, as numpy arrays
            predict_batches.append(torch.max(outputs.data, 1)[1].cpu().numpy())
            label_batches.append(batch[-1].cpu().data.numpy())

        # Concatenate once per epoch instead of np.append on every step.
        train_labels_all = np.concatenate(label_batches) if label_batches else np.array([], dtype=int)
        train_predicts_all = np.concatenate(predict_batches) if predict_batches else np.array([], dtype=int)

        # Training-set metrics
        train_loss = sum(train_loss_list) / (len(train_loss_list) + 1e-10)
        train_acc = metrics.accuracy_score(train_labels_all, train_predicts_all)
        train_f1 = metrics.f1_score(train_labels_all, train_predicts_all, average='macro')

        dev_acc, dev_f1, dev_loss, report, confusion = evaluate(model, dev_iter)
        msg = 'Train Loss: {0:>5.6},  Train Acc: {1:>6.4%},  Train F1: {2:>6.4%},  Val Loss: {3:>5.4},  Val Acc: {4:>6.4%},  Val F1: {5:>6.4%}'
        logger.info(msg.format(train_loss, train_acc, train_f1, dev_loss, dev_acc, dev_f1))
        logger.info("Precision, Recall and F1-Score...")
        logger.info(report)
        logger.info("Confusion Matrix...")
        logger.info(confusion)

        # Keep only the checkpoint with the best dev macro-F1 so far.
        if dev_f1 > dev_best_f1:
            dev_best_f1 = dev_f1
            torch.save(model.state_dict(), save_path)
            last_improve_epoch = epoch

        # Early stopping
        if epoch - last_improve_epoch > patience:
            logger.info("No optimization for a long time, auto-stopping...")
            break

    test(model, test_iter)


# Run the full train -> validate -> test cycle.
train(
    model,
    train_iter=train_loader,
    dev_iter=dev_loader,
    test_iter=test_loader,
    optimizer=optimizer,
)

2023-04-21 14:21:29,966 - INFO: Epoch [1/30]
                                                            

KeyboardInterrupt: 