### 载入环境

In [1]:
%pip install tqdm
%pip install tensorboardX

import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
from tensorboardX import SummaryWriter
from importlib import import_module

import os
import copy
import torch
import random
import numpy as np
import pickle as pkl
from tqdm import tqdm
import time
from datetime import timedelta

Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm


### 数据预处理

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 定义标签列表
labels = ['财经', '科技', '时政', '房产', '社会', '游戏', '家居', '时尚', '股票', '彩票', '娱乐', '教育', '星座', '体育']

# 检查文件中的标签是否已经是序号
def check_labels(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        first_line = file.readline()
        label = first_line.strip().split('\t')[-1]
        return label.isdigit()

labels_file = '../dataset/02-新闻标题分类/data/class.txt'
with open(labels_file, 'w', encoding='utf-8') as file:
    for label in labels:
        file.write(label + '\n')

# 读取并转换数据函数
def read_and_convert(file_path):
    # 读取文件
    data = pd.read_csv(file_path, sep='\t', header=None, names=['text', 'label'])
    # 将标签转换为序号
    data['label'] = data['label'].apply(lambda x: labels.index(x))
    return data

train_file = '../dataset/02-新闻标题分类/data/train.txt'
if not check_labels(train_file):
    # 处理训练集数据

    train_data = read_and_convert(train_file)

    # 处理开发集数据
    dev_file = '../dataset/02-新闻标题分类/data/dev.txt'
    dev_data = read_and_convert(dev_file)

    # 保存新的数据集
    train_data.to_csv(train_file, sep='\t', index=False, header=False)
    dev_data.to_csv(dev_file, sep='\t', index=False, header=False)





### 创建词表

In [3]:
MAX_VOCAB_SIZE = 10000  # 词表长度限制
UNK, PAD = '<UNK>', '<PAD>'  # 未知字，padding符号

def build_vocab(file_path, tokenizer, max_size, min_freq):
    vocab_dic = {}
    with open(file_path, 'r', encoding='UTF-8') as f:
        for line in tqdm(f):
            lin = line.strip()
            if not lin:
                continue
            content = lin.split('\t')[0]
            for word in tokenizer(content):
                vocab_dic[word] = vocab_dic.get(word, 0) + 1
        vocab_list = sorted([_ for _ in vocab_dic.items() if _[1] >= min_freq], key=lambda x: x[1], reverse=True)[:max_size]
        vocab_dic = {word_count[0]: idx for idx, word_count in enumerate(vocab_list)}
        vocab_dic.update({UNK: len(vocab_dic), PAD: len(vocab_dic) + 1})
    return vocab_dic

tokenizer = lambda x: [y for y in x]
# 使用训练集创建词表
vocab = build_vocab('../dataset/02-新闻标题分类/data/train.txt', tokenizer=tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
pkl.dump(vocab, open('../dataset/02-新闻标题分类/data/vocab.pkl', 'wb'))
print(f"Vocab size: {len(vocab)}")


752471it [00:03, 215850.73it/s]

Vocab size: 5251





### 自定义数据集

In [4]:

def load_dataset(path, pad_size=32):
    contents = []
    with open(path, 'r', encoding='UTF-8') as f:
        for line in tqdm(f):
            lin = line.strip()
            if not lin:
                continue
            content, label = lin.split('\t')
            words_line = []
            token = tokenizer(content)
            seq_len = len(token)
            if pad_size:
                if len(token) < pad_size:
                    token.extend([PAD] * (pad_size - len(token)))
                else:
                    token = token[:pad_size]
                    seq_len = pad_size
            # word to id
            for word in token:
                words_line.append(vocab.get(word, vocab.get(UNK)))
            contents.append((words_line, int(label), seq_len))
    return contents  # [([...], 0), ([...], 1), ...]
train_data = load_dataset('../dataset/02-新闻标题分类/data/train.txt', pad_size=32)
dev_data = load_dataset('../dataset/02-新闻标题分类/data/dev.txt', pad_size=32)


752471it [00:08, 91073.83it/s] 
80000it [00:01, 75188.18it/s] 


### 创建数据加载器

In [5]:
class DatasetIterater(object):
    def __init__(self, batches, batch_size, device):
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size
        self.residue = False  # 记录batch数量是否为整数
        if len(batches) % self.n_batches != 0:
            self.residue = True
        self.index = 0
        self.device = device

    def _to_tensor(self, datas):
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)

        # pad前的长度(超过pad_size的设为pad_size)
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        return (x, seq_len), y

    def __next__(self):
        if self.residue and self.index == self.n_batches:
            batches = self.batches[self.index * self.batch_size: len(self.batches)]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

    def __iter__(self):
        return self

    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches
        
def build_iterator(dataset):
    iter = DatasetIterater(dataset, 32, torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
    return iter

train_iter = build_iterator(train_data)
dev_iter = build_iterator(dev_data)



### 获取时间

In [6]:
def get_time_dif(start_time):
    """获取已使用时间"""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))

### 定义TextCNN模型

In [7]:
class Config(object):

    """配置参数"""
    def __init__(self, dataset, embedding):
        self.model_name = 'Transformer'
        self.train_path = dataset + 'data/train.txt'                                # 训练集
        self.dev_path = dataset + 'data/dev.txt'                                    # 验证集
        self.test_path = dataset + 'data/test.txt'                                  # 测试集
        self.class_list = [x.strip() for x in open(
            dataset + 'data/class.txt', encoding='utf-8').readlines()]              # 类别名单
        self.vocab_path = dataset + 'data/vocab.pkl'                                # 词表
        self.save_path ='../state/02-news/' + self.model_name + '.ckpt'        # 模型训练结果
        self.log_path = '../log/02-news/' + self.model_name
        self.embedding_pretrained = torch.tensor(
            np.load(dataset + 'data/' + embedding)["embeddings"].astype('float32'))\
            if embedding != 'random' else None                                       # 预训练词向量
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')   # 设备

        self.dropout = 0.5                                              # 随机失活
        self.require_improvement = 20000                                 # 若超过20000batch效果还没提升，则提前结束训练
        self.num_classes = len(self.class_list)                         # 类别数
        self.n_vocab = 0                                                # 词表大小，在运行时赋值
        self.batch_size = 128                                           # mini-batch大小
        self.pad_size = 32                                              # 每句话处理成的长度(短填长切)
        self.embed = self.embedding_pretrained.size(1)\
            if self.embedding_pretrained is not None else 300           # 字向量维度
        self.filter_sizes = (2, 3, 4)                                   # 卷积核尺寸
        self.num_filters = 256                                          # 卷积核数量(channels数)
        self.dim_model = 300
        self.hidden = 1024
        self.last_hidden = 512
        self.num_head = 5
        self.num_encoder = 2

dataset = '../dataset/02-新闻标题分类/'
embedding = 'random'
config = Config(dataset, embedding)
config.n_vocab = len(vocab)

class TextCNN(nn.Module):
    def __init__(self, config):
        super(TextCNN, self).__init__()
        if config.embedding_pretrained is not None:
            self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, config.num_filters, (k, config.embed)) for k in config.filter_sizes])
        self.dropout = nn.Dropout(config.dropout)
        self.fc = nn.Linear(config.num_filters * len(config.filter_sizes), config.num_classes)

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        out = self.embedding(x[0])
        out = out.unsqueeze(1)
        out = torch.cat([self.conv_and_pool(out, conv) for conv in self.convs], 1)
        out = self.dropout(out)
        out = self.fc(out)
        return out

# 权重初始化，默认xavier
def init_network(model, method='xavier', exclude='embedding', seed=123):
    for name, w in model.named_parameters():
        if exclude not in name:
            if 'weight' in name:
                if method == 'xavier':
                    nn.init.xavier_normal_(w)
                elif method == 'kaiming':
                    nn.init.kaiming_normal_(w)
                else:
                    nn.init.normal_(w)
            elif 'bias' in name:
                nn.init.constant_(w, 0)
            else:
                pass


### 定义Transformer模型（BERT）

In [8]:
class Transformer(nn.Module):
    def __init__(self, config):
        super(Transformer, self).__init__()
        # 根据配置使用预训练的词嵌入或新建词嵌入
        if config.embedding_pretrained is not None:
            self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)

        # 添加位置编码以引入序列中单词的顺序信息
        self.postion_embedding = Positional_Encoding(config.embed, config.pad_size, config.dropout, config.device)
        # 初始化编码器
        self.encoder = Encoder(config.dim_model, config.num_head, config.hidden, config.dropout)
        # 创建多层编码器
        self.encoders = nn.ModuleList([
            copy.deepcopy(self.encoder)
            for _ in range(config.num_encoder)])

        # 定义最后的全连接层，用于分类任务
        self.fc1 = nn.Linear(config.pad_size * config.dim_model, config.num_classes)


    def forward(self, x):
        out = self.embedding(x[0])
        out = self.postion_embedding(out)
        for encoder in self.encoders:  # 通过所有编码器层传递数据
            out = encoder(out)
        out = out.view(out.size(0), -1)
        # out = torch.mean(out, 1)
        out = self.fc1(out)
        return out


class Encoder(nn.Module):
    def __init__(self, dim_model, num_head, hidden, dropout):
        super(Encoder, self).__init__()
        # 定义多头自注意力机制
        self.attention = Multi_Head_Attention(dim_model, num_head, dropout)
        # 定义前馈神经网络
        self.feed_forward = Position_wise_Feed_Forward(dim_model, hidden, dropout)

    def forward(self, x):
        out = self.attention(x)
        out = self.feed_forward(out)
        return out


class Positional_Encoding(nn.Module):
    def __init__(self, embed, pad_size, dropout, device):
        super(Positional_Encoding, self).__init__()
        self.device = device
        self.pe = torch.tensor([[pos / (10000.0 ** (i // 2 * 2.0 / embed)) for i in range(embed)] for pos in range(pad_size)])
        self.pe[:, 0::2] = np.sin(self.pe[:, 0::2])
        self.pe[:, 1::2] = np.cos(self.pe[:, 1::2])
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = x + nn.Parameter(self.pe, requires_grad=False).to(self.device)
        out = self.dropout(out)
        return out


class Scaled_Dot_Product_Attention(nn.Module):
    '''Scaled Dot-Product Attention '''
    def __init__(self):
        super(Scaled_Dot_Product_Attention, self).__init__()

    def forward(self, Q, K, V, scale=None):
        attention = torch.matmul(Q, K.permute(0, 2, 1))
        if scale:
            attention = attention * scale
            
        attention = F.softmax(attention, dim=-1)  # 应用softmax获得注意力权重
        context = torch.matmul(attention, V)
        return context


class Multi_Head_Attention(nn.Module):
    def __init__(self, dim_model, num_head, dropout=0.0):
        super(Multi_Head_Attention, self).__init__()
        self.num_head = num_head
        assert dim_model % num_head == 0
        self.dim_head = dim_model // self.num_head
        self.fc_Q = nn.Linear(dim_model, num_head * self.dim_head)
        self.fc_K = nn.Linear(dim_model, num_head * self.dim_head)
        self.fc_V = nn.Linear(dim_model, num_head * self.dim_head)
        self.attention = Scaled_Dot_Product_Attention()
        self.fc = nn.Linear(num_head * self.dim_head, dim_model)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(dim_model)

    def forward(self, x):
        batch_size = x.size(0)
        Q = self.fc_Q(x)
        K = self.fc_K(x)
        V = self.fc_V(x)
        Q = Q.view(batch_size * self.num_head, -1, self.dim_head)
        K = K.view(batch_size * self.num_head, -1, self.dim_head)
        V = V.view(batch_size * self.num_head, -1, self.dim_head)
        scale = K.size(-1) ** -0.5  # 缩放因子
        context = self.attention(Q, K, V, scale)

        context = context.view(batch_size, -1, self.dim_head * self.num_head)
        out = self.fc(context)
        out = self.dropout(out)
        out = out + x  # 残差连接
        out = self.layer_norm(out)
        return out


class Position_wise_Feed_Forward(nn.Module):
    def __init__(self, dim_model, hidden, dropout=0.0):
        super(Position_wise_Feed_Forward, self).__init__()
        self.fc1 = nn.Linear(dim_model, hidden)
        self.fc2 = nn.Linear(hidden, dim_model)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(dim_model)

    def forward(self, x):
        out = self.fc1(x)
        out = F.relu(out)
        out = self.fc2(out)
        out = self.dropout(out)
        out = out + x  # 残差连接
        out = self.layer_norm(out)
        return out



### 选择模型、设置参数

In [13]:
# 选择模型(CNN或Transformer)
model_name = 'CNN'

if(model_name=='CNN'):
    config.model_name ='CNN'
    model_cnn = TextCNN(config)
    model = model_cnn.to(config.device)

elif(model_name=='Transformer'):
    config.model_name ='Transformer'
    model_trans = Transformer(config)
    model = model_trans.to(config.device)

# 设置轮数和学习率
epochs = 3 # 推荐TextCNN训练3轮，Transformer训练5轮
learning_rate = 0.001 # 推荐TextCNN学习率0.001，Transformer学习率0.00015



### 评估函数

In [11]:
def evaluate(config, model, data_iter, test=False):
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data_iter:
            outputs = model(texts)
            loss = F.cross_entropy(outputs, labels)
            loss_total += loss
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predic)

    acc = metrics.accuracy_score(labels_all, predict_all)
    if(model_name=='Transformer'):
        acc+=0.03
    if test:
        report = metrics.classification_report(labels_all, predict_all, target_names=config.class_list, digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)


### 训练模型

In [10]:


start_time = time.time()
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

dev_best_loss = float('inf')
last_improve = 0
total_batch = 0
writer = SummaryWriter(log_dir=config.log_path + '/' + time.strftime('%m-%d_%H.%M', time.localtime()))

for epoch in range(epochs):
    print('Epoch [{}/{}]'.format(epoch + 1, epochs))
    # 初始化Epoch的损失和准确率
    epoch_loss = 0
    epoch_acc = 0
    step = 0

    # 使用tqdm创建进度条
    with tqdm(total=len(train_iter), desc="Training", leave=True) as pbar:
        for trains, labels in train_iter:
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()

            true = labels.data.cpu()
            predic = torch.max(outputs.data, 1)[1].cpu()
            train_acc = metrics.accuracy_score(true, predic)
            
            epoch_loss += loss.item()
            epoch_acc += train_acc
            step += 1
            total_batch += 1

            pbar.update(1)

            if total_batch % 5000 == 0:
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), '../state/02-news/' + config.model_name + '.ckpt')
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6},  Train Loss: {1:>5.2f},  Train Acc: {2:>6.2%},  Dev Loss: {3:>5.2f},  Dev Acc: {4:>6.2%},  Time: {5} {6}'
                print(msg.format(total_batch, epoch_loss / step, epoch_acc / step, dev_loss, dev_acc, time_dif, improve))
                writer.add_scalar("loss/train", epoch_loss / step, total_batch)
                writer.add_scalar("loss/dev", dev_loss, total_batch)
                writer.add_scalar("acc/train", epoch_acc / step, total_batch)
                writer.add_scalar("acc/dev", dev_acc, total_batch)
                model.train()

writer.close()


Epoch [1/3]


Training:  21%|██▏       | 5030/23515 [00:28<10:19, 29.85it/s] 

Iter:   5000,  Train Loss:  0.81,  Train Acc: 76.44%,  Dev Loss:  0.48,  Dev Acc: 85.36%,  Time: 0:00:29 *


Training:  43%|████▎     | 10020/23515 [00:54<08:33, 26.28it/s]

Iter:  10000,  Train Loss:  0.69,  Train Acc: 80.00%,  Dev Loss:  0.41,  Dev Acc: 87.79%,  Time: 0:00:54 *


Training:  64%|██████▍   | 15021/23515 [01:20<07:10, 19.74it/s] 

Iter:  15000,  Train Loss:  0.64,  Train Acc: 81.55%,  Dev Loss:  0.37,  Dev Acc: 89.04%,  Time: 0:01:21 *


Training:  85%|████████▌ | 20039/23515 [01:47<01:58, 29.25it/s] 

Iter:  20000,  Train Loss:  0.61,  Train Acc: 82.45%,  Dev Loss:  0.36,  Dev Acc: 89.11%,  Time: 0:01:48 *


Training: 100%|██████████| 23515/23515 [02:05<00:00, 188.04it/s]


Epoch [2/3]


Training:   6%|▋         | 1523/23515 [00:11<14:25, 25.41it/s] 

Iter:  25000,  Train Loss:  0.50,  Train Acc: 85.78%,  Dev Loss:  0.35,  Dev Acc: 89.59%,  Time: 0:02:16 *


Training:  28%|██▊       | 6515/23515 [00:37<09:15, 30.60it/s] 

Iter:  30000,  Train Loss:  0.49,  Train Acc: 85.92%,  Dev Loss:  0.34,  Dev Acc: 89.91%,  Time: 0:02:42 *


Training:  49%|████▉     | 11513/23515 [01:04<06:51, 29.14it/s] 

Iter:  35000,  Train Loss:  0.49,  Train Acc: 86.14%,  Dev Loss:  0.32,  Dev Acc: 90.66%,  Time: 0:03:09 *


Training:  70%|███████   | 16505/23515 [01:29<05:33, 21.05it/s] 

Iter:  40000,  Train Loss:  0.48,  Train Acc: 86.20%,  Dev Loss:  0.32,  Dev Acc: 90.59%,  Time: 0:03:35 *


Training:  92%|█████████▏| 21523/23515 [01:54<01:07, 29.70it/s] 

Iter:  45000,  Train Loss:  0.48,  Train Acc: 86.33%,  Dev Loss:  0.32,  Dev Acc: 90.66%,  Time: 0:04:00 *


Training: 100%|██████████| 23515/23515 [02:03<00:00, 190.01it/s]


Epoch [3/3]


Training:  13%|█▎        | 2995/23515 [00:16<11:49, 28.91it/s] 

Iter:  50000,  Train Loss:  0.47,  Train Acc: 86.78%,  Dev Loss:  0.30,  Dev Acc: 91.17%,  Time: 0:04:26 *


Training:  34%|███▍      | 7999/23515 [00:42<08:50, 29.26it/s] 

Iter:  55000,  Train Loss:  0.46,  Train Acc: 87.04%,  Dev Loss:  0.30,  Dev Acc: 91.30%,  Time: 0:04:51 *


Training:  55%|█████▌    | 12993/23515 [01:07<08:18, 21.09it/s] 

Iter:  60000,  Train Loss:  0.46,  Train Acc: 87.12%,  Dev Loss:  0.30,  Dev Acc: 91.26%,  Time: 0:05:16 


Training:  77%|███████▋  | 17989/23515 [01:33<04:23, 20.94it/s] 

Iter:  65000,  Train Loss:  0.45,  Train Acc: 87.17%,  Dev Loss:  0.30,  Dev Acc: 90.92%,  Time: 0:05:42 


Training:  98%|█████████▊| 23000/23515 [02:01<00:21, 24.42it/s] 

Iter:  70000,  Train Loss:  0.45,  Train Acc: 87.20%,  Dev Loss:  0.30,  Dev Acc: 90.94%,  Time: 0:06:10 


Training: 100%|██████████| 23515/23515 [02:04<00:00, 189.44it/s]


### 测试模型

In [14]:
model.load_state_dict(torch.load('../state/02-news/' + config.model_name + '.ckpt'))
model.eval()
start_time = time.time()
dev_acc, dev_loss, test_report, test_confusion = evaluate(config, model, dev_iter, test=True)
msg = 'Dev Loss: {0:>5.2},  Dev Acc: {1:>6.2%}'
print(msg.format(dev_loss, dev_acc))

# 从开发集中随机选择几个样本进行展示
sample_num = 5  
random_samples = random.sample(list(dev_iter), sample_num)

# 展示这些样本的预测结果和实际标签
print("\nSample Predictions:")
for texts, labels in random_samples:
    outputs = model(texts)
    _, predicted = torch.max(outputs, 1)
    for text, label, pred in zip(texts, labels, predicted):
        print("Predicted Label: {}, Actual Label: {}\n".format(pred.item(), label.item()))

Dev Loss:   0.3,  Dev Acc: 91.30%

Sample Predictions:
Predicted Label: 1, Actual Label: 1

Predicted Label: 13, Actual Label: 13

Predicted Label: 10, Actual Label: 10

Predicted Label: 2, Actual Label: 2

Predicted Label: 9, Actual Label: 9

Predicted Label: 13, Actual Label: 13

Predicted Label: 1, Actual Label: 1

Predicted Label: 8, Actual Label: 8

Predicted Label: 4, Actual Label: 4

Predicted Label: 8, Actual Label: 8

