[参考地址](https://blog.csdn.net/BGH12ET/article/details/108006895?spm=5176.21852664.0.0.3bf33dd7hLnyUq)

In [63]:
import random
import numpy as np
import pandas as pd
import torch

# Fix the seeds of every RNG used below so the fold split is reproducible.
seed = 666
random.seed(seed)
# all_data2fold shuffles with np.random, which random.seed() does not affect.
np.random.seed(seed)

In [64]:
def all_data2fold(fold_num, num=10000, data_file='data/train_set_sample.csv'):
    """Split the labelled corpus into ``fold_num`` equally sized folds.

    The CSV is read, its rows are shuffled and at most ``num`` of them are
    kept.  The samples of every label are first spread as evenly as possible
    over the folds; the folds are then levelled to exactly
    ``len(samples) // fold_num`` items each by moving surplus samples from
    larger folds into smaller ones.  Samples left over after levelling are
    dropped.  Finally each fold is shuffled.

    All randomness comes from ``np.random``, so seed it for a reproducible
    split.

    Args:
        fold_num: Number of folds to produce (must be >= 1).
        num: Maximum number of rows kept after shuffling.
        data_file: Path to a tab-separated file with ``text`` and ``label``
            columns.  Defaults to the sample set; pass 'data/train_set.csv'
            for the full training set.

    Returns:
        A list of ``fold_num`` dicts ``{'label': [...], 'text': [...]}``;
        labels are returned as strings.

    Raises:
        ValueError: If ``fold_num`` is smaller than 1.
    """
    if fold_num < 1:
        raise ValueError("fold_num must be a positive integer")

    df_all = pd.read_csv(data_file, sep='\t', encoding='utf-8')
    # Shuffle the rows, then keep (at most) the last `num` of them.
    df_all_shuffle = df_all.loc[np.random.permutation(df_all.index), :].tail(num)
    all_texts = df_all_shuffle['text'].tolist()
    all_labels = df_all_shuffle['label'].astype(str).tolist()

    # Group the sample positions by label.
    label2positions = {}
    for position, label in enumerate(all_labels):
        label2positions.setdefault(label, []).append(position)

    # Deal each label's samples out to the folds.  The first `extra` folds get
    # one more sample, so the slice start must be a running offset: using
    # `fold * base_size` would hand the same sample to two folds and never
    # reach the last `extra` samples of the label.
    all_index = [[] for _ in range(fold_num)]
    for positions in label2positions.values():
        base_size, extra = divmod(len(positions), fold_num)
        start = 0
        for fold in range(fold_num):
            cur_size = base_size + 1 if fold < extra else base_size
            all_index[fold].extend(positions[start:start + cur_size])
            start += cur_size

    # The per-label dealing leaves folds with slightly different sizes (earlier
    # folds are never smaller than later ones).  Level them: surplus samples
    # of oversized folds are parked in a spare pool that later, undersized
    # folds draw from.
    fold_size = len(all_labels) // fold_num
    spare_texts = []
    spare_labels = []
    spare_start = 0
    fold_data = []
    for fold in range(fold_num):
        texts = [all_texts[i] for i in all_index[fold]]
        labels = [all_labels[i] for i in all_index[fold]]

        surplus = len(labels) - fold_size
        if surplus > 0:
            # Too many samples: keep the first fold_size, park the rest.
            spare_texts.extend(texts[fold_size:])
            spare_labels.extend(labels[fold_size:])
            texts = texts[:fold_size]
            labels = labels[:fold_size]
        elif surplus < 0:
            # Too few samples: top up from the spare pool.
            spare_end = spare_start - surplus
            texts = texts + spare_texts[spare_start:spare_end]
            labels = labels + spare_labels[spare_start:spare_end]
            spare_start = spare_end

        assert fold_size == len(labels)

        # Shuffle inside the fold so samples are no longer grouped by label.
        order = list(range(fold_size))
        np.random.shuffle(order)
        fold_data.append({
            'label': [labels[i] for i in order],
            'text': [texts[i] for i in order],
        })

    return fold_data


In [69]:
# fold_data: a list of dicts, one per fold, each holding 'label' and 'text' lists
fold_data = all_data2fold(fold_num=10)
print(fold_data[0])

{'label': ['4', '1', '5', '7', '0', '8', '3', '1', '1', '0', '9', '1', '0', '2', '0', '5', '5', '2', '2', '8', '10', '2', '8', '2', '1', '0', '0', '11', '0', '3', '2', '3', '13', '0', '8', '1', '6', '5', '2', '3', '3', '6', '1', '0', '7', '0', '2', '1', '5', '0', '6', '0', '2', '1', '1', '0', '3', '1', '5', '2', '0', '1', '1', '11', '5', '3', '6', '9', '1', '9', '7', '1', '6', '1', '1', '2', '0', '10', '2', '0', '2', '8', '3', '2', '2', '7', '1', '0', '2', '10', '3', '1', '1', '3', '7', '0', '0', '3', '3', '1'], 'text': ['2042 5006 3568 3615 4218 3203 2695 5176 3598 2212 5537 4893 1613 1906 6802 2515 3530 5505 1375 5393 2112 2400 7495 4559 6065 2073 3659 4646 4853 1407 2465 2397 5778 6714 5822 803 2400 6115 2205 1635 3530 6508 3750 6714 5822 7495 5284 6357 5192 7451 3686 5858 5949 3067 6695 3641 2042 5006 3568 3615 4218 3203 2695 3750 3659 1324 4853 2058 134 7495 5284 6357 1388 2791 5689 4553 7417 3764 2376 3598 900 4553 7417 3764 6832 2376 3750 1465 669 1815 1879 7492 2446 803 1362 15

### 4.1.2 拆分训练、验证

In [77]:
# Hold one fold out for validation and train on all the remaining folds.
valid_fold_id = 9
valid_data = fold_data[valid_fold_id]

train_data = {'label': [], 'text': []}
for fold_id in range(len(fold_data)):
    # Skip only the validation fold, so folds that come after it are kept
    # too when valid_fold_id is not the last fold.
    if fold_id == valid_fold_id:
        continue
    train_data['label'].extend(fold_data[fold_id]['label'])
    train_data['text'].extend(fold_data[fold_id]['text'])

print(len(train_data['text']))

900


In [78]:
# build vocab
from collections import Counter
from transformers import BasicTokenizer

# Module-level BasicTokenizer instance (imported from transformers above);
# presumably consumed by later cells -- TODO confirm.
basic_tokenizer = BasicTokenizer()


class Vocab():
    """Lookup tables between words / labels and integer ids.

    1. Word tables.  Two vocabularies are kept:
       * ``_id2word`` is built from the training texts; words seen fewer than
         ``min_count`` (5) times are left out and therefore resolve to [UNK].
         (Original author's note: feeds the model input ``batch_inputs1``.)
       * ``_id2extword`` is filled by ``load_pretrained_embs`` from a
         pretrained embedding file.  (Original author's note: word2vec.txt,
         5976 words; feeds the model input ``batch_inputs2``.)
       Original author's note: the model has two embedding layers; the one for
       ``_id2word`` is trainable, the one for ``_id2extword`` is loaded from
       the file and kept fixed.
    2. Label table: ``_id2label`` / ``_label2id`` and the readable
       ``target_names``.

    Ids 0 and 1 are reserved for [PAD] and [UNK] in both word vocabularies.
    """

    def __init__(self, train_data):
        """Build the word and label tables from ``train_data``.

        Args:
            train_data: Dict with a 'text' list (whitespace-separated tokens)
                and a parallel 'label' list (integer-like labels, int or str).
        """
        self.min_count = 5
        self.pad = 0
        self.unk = 1
        self._id2word = ['[PAD]', '[UNK]']
        self._id2extword = ['[PAD]', '[UNK]']

        self._id2label = []
        self.target_names = []

        self.build_vocab(train_data)

        # word -> id table
        self._word2id = self._index_of(self._id2word)
        # label -> id table
        self._label2id = self._index_of(self._id2label)
        # Defined up front so extword2id() is usable (everything unknown maps
        # to [UNK]) before load_pretrained_embs() has been called.
        self._extword2id = self._index_of(self._id2extword)

        print("Build vocab: words {}, labels {}.".format(self.word_size, self.label_size))

    @staticmethod
    def _index_of(items):
        """Return a dict mapping each item of ``items`` to its position."""
        return {item: idx for idx, item in enumerate(items)}

    # Build the vocabularies
    def build_vocab(self, data):
        """Count words and labels in ``data`` and fill the id tables."""
        self.word_counter = Counter()
        # Count how often every word occurs
        for text in data['text']:
            self.word_counter.update(text.split())
        # Keep only words seen at least min_count times, most frequent first
        for word, count in self.word_counter.most_common():
            if count >= self.min_count:
                self._id2word.append(word)

        label2name = {0: '科技', 1: '股票', 2: '体育', 3: '娱乐', 4: '时政', 5: '社会', 6: '教育', 7: '财经',
                      8: '家居', 9: '游戏', 10: '房产', 11: '时尚', 12: '彩票', 13: '星座'}

        self.label_counter = Counter(data['label'])

        # Register the labels that actually occur, in numeric order and in
        # their own type.  Iterating range(len(...)) instead would register
        # ints even when the data carries string labels (every later lookup
        # would then miss) and would pick wrong labels when one is absent.
        for label in sorted(self.label_counter, key=int):
            self._id2label.append(label)
            # Readable name for the label; unnamed labels fall back to their text
            self.target_names.append(label2name.get(int(label), str(label)))

    def load_pretrained_embs(self, embfile):
        """Load pretrained word vectors and (re)build the ext-word vocabulary.

        The file is read as text: the first line is ``<word count> <dim>``,
        every following non-blank line is ``<word> <v1> ... <v_dim>``.  The
        matrix is sized from the rows actually read, so the declared word
        count is not relied upon.

        Args:
            embfile: Path of the embedding file (UTF-8).

        Returns:
            ``np.ndarray`` of shape ``(2 + n_words, dim)``: row 0 ([PAD]) is
            zero, row 1 ([UNK]) is the mean of all word vectors, and the whole
            matrix is divided by its global standard deviation.

        Raises:
            ValueError: If the header is malformed, the file holds no vectors,
                or a word occurs more than once.
        """
        with open(embfile, encoding='utf-8') as f:
            header = f.readline().split()
            # Skip blank lines (e.g. a trailing newline) instead of crashing
            rows = [line.split() for line in f if line.strip()]

        if len(header) < 2:
            raise ValueError("embedding file must start with '<word count> <dim>'")
        embedding_dim = int(header[1])
        if not rows:
            raise ValueError("embedding file contains no word vectors")

        # Start from a fresh list so that calling this method again does not
        # append the same words a second time.
        id2extword = ['[PAD]', '[UNK]']
        offset = len(id2extword)
        embeddings = np.zeros((len(rows) + offset, embedding_dim))
        for index, values in enumerate(rows, start=offset):
            id2extword.append(values[0])  # first column: the word
            vector = np.array(values[1:], dtype='float64')  # remaining columns: its vector
            embeddings[self.unk] += vector
            embeddings[index] = vector

        # The [UNK] vector is the average of all word vectors
        embeddings[self.unk] = embeddings[self.unk] / len(rows)
        # Scale the whole table by its global standard deviation
        # (the original author left the rationale as an open question).
        embeddings = embeddings / np.std(embeddings)

        if len(set(id2extword)) != len(id2extword):
            raise ValueError("embedding file contains duplicate words")

        self._id2extword = id2extword
        self._extword2id = self._index_of(id2extword)

        return embeddings

    # Map word(s) to id(s)
    def word2id(self, xs):
        """Return the id of a word, or a list of ids for a list of words."""
        if isinstance(xs, list):
            return [self._word2id.get(x, self.unk) for x in xs]
        return self._word2id.get(xs, self.unk)

    # Map word(s) to ext id(s)
    def extword2id(self, xs):
        """Return the ext-vocabulary id of a word, or a list of ids for a list."""
        if isinstance(xs, list):
            return [self._extword2id.get(x, self.unk) for x in xs]
        return self._extword2id.get(xs, self.unk)

    # Map label(s) to id(s)
    def label2id(self, xs):
        """Return the id of a label, or a list of ids for a list of labels.

        NOTE(review): an unknown label falls back to ``self.unk`` (1), which
        is also a valid label id -- callers should only pass labels that were
        present in the training data.
        """
        if isinstance(xs, list):
            return [self._label2id.get(x, self.unk) for x in xs]
        return self._label2id.get(xs, self.unk)

    @property
    def word_size(self):
        """Number of entries in the word vocabulary (including [PAD]/[UNK])."""
        return len(self._id2word)

    @property
    def extword_size(self):
        """Number of entries in the ext-word vocabulary (including [PAD]/[UNK])."""
        return len(self._id2extword)

    @property
    def label_size(self):
        """Number of distinct labels."""
        return len(self._id2label)


# Build the word and label vocabularies from train_data.
vocab = Vocab(train_data)

Build vocab: words 2821, labels 14.


In [None]:
import torch
from torch import nn


class Attention(nn.Module):
    def __init__(self):
        super(Attention, self).__init__()
