In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras

  from ._conv import register_converters as _register_converters


In [4]:
import sys
from collections import Counter

In [5]:
import os

In [6]:
# Dataset file locations: every split lives directly under base_dir.
base_dir = '../../../dataset/cnews/'
train_dir, test_dir, val_dir, vocab_dir = (
    os.path.join(base_dir, file_name)
    for file_name in (
        'cnews.train.txt',
        'cnews.test.txt',
        'cnews.val.txt',
        'cnews.vocab.txt',
    )
)

In [7]:
# Utility functions
# Open a file as UTF-8 text
def open_file(filename, mode='r'):
    """Open ``filename`` with UTF-8 encoding and return the file object.

    Args:
        filename: Path of the file to open.
        mode: Mode string forwarded to the built-in ``open`` (default ``'r'``).

    Returns:
        The file object returned by ``open``; the caller is responsible for
        closing it (e.g. by using it in a ``with`` statement).
    """
    handle = open(filename, mode=mode, encoding='utf-8')
    return handle
# Read a labelled corpus file
def read_file(filename):
    """Read a tab-separated ``label<TAB>content`` file.

    Each line is stripped of surrounding whitespace and split on tabs. Lines
    that do not yield exactly two fields are skipped (best-effort parsing),
    instead of relying on a bare ``except`` that would also hide unrelated
    errors such as ``KeyboardInterrupt``.

    Args:
        filename: Path of the file to read (opened via ``open_file``).

    Returns:
        A ``(contents, labels)`` tuple of two parallel lists: ``contents[i]``
        is the list of characters of the i-th kept line's content and
        ``labels[i]`` is its label string.
    """
    contents, labels = [], []
    with open_file(filename) as f:
        for line in f:
            fields = line.strip().split('\t')
            # Malformed line (missing or extra tab): skip it, as before.
            if len(fields) != 2:
                continue
            label, content = fields
            if content:
                # Character-level tokenisation: one token per character.
                contents.append(list(content))
                labels.append(label)
    return contents, labels

In [8]:
# Build the vocabulary from the training set and store it on disk
def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build a character vocabulary from the training file and write it out.

    The vocabulary file contains one token per line: ``<PAD>`` first, followed
    by the most frequent characters of the training contents, for at most
    ``vocab_size`` lines in total.

    Args:
        train_dir: Path of the training file (read via ``read_file``).
        vocab_dir: Path of the vocabulary file to (over)write.
        vocab_size: Maximum number of entries, including ``<PAD>``.
    """
    data_train, _ = read_file(train_dir)

    # Count characters document by document; no need to materialise one
    # giant list holding every character of the corpus.
    counter = Counter()
    for content in data_train:
        counter.update(content)

    # Subtract 1 because <PAD> occupies index 0. Clamp at 0 so that a tiny
    # vocab_size yields a vocabulary holding only <PAD> instead of crashing.
    count_pairs = counter.most_common(max(vocab_size - 1, 0))

    # Prepend <PAD>, which is used to pad all texts to the same length.
    # The comprehension also copes with an empty corpus, where unpacking
    # zip(*count_pairs) would raise ValueError.
    words = ['<PAD>'] + [word for word, _count in count_pairs]

    # Use a context manager so the file is flushed and closed deterministically.
    with open_file(vocab_dir, mode='w') as f:
        f.write('\n'.join(words) + '\n')

In [9]:
# Read the vocabulary file
def read_vocab(vocab_dir):
    """Load the vocabulary file and build the token-to-id mapping.

    Args:
        vocab_dir: Path of the vocabulary file, one token per line
            (opened via ``open_file``).

    Returns:
        A ``(words, word_to_id)`` tuple: ``words`` is the list of tokens in
        file order and ``word_to_id`` maps each token to its line index. If a
        token appears more than once, the last index wins.
    """
    with open_file(vocab_dir) as f:
        # Strip only the line terminator. A plain strip() would turn tokens
        # that are themselves whitespace (e.g. ' ' or '\u3000') into '',
        # making those characters impossible to look up afterwards.
        words = [line.rstrip('\n') for line in f]
    word_to_id = {word: idx for idx, word in enumerate(words)}
    return words, word_to_id

In [13]:
# Read the category list; the categories are fixed
def read_category():
    """Return the fixed category names and their name-to-id mapping.

    Returns:
        A ``(categories, cat_to_id)`` tuple: ``categories`` is the list of
        category names and ``cat_to_id`` maps each name to its position in
        that list.
    """
    categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
    cat_to_id = {name: idx for idx, name in enumerate(categories)}
    return categories, cat_to_id

In [14]:
def to_words(content, words):
    '''
    Convert id-encoded content back into text.

    Each id in ``content`` is used as an index into ``words``; the looked-up
    tokens are concatenated, in order, into a single string.
    '''
    tokens = [words[idx] for idx in content]
    return ''.join(tokens)

In [15]:
def process_file(filename, word_to_id, cat_to_id, max_length=600):
    '''
    Convert a labelled text file into id-encoded model inputs.

    Args:
        filename: Path of the ``label<TAB>content`` file (read via ``read_file``).
        word_to_id: Mapping from token to integer id; tokens that are not in
            the mapping are dropped.
        cat_to_id: Mapping from label string to class index.
        max_length: Length every id sequence is padded/truncated to.

    Returns:
        ``(x_pad, y_pad)``: the padded id sequences and the one-hot encoded
        labels with ``len(cat_to_id)`` classes.

    Raises:
        KeyError: If the file contains a label that is missing from ``cat_to_id``.
    '''
    contents, labels = read_file(filename)

    # Out-of-vocabulary tokens are silently skipped.
    data_id = [
        [word_to_id[token] for token in content if token in word_to_id]
        for content in contents
    ]
    label_id = [cat_to_id[label] for label in labels]

    # Use Keras' pad_sequences (default padding/truncation settings) to bring
    # every text to a fixed length.
    x_pad = keras.preprocessing.sequence.pad_sequences(data_id, maxlen=max_length)
    # Labels must be one-hot encoded, not padded: pad_sequences has no
    # num_classes argument, so the original call raised a TypeError.
    y_pad = keras.utils.to_categorical(label_id, num_classes=len(cat_to_id))

    return x_pad, y_pad

In [16]:
def batch_iter(x, y, batch_size=64):
    '''
    Yield shuffled mini-batches of ``(x, y)``.

    The samples are shuffled once with a random permutation, then yielded in
    consecutive slices of ``batch_size`` items; the last batch may be smaller.
    Empty input yields no batches.

    Args:
        x: Indexable collection of samples; must support indexing with an
            integer index array (e.g. a NumPy array).
        y: Labels aligned with ``x``, indexable the same way.
        batch_size: Maximum number of samples per batch.

    Yields:
        ``(x_batch, y_batch)`` tuples drawn from the same shuffled order.
    '''
    data_len = len(x)
    # Integer ceiling division: exact for any size, and 0 for empty input
    # (the float-based formula produced one spurious empty batch there).
    num_batch = (data_len + batch_size - 1) // batch_size

    # One permutation shared by x and y keeps samples and labels aligned.
    indices = np.random.permutation(data_len)
    x_shuffle = x[indices]
    y_shuffle = y[indices]

    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min(start_id + batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]