In [1]:
from pyhanlp import HanLP
import numpy as np
from tqdm import tqdm

# 读取原始数据集分词预处理 并保存词典
def read_toutiao_dataset(data_path, save_vocab_path):
    with open(data_path, "r", encoding="utf8") as fo:
        all_lines = fo.readlines()
    datas, labels = [], []
    word_vocabs = {}
    for line in tqdm(all_lines):
        content_words = []
        category, content = line.strip().split("_!_")
        for term in HanLP.segment(content):
            if term.word not in word_vocabs:
                word_vocabs[term.word] = len(word_vocabs)+1
            content_words.append(term.word)
        datas.append(content_words)
        labels.append(category)
    with open(save_vocab_path, "w", encoding="utf8") as fw:
        for word, index in word_vocabs.items():
            fw.write(word+"\n")
    return datas, labels

# 读取词典 生成词-索引对应关系, 其中special_words = ['<PAD>', '<UNK>']
def read_word_vocabs(save_vocab_path, special_words):
    with open(save_vocab_path, "r", encoding="utf8") as fo:
        word_vocabs = [word.strip() for word in fo]
    word_vocabs = special_words + word_vocabs
    idx2vocab = {idx: char for idx, char in enumerate(word_vocabs)}
    vocab2idx = {char: idx for idx, char in idx2vocab.items()}
    return idx2vocab, vocab2idx

# 把预处理过的数据索引化 即变成词编号序列
def process_dataset(datas, labels, category2idx, vocab2idx):
    new_datas, new_labels = [], []
    for data, label in zip(datas, labels):
        index_data = [vocab2idx[word] if word in vocab2idx else vocab2idx['<UNK>'] for word in data]
        index_label = category2idx[label]
        new_datas.append(index_data)
        new_labels.append(index_label)
    return new_datas, new_labels

In [2]:
data_path = "./data/toutiao_news_dataset.txt"
save_vocab_path = "./data/word_vocabs.txt"
special_words = ['<PAD>', '<UNK>']
category_lists = ["民生故事","文化","娱乐","体育","财经","房产","汽车","教育","科技","军事",
                "旅游","国际","证券股票","农业","电竞游戏"]

category2idx = {cate: idx for idx, cate in enumerate(category_lists)}
idx2category = {idx: cate for idx, cate in enumerate(category_lists)}

datas, labels = read_toutiao_dataset(data_path, save_vocab_path)
idx2vocab, vocab2idx = read_word_vocabs(save_vocab_path, special_words)
all_datas, all_labels = process_dataset(datas, labels, category2idx, vocab2idx)


100%|█████████████████████████████████| 382688/382688 [12:34<00:00, 507.49it/s]


In [3]:
print(datas[0], labels[0])
print(idx2vocab[0], idx2vocab[1], idx2vocab[5])
print(idx2category[0], idx2category[1], idx2category[2])

['京城', '最', '值得', '你', '来', '场', '文化', '之旅', '的', '博物馆'] 文化
<PAD> <UNK> 你
民生故事 文化 娱乐


In [None]:
import numpy
import keras
from keras import backend as K
from keras import activations
from keras.engine.topology import Layer
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.models import Model
from keras.layers import Input, Dense, Embedding, LSTM, Bidirectional
K.clear_session()

class AttentionLayer(Layer):
    def __init__(self, attention_size=None, **kwargs):
        self.attention_size = attention_size
        super(AttentionLayer, self).__init__(**kwargs)
        
    def get_config(self):
        config = super().get_config()
        config['attention_size'] = self.attention_size
        return config
        
    def build(self, input_shape):
        assert len(input_shape) == 3
        
        self.time_steps = input_shape[1]
        hidden_size = input_shape[2]
        if self.attention_size is None:
            self.attention_size = hidden_size
            
        self.W = self.add_weight(name='att_weight', shape=(hidden_size, self.attention_size),
                                initializer='uniform', trainable=True)
        self.b = self.add_weight(name='att_bias', shape=(self.attention_size,),
                                initializer='uniform', trainable=True)
        self.V = self.add_weight(name='att_var', shape=(self.attention_size,),
                                initializer='uniform', trainable=True)
        super(AttentionLayer, self).build(input_shape)
    
    def call(self, inputs):
        self.V = K.reshape(self.V, (-1, 1))
        H = K.tanh(K.dot(inputs, self.W) + self.b)
        score = K.softmax(K.dot(H, self.V), axis=1)
        outputs = K.sum(score * inputs, axis=1)
        return outputs
    
    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[2]
    
    
def create_classify_model(max_len, vocab_size, embedding_size, hidden_size, attention_size, class_nums):
    inputs = Input(shape=(max_len,), dtype='int32')
    x = Embedding(vocab_size, embedding_size)(inputs)
    x = Bidirectional(LSTM(hidden_size, dropout=0.2, return_sequences=True))(x)
    x = AttentionLayer(attention_size=attention_size)(x)
    outputs = Dense(class_nums, activation='softmax')(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.summary() # 输出模型结构和参数数量
    return model

MAX_LEN = 30
VOCAB_SIZE = len(vocab2idx)
EMBEDDING_SIZE = 100
HIDDEN_SIZE = 64
ATT_SIZE = 50
CLASS_NUMS = len(category2idx)
BATCH_SIZE = 64
EPOCHS = 20

all_datas = all_datas[:10000]
all_labels = all_labels[:10000]

count = len(all_labels)
rate1, rate2 = 0.8, 0.9 # train-0.8, test-0.1, dev-0.1
# padding the data
new_datas = sequence.pad_sequences(all_datas, maxlen=MAX_LEN)
new_labels = keras.utils.to_categorical(all_labels, CLASS_NUMS)
# split all data to train, test and dev
x_train, y_train = new_datas[:int(count*rate1)], new_labels[:int(count*rate1)]
x_test, y_test = new_datas[int(count*rate1):int(count*rate2)], new_labels[int(count*rate1):int(count*rate2)]
x_dev, y_dev = new_datas[int(count*rate2):], new_labels[int(count*rate2):]

# create model
model = create_classify_model(MAX_LEN, VOCAB_SIZE, EMBEDDING_SIZE, HIDDEN_SIZE, ATT_SIZE, CLASS_NUMS)
# loss and optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# train model
model.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(x_test, y_test))
# test model 

In [None]:
score, acc = model.evaluate(x_dev, y_dev, batch_size=BATCH_SIZE)
print('Test score:', score)
print('Test accuracy:', acc)
# save model
model.save("./model/news_classify_model.h5")

In [15]:
import numpy as np
from pyhanlp import HanLP
np.set_printoptions(suppress=True)

maxlen = 30
content = "网友数博会：大数据与实体经济融合发展，贵州焕发新动力"
content_words = [term.word for term in HanLP.segment(content)]
sent2id = [vocab2idx[word] if word in vocab2idx else vocab2idx['<UNK>'] for word in content_words]
sent2id_new = np.array([sent2id[:maxlen] + [0] * (maxlen-len(sent2id))])

y_pred = model.predict(sent2id_new)
print(y_pred)
y_label = np.argmax(y_pred[0])
print(y_label, idx2category[y_label])

[[0.00109552 0.00387199 0.21370623 0.00758541 0.5126246  0.00125148
  0.00908471 0.01694475 0.01339941 0.00030947 0.00917827 0.00754309
  0.08456795 0.1168716  0.00196562]]
4 财经


In [25]:
from keras.models import load_model
import numpy as np
from pyhanlp import HanLP
np.set_printoptions(suppress=True)

save_vocab_path = "./data/word_vocabs.txt"
model_path = "./model/news_classify_model.h5"
special_words = ['<PAD>', '<UNK>']
category_lists = ["民生故事","文化","娱乐","体育","财经","房产","汽车","教育","科技","军事",
                "旅游","国际","证券股票","农业","电竞游戏"]
maxlen = 30
ATT_SIZE = 50

category2idx = {cate: idx for idx, cate in enumerate(category_lists)}
idx2category = {idx: cate for idx, cate in enumerate(category_lists)}
idx2vocab, vocab2idx = read_word_vocabs(save_vocab_path, special_words)

content = "网友数博会：大数据与实体经济融合发展，贵州焕发新动力"
content_words = [term.word for term in HanLP.segment(content)]
sent2id = [vocab2idx[word] if word in vocab2idx else vocab2idx['<UNK>'] for word in content_words]
sent2id_new = np.array([sent2id[:maxlen] + [0] * (maxlen-len(sent2id))])

model = load_model(model_path, custom_objects={'AttentionLayer': AttentionLayer(ATT_SIZE)}, compile=False)
y_pred = model.predict(sent2id_new)
print(y_pred)
result = {}
for idx, pred in enumerate(y_pred[0]):
    result[idx2category[idx]] = pred
result_sorted = sorted(result.items(), key=lambda item: item[1], reverse=True)
print(result_sorted)
# y_label = np.argmax(y_pred[0])
# print(y_label, idx2category[y_label])


[[0.00109552 0.00387199 0.21370623 0.00758541 0.5126246  0.00125148
  0.00908471 0.01694475 0.01339941 0.00030947 0.00917827 0.00754309
  0.08456795 0.1168716  0.00196562]]
[('财经', 0.5126246), ('娱乐', 0.21370623), ('农业', 0.1168716), ('证券股票', 0.08456795), ('教育', 0.016944751), ('科技', 0.01339941), ('旅游', 0.00917827), ('汽车', 0.009084707), ('体育', 0.0075854124), ('国际', 0.007543085), ('文化', 0.0038719943), ('电竞游戏', 0.0019656185), ('房产', 0.0012514826), ('民生故事', 0.0010955166), ('军事', 0.0003094678)]
