In [1]:
# 所有需要导入的库

import re
import ast
import redis
import pandas as pd
import numpy as np
from pyecharts import Bar, Pie
from keras.utils.np_utils import to_categorical
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
# 从 redis 中读取数据, 生成 dataframe

cli = redis.Redis()
data = cli.hgetall('chinanews')
df = pd.DataFrame([ast.literal_eval(data[k]) for k in data])
print len(df)

2299879


In [3]:
# 查看分类

categories = df.groupby('category').size()
pie = Pie("分类")
pie.add("", categories.index.tolist(), categories.values.tolist(), is_label_show=True, is_legend_show=False)
pie

In [4]:
# 过滤一些比较少的类别, 以及相关性不强的类别

categories = df.groupby('category').size()
categories = categories[categories > 20000].index.tolist()
# 设置 map 方法
filted = df['category'].map(lambda x: x in categories)
# 然后应用到 dataframe 上
df_filted = df[filted]

df_filted = df_filted[ (df_filted['category'] != u'图片') & (df_filted['category'] != u'视频') & (df_filted['category'] != u'报摘')]
print len(df_filted)

2178902


In [5]:
categories = df_filted.groupby('category').size()
pie = Pie("分类")
pie.add("", categories.index.tolist(), categories.values.tolist(), is_label_show=True, is_legend_show=False)
pie

In [6]:
# 类别名称映射字典

catagories = df_filted.groupby('category').size().index.tolist()
catagory_dict = {}
int_catagory = {}
for i, k in enumerate(catagories):
    catagory_dict.update({k:i})
    int_catagory.update({i:k})

In [7]:
# 类别名称映射为数字

df_filted['c2id'] = df_filted['category'].apply(lambda x: catagory_dict[x])
df_filted.head()

Unnamed: 0,category,title,c2id
0,港澳,港股上升 创七年新高,15
1,文化,峨眉山天气原因关闭高山区 车辆最高驶至万年寺,12
2,I T,100个APP一起参与 苏宁开启818“社团式营销”,0
3,华人,外媒披露姚明或买NBA洛杉矶快船队 华人乐观其成,4
4,社会,浙江一对80岁夫妇连续17年打工助学,17


In [8]:
# 开始准备数据

prepared_data = df_filted[ ['title', 'c2id'] ]
prepared_data.head()

Unnamed: 0,title,c2id
0,港股上升 创七年新高,15
1,峨眉山天气原因关闭高山区 车辆最高驶至万年寺,12
2,100个APP一起参与 苏宁开启818“社团式营销”,0
3,外媒披露姚明或买NBA洛杉矶快船队 华人乐观其成,4
4,浙江一对80岁夫妇连续17年打工助学,17


In [9]:
# 句子分割成汉字列表处理

prepared_data['words'] = prepared_data['title'].apply(lambda x: re.findall('[\x80-\xff]{3}|[\w\W]', x))
prepared_data.head()

Unnamed: 0,title,c2id,words
0,港股上升 创七年新高,15,"[港, 股, 上, 升, , 创, 七, 年, 新, 高]"
1,峨眉山天气原因关闭高山区 车辆最高驶至万年寺,12,"[峨, 眉, 山, 天, 气, 原, 因, 关, 闭, 高, 山, 区, , 车, 辆, ..."
2,100个APP一起参与 苏宁开启818“社团式营销”,0,"[1, 0, 0, 个, A, P, P, 一, 起, 参, 与, , 苏, 宁, 开, ..."
3,外媒披露姚明或买NBA洛杉矶快船队 华人乐观其成,4,"[外, 媒, 披, 露, 姚, 明, 或, 买, N, B, A, 洛, 杉, 矶, 快, ..."
4,浙江一对80岁夫妇连续17年打工助学,17,"[浙, 江, 一, 对, 8, 0, 岁, 夫, 妇, 连, 续, 1, 7, 年, 打, ..."


In [10]:
# 生成汉字的映射字典

all_words = []
for w in prepared_data['words']:
    all_words.extend(w)
word_dict = pd.DataFrame(pd.Series(all_words).value_counts())
word_dict['id'] = list(range(1, len(word_dict)+1))
print len(word_dict)

6790


In [11]:
# 将汉字映射为数字, 并将序列截取为固定的长度, 执行时间比较久 

prepared_data['w2v'] = prepared_data['words'].apply(lambda x: list(word_dict['id'][x]))

maxlen = 25
prepared_data['w2v'] = list(sequence.pad_sequences(prepared_data['w2v'], maxlen=maxlen))

In [12]:
# 最终准备好的 dataframe

prepared_data.head()

Unnamed: 0,title,c2id,words,w2v
0,港股上升 创七年新高,15,"[港, 股, 上, 升, , 创, 七, 年, 新, 高]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,峨眉山天气原因关闭高山区 车辆最高驶至万年寺,12,"[峨, 眉, 山, 天, 气, 原, 因, 关, 闭, 高, 山, 区, , 车, 辆, ...","[0, 0, 0, 3114, 2193, 174, 93, 234, 385, 264, ..."
2,100个APP一起参与 苏宁开启818“社团式营销”,0,"[1, 0, 0, 个, A, P, P, 一, 起, 参, 与, , 苏, 宁, 开, ...","[3, 3, 127, 253, 204, 204, 13, 155, 508, 119, ..."
3,外媒披露姚明或买NBA洛杉矶快船队 华人乐观其成,4,"[外, 媒, 披, 露, 姚, 明, 或, 买, N, B, A, 洛, 杉, 矶, 快, ...","[0, 102, 203, 1579, 900, 1609, 152, 178, 481, ..."
4,浙江一对80岁夫妇连续17年打工助学,17,"[浙, 江, 一, 对, 8, 0, 岁, 夫, 妇, 连, 续, 1, 7, 年, 打, ...","[0, 0, 0, 0, 0, 0, 0, 659, 181, 13, 110, 48, 3..."


In [13]:
# 3:1 随机分割数据, 生成训练数据和测试数据

seed = 7
X = np.array(list(prepared_data['w2v']))
Y = np.array(list(prepared_data['c2id']))
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=seed)

In [14]:
# to_categorical 处理

y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

# 训练数据 shape

y_train.shape

(1634176, 22)

In [15]:
# 终于可以创建 model 了

model = Sequential()
model.add(Embedding(len(word_dict)+1, 256))
model.add(LSTM(256))
model.add(Dropout(0.5))
model.add(Dense(y_train.shape[1]))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# 或者使用 GRU

# model = Sequential()
# model.add(Embedding(len(word_dict)+1, 256))
# model.add(GRU(256))
# model.add(Dense(y_train.shape[1]))
# model.add(Activation('softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# 或者使用 BiLSTM + CNN

# embedding_size=128
# hidden_size=256

# model = Sequential()
# model.add(Embedding(input_dim=len(word_dict)+1, output_dim=128, input_length=25))
# model.add(Bidirectional(LSTM(256, return_sequences=True)))
# model.add(TimeDistributed(Dense(64)))
# model.add(Activation('softplus'))
# model.add(MaxPooling1D(5))
# model.add(Flatten())
# model.add(Dense(y_train.shape[1]))
# model.add(Activation('softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
# 训练

model.fit(x_train, y_train, batch_size=128, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f04c448ffd0>

In [17]:
# 测试

model.evaluate(x=x_test, y=y_test)



[0.96994108563143733, 0.68709773354161929]

In [18]:
# 保存模型和数据

model.save('model.hdf5')

import pickle
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

save_obj(int_catagory, 'int_catagory')
save_obj(catagory_dict, 'catagory_dict')
word_dict.to_csv('word_dict.csv', encoding='utf8')
prepared_data.to_csv('prepared_data.csv', encoding='utf8')