In [1]:
import jieba
import pandas as pd
# Read each category's CSV as UTF-8 and discard rows holding any missing value.
df_technology = pd.read_csv("data/technology_news.csv", encoding='utf-8').dropna()
df_car = pd.read_csv("data/car_news.csv", encoding='utf-8').dropna()
df_entertainment = pd.read_csv("data/entertainment_news.csv", encoding='utf-8').dropna()
df_military = pd.read_csv("data/military_news.csv", encoding='utf-8').dropna()
df_sports = pd.read_csv("data/sports_news.csv", encoding='utf-8').dropna()

# At most 20000 articles are taken per category from the `content` column.
# technology and car skip their first 1000 rows; the other three start at row 0.
_after_first_1000 = slice(1000, 21000)
_first_20000 = slice(None, 20000)

technology = df_technology["content"].values.tolist()[_after_first_1000]
car = df_car["content"].values.tolist()[_after_first_1000]
entertainment = df_entertainment["content"].values.tolist()[_first_20000]
military = df_military["content"].values.tolist()[_first_20000]
sports = df_sports["content"].values.tolist()[_first_20000]

In [2]:
# Stop-word list, one word per line, exposed as a NumPy array.
# quoting=3 is csv.QUOTE_NONE, so quote characters in the file are kept literally.
stopwords = pd.read_csv(
    "data/stopwords.txt",
    sep="\t",
    names=['stopword'],
    index_col=False,
    quoting=3,
    encoding='utf-8',
)['stopword'].values

In [3]:
def preprocess_text(content_lines, sentences, category):
    """Tokenize each document with jieba and collect it for training.

    Every entry of ``content_lines`` is segmented with ``jieba.lcut``. Tokens
    of length 1 and tokens present in the module-level ``stopwords``
    collection are discarded.

    Args:
        content_lines: iterable of raw document strings.
        sentences: list extended in place with
            ``(space-joined tokens, category)`` tuples, one per document.
        category: label stored alongside every document of this call.

    Side effects:
        Appends each document's token list to the module-level ``data`` list,
        which must already exist when this function is called.
    """
    # Build the lookup once per call: set membership is a hash lookup, whereas
    # `in` on the stop-word array rescans the whole array for every token.
    stopword_set = set(stopwords)
    for line in content_lines:
        segs = [seg for seg in jieba.lcut(line)
                if len(seg) > 1 and seg not in stopword_set]
        sentences.append((" ".join(segs), category))
        data.append(segs)

# Build the training data.
# `sentences` collects (text, label) pairs; `data` collects the token lists.
sentences, data = [], []

# Only the car and military corpora are processed in this run; the calls for
# technology, entertainment and sports are currently disabled.
for corpus, label in ((car, 'car'), (military, 'military')):
    preprocess_text(corpus, sentences, label)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\LL\AppData\Local\Temp\jieba.cache
Loading model cost 0.804 seconds.
Prefix dict has been built succesfully.


### 上面data表示的是标准的Word2Vec的输入

In [4]:
from sklearn.model_selection import train_test_split
# Separate the (text, label) pairs into two parallel tuples, then hold out a
# test split; random_state pins the shuffle so the split is repeatable.
x, y = zip(*sentences)
split_parts = train_test_split(x, y, random_state=1234)
train_data, test_data, train_target, test_target = split_parts

In [5]:
# Integer id assigned to each category label.
cate_dic = {'car': 1, 'military': 2}

# Encode the string labels as ids, then wrap them as pandas Series.
train_target = [cate_dic[label] for label in train_target]
test_target = [cate_dic[label] for label in test_target]
y_train, y_test = pd.Series(train_target), pd.Series(test_target)

### 先利用gensim的Word2Vec得到词向量

In [6]:
from gensim.models.word2vec import Word2Vec  #导入Word2vec

Using TensorFlow backend.


In [8]:
# Word2Vec hyper-parameters: vector size, context window, minimum token
# frequency, and number of worker threads.
size, window, min_count, worker = 50, 5, 4, 1

In [9]:
# Train word vectors on the token lists collected in `data`, using the
# hyper-parameters (size, window, min_count, worker) defined in the cell above.
model = Word2Vec(data, size=size, window=window, min_count=min_count, workers=worker)

In [10]:
# Nearest neighbours of '上线' by vector similarity. The query goes through the
# model's KeyedVectors (`model.wv`); calling `most_similar` directly on the
# Word2Vec object is deprecated in gensim.
model.wv.most_similar('上线')

[('直播', 0.984207034111023),
 ('在线', 0.9761182069778442),
 ('宝驾', 0.9708111882209778),
 ('绿狗', 0.9697870016098022),
 ('友友', 0.9652999639511108),
 ('曹操', 0.9648972749710083),
 ('网约', 0.9624991416931152),
 ('许可证', 0.9624050259590149),
 ('涉密', 0.9596567153930664),
 ('咨询', 0.958770751953125)]

### 得到CNN的输入向量

In [11]:
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
from keras.layers import Activation
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM

### padding一下

In [12]:
# Each document is represented by its first `padding_size` tokens (50 by
# default); shorter documents are padded with all-zero vectors.
# vec_size is the dimensionality of a single word vector.
def transform_to_matrix(x, padding_size=50, vec_size=50, w2v=None):
    """Turn documents into fixed-size matrices of word vectors.

    Args:
        x: iterable of documents, each a string of whitespace-separated tokens
            (the text was already segmented, so a plain split() is enough).
        padding_size: number of token positions (rows) kept per document.
        vec_size: length of the zero vector used for padding and for tokens
            without a vector; should match the word-vector dimensionality.
        w2v: optional word -> vector lookup. Anything indexable by token whose
            values expose ``.tolist()`` works. When omitted, the module-level
            ``model`` is used.

    Returns:
        A list holding one matrix per document; each matrix is a list of
        ``padding_size`` rows.

    Raises:
        Any error raised by the lookup other than ``KeyError`` (for example a
        ``TypeError`` when the lookup object is not an embedding at all).
        Such errors used to be swallowed and replaced by zero vectors.
    """
    vectors = model if w2v is None else w2v
    # A gensim Word2Vec model keeps its vectors in `.wv`; index those directly
    # instead of going through the model's deprecated item access.
    vectors = getattr(vectors, "wv", vectors)

    res = []
    for sen in x:
        tokens = sen.split()
        matrix = []
        for i in range(padding_size):
            row = None
            if i < len(tokens):
                try:
                    row = vectors[tokens[i]].tolist()
                except KeyError:
                    # Token has no vector (e.g. below min_count): zero-fill.
                    row = None
            # Positions past the end of the document and unknown tokens both
            # get a fresh all-zero vector.
            matrix.append([0] * vec_size if row is None else row)
        res.append(matrix)
    return res

### 处理输入数据

In [13]:
# Convert the held-out documents into padded word-vector matrices.
test_x = transform_to_matrix(test_data)

In [15]:
# Convert the training documents into padded word-vector matrices.
train_x = transform_to_matrix(train_data)

In [16]:
# Convert the nested lists to NumPy arrays so they are easier to work with
train_x, test_x = np.array(train_x), np.array(test_x)

# Inspect the array shapes
for arr in (train_x, test_x):
    print(arr.shape)

(20696, 50, 50)
(6899, 50, 50)


In [20]:
# Append a trailing axis of length 1 to both arrays.
X_train, X_test = (
    arr.reshape(arr.shape[:3] + (1,)) for arr in (train_x, test_x)
)

for reshaped in (X_train, X_test):
    print(reshaped.shape)

(20696, 50, 50, 1)
(6899, 50, 50, 1)


In [25]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers.core import Dense, Dropout, Activation, Flatten

# set parameters:
batch_size = 32
n_filter = 16
filter_length = 3
nb_epoch = 5
n_pool = 2

# The targets are the integer class ids stored in `cate_dic`. The output layer
# gets one unit per id in 0..max(id), so those ids can be fed directly as
# sparse targets without re-encoding.
n_classes = max(cate_dic.values()) + 1

# Create a Sequential model.
# NOTE: this rebinds `model`, which previously held the Word2Vec model; after
# this cell, code that looks words up through `model` would hit the CNN.
model = Sequential()
model.add(Conv2D(n_filter, (filter_length, filter_length),
                 input_shape=(50, 50, 1)))
model.add(Activation('relu'))
model.add(Conv2D(n_filter, (filter_length, filter_length)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(n_pool, n_pool)))
model.add(Dropout(0.25))
model.add(Flatten())
# Fully connected classifier head
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))
# A softmax over a single unit always outputs 1.0 and cannot learn, so the
# output layer has one unit per class id instead.
model.add(Dense(n_classes))
model.add(Activation('softmax'))
# Compile the model; sparse categorical cross-entropy pairs a softmax output
# with integer class-id targets.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
# Train the network on the training tensors for 3 epochs with mini-batches of 32.
model.fit(X_train, y_train, epochs=3, batch_size=32)

Epoch 1/3