## 卷积神经网络用于自然语言处理示例

In [1]:
# 示例代码运行环境
%load_ext watermark
%watermark -p tensorflow,numpy -v -m

CPython 2.7.6
IPython 5.1.0

tensorflow 1.0.1
numpy 1.12.1

compiler   : GCC 4.8.4
system     : Linux
release    : 4.4.0-21-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 4
interpreter: 64bit


In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import time
from collections import Counter
import tensorflow as tf
import jieba

In [21]:
!head -2 train_shuffle.txt

������xp������������������������������������������������������������	1
������������,������������,������������������������,������������������.������������.	1


In [22]:
!ls -lh train_shuffle.txt

-rw-rw-r-- 1 1000 1000 5.1M Apr 13 06:28 train_shuffle.txt


In [19]:
!wc train_shuffle.txt

  24586   42112 5264999 train_shuffle.txt


In [24]:
!du -hs train_shuffle.txt

5.1M	train_shuffle.txt


In [26]:
!ls train_shuffle.txt | head -2

train_shuffle.txt


In [27]:
!ls train_shuffle.txt | wc -l

1


### 处理语料

In [2]:
IGNORE = ' \n' # characters to drop: a token is skipped when it is found inside this string
DOC_LENGTH = 50 # fixed number of tokens per sentence (shorter ones are padded, longer ones truncated)
PADDING = '<PD>' # placeholder token appended to sentences shorter than DOC_LENGTH

In [3]:
def read_docs_and_labels(file):
    """Read labelled sentences from a tab-separated, UTF-8 encoded text file.

    Every non-blank line is expected to hold the sentence text, a tab, and an
    integer label. The text is segmented with ``jieba.cut``; tokens found in
    ``IGNORE`` and purely numeric tokens are dropped.

    Args:
        file: Path of the corpus file.

    Returns:
        A ``(docs, labels)`` tuple: ``docs`` is a list of token lists and
        ``labels`` the matching list of integer labels, both in file order.

    Raises:
        ValueError: If a non-blank line contains no tab or its label is not
            an integer.
    """
    import io  # local import: io.open decodes the same way on Python 2 and 3

    docs, labels = [], []
    # Stream the file line by line instead of loading it all with readlines().
    with io.open(file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) carry no sample.
            if not line.strip():
                continue
            # Split on the LAST tab so that tabs inside the sentence text do
            # not break the (text, label) unpacking.
            text, label = line.rsplit('\t', 1)

            words_in_doc = [
                word for word in jieba.cut(text)
                if (word not in IGNORE) and (not word.isdigit())
            ]
            docs.append(words_in_doc)
            labels.append(int(label.strip()))
    return docs, labels

In [9]:
def fix_doc_length(docs, doc_length=None, padding=None):
    """Pad or truncate every token list in ``docs`` to a fixed length.

    ``docs`` is modified in place: lists that are too short are extended with
    the padding token, all others are replaced by a slice of the first
    ``doc_length`` tokens. The same outer list is returned for convenience.

    Args:
        docs: List of token lists.
        doc_length: Target length; defaults to the module-level ``DOC_LENGTH``.
        padding: Filler token; defaults to the module-level ``PADDING``.

    Returns:
        ``docs`` itself, with every element having exactly ``doc_length`` items.
    """
    # Resolve defaults lazily so the globals are only needed when not overridden.
    if doc_length is None:
        doc_length = DOC_LENGTH
    if padding is None:
        padding = PADDING

    for i, doc in enumerate(docs):
        missing = doc_length - len(doc)
        if missing > 0:
            doc.extend([padding] * missing)
        else:
            docs[i] = doc[:doc_length]
    return docs

In [28]:
def get_word_counter(docs):
    """Return a Counter with the number of occurrences of every word in ``docs``.

    ``docs`` is an iterable of token lists; counts are accumulated over all of them.
    """
    counts = Counter()
    for doc in docs:
        counts.update(doc)
    return counts

In [31]:
def build_vocab(word_cnt, limit=3):
    """Build the vocabulary list from a word Counter.

    The list starts with 'UNK' and continues with every word whose count is
    strictly greater than ``limit`` (3 by default), most frequent first.
    """
    # most_common() is sorted by descending count, so filtering keeps exactly
    # the leading run of sufficiently frequent words.
    frequent_words = [word for word, count in word_cnt.most_common() if count > limit]
    return ['UNK'] + frequent_words

In [46]:
def docs2idxes(docs, word2idx=None):
    """Convert a list of token lists into a list of index lists.

    Args:
        docs: List of token lists.
        word2idx: Mapping from word to integer index; defaults to the
            module-level ``idx_dict``.

    Returns:
        A new list with one list of indices per document. Words missing from
        the mapping are encoded as 0, the position of 'UNK' in the vocabulary.
    """
    if word2idx is None:
        word2idx = idx_dict
    # A single dict lookup per token replaces the former linear scan of the
    # vocabulary list (``word in vocab``) followed by a second lookup.
    return [[word2idx.get(word, 0) for word in doc] for doc in docs]

In [6]:
# Load the tokenised sentences and integer labels of the training and test corpora.
train_docs, train_labels = read_docs_and_labels('./train_shuffle.txt')
test_docs, test_labels = read_docs_and_labels('./test_shuffle.txt')

In [10]:
# Pad or truncate every sentence to DOC_LENGTH tokens.
# Note: fix_doc_length also modifies the passed lists in place.
train_docs = fix_doc_length(train_docs)
test_docs = fix_doc_length(test_docs)

In [29]:
# Word frequencies are counted on the training set only (padding tokens included).
word_cnt = get_word_counter(train_docs)

In [32]:
# Vocabulary: 'UNK' followed by every word seen more than 3 times (the default limit).
vocab = build_vocab(word_cnt)

In [35]:
# Vocabulary size; also used as the number of rows of the embedding matrix.
vocab_size = len(vocab)

In [45]:
# Map every vocabulary word to its integer index.
idx_dict = {word: idx for idx, word in zip(range(vocab_size), vocab)}

In [41]:
# Show the 20 most frequent tokens together with their counts.
for word, freq in word_cnt.most_common(20):
    print('%s %s' % (word, freq))

<PD> 475822
， 62594
的 41658
。 22766
了 15991
是 10897
, 10705
我 8824
很 8072
也 6014
酒店 5796
！ 5677
在 5334
不 5157
. 4714
都 4680
有 4594
就 4288
房间 4206
没有 4067


In [47]:
# Encode the training sentences as lists of vocabulary indices.
train_idxes = docs2idxes(train_docs)

In [49]:
# Convert the training indices and labels to numpy arrays for batching and feeding.
inputs_train = np.asarray(train_idxes)
labels_train = np.asarray(train_labels)

In [50]:
# Same encoding for the test set; it reuses the vocabulary built from the training data.
test_idxes = docs2idxes(test_docs)
inputs_test = np.asarray(test_idxes)
labels_test = np.asarray(test_labels)

In [51]:
# Sanity check: (number of training sentences, DOC_LENGTH).
inputs_train.shape

(24586, 50)

In [52]:
# Sanity check: (number of test sentences, DOC_LENGTH).
inputs_test.shape

(10538, 50)

### 模型

In [54]:
# vocab_size is derived from the corpus, so this hard-coded value stays disabled:
#vocab_size = 80000
word_embed_size = 128  # dimensionality of each word vector
# filter_num and window_size are assigned inside the "conv-maxpool" scope instead:
#filter_num = 30
#window_size = 3

In [55]:
# Start from an empty default graph so re-running the model cells does not pile up duplicate ops.
tf.reset_default_graph()

随机生成词向量

In [56]:
# Embedding matrix: one row of word_embed_size values per vocabulary entry,
# initialised uniformly at random between -1.0 and 1.0.
W = tf.Variable(
                tf.random_uniform([vocab_size, word_embed_size], -1.0, 1.0),
                name="W")

In [57]:
# Placeholders: a batch of sentences as word indices (batch, DOC_LENGTH)
# and one integer class label per sentence (batch,).
inputs = tf.placeholder(tf.int32, shape=[None, DOC_LENGTH], name='inputs')
labels = tf.placeholder(tf.int32, shape=[None], name='labels')

根据句子的 ID 查找词向量

In [58]:
# Replace every word index by the corresponding row of the embedding matrix.
embeds = tf.nn.embedding_lookup(W, inputs)

In [60]:
# Inspect the embedding shape: (batch, DOC_LENGTH, word_embed_size).
# The batch dimension is unspecified, so any number of samples can be fed.
print(embeds)

Tensor("embedding_lookup:0", shape=(?, 50, 128), dtype=float32)


In [61]:
# See the expand_dims API documentation. A trailing dimension of length 1
# (in_channel) is appended so the tensor matches the 4-D input expected by conv2d.
embeds_expand = tf.expand_dims(embeds, -1)

In [63]:
# Shape after adding the channel dimension.
print(embeds_expand)

Tensor("ExpandDims:0", shape=(?, 50, 128, 1), dtype=float32)


In [64]:
# Convolution over word windows followed by max pooling over the whole sentence.
with tf.name_scope("conv-maxpool"):
    filter_num = 64
    window_size = 3
    # Each filter covers window_size consecutive words and the full embedding width.
    filter_shape = [window_size, word_embed_size, 1, filter_num]
    # Convolution weights. They are bound to conv_W rather than W so that the
    # embedding matrix W is not clobbered: re-running the embedding lookup
    # cell afterwards would otherwise pick up this 4-D filter. The graph name
    # of the variable ("W" inside this scope) is unchanged.
    conv_W = tf.Variable(tf.random_uniform(filter_shape, -1.0, 1.0), name="W")
    # One bias per filter.
    b = tf.Variable(tf.constant(0.0, shape=[filter_num]), name="b")
    # Stride 1 and no padding ("VALID"), so sentences shorter than the window
    # could not be processed. Adding padding is left as an exercise (it does
    # not affect the assignment grade).
    conv = tf.nn.conv2d(
                    embeds_expand,
                    conv_W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
    # Add the bias to the convolution result and apply the non-linearity.
    conv_hidden = tf.nn.tanh(tf.add(conv, b), name="tanh")

    # Without padding the convolution yields DOC_LENGTH - window_size + 1
    # positions; pooling over all of them keeps one value per filter. This
    # ksize must be adapted if padding is added to the convolution.
    pool = tf.nn.max_pool(
                    conv_hidden,
                    ksize=[1, DOC_LENGTH - window_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")

目前 tensorflow 还不支持动态 max_pool size，所以 ksize 只能用常数固定。

由于不同句子的 sequence_length 不一样，这里目前还没法处理变长句子。
    
一个解决方案是用人工 Padding 的方式，根据语料中最长的句子的长度来扩展所有句子，归一化到统一的长度。即所有句子都通过 Padding 一个特殊符号的方式，扩展为固定长度。

**注意这个是 Tensorflow 目前的限制**，用其他一些支持动态 max_pool 的库不需要 padding。事实上这也会造成计算量的浪费。

鼓励大家多看中间结果的维度，加深理解

In [68]:
# Inspect the tensor shapes after the convolution, the activation and the pooling.
print(conv)
print(conv_hidden)
print(pool)

Tensor("conv-maxpool/conv:0", shape=(?, 48, 1, 64), dtype=float32)
Tensor("conv-maxpool/tanh:0", shape=(?, 48, 1, 64), dtype=float32)
Tensor("conv-maxpool/pool:0", shape=(?, 1, 1, 64), dtype=float32)


卷积 + max pooling 之后的结果可以再接 dense layer（全连接层）

根据这个框架改成符合作业要求的脚本，用于情感分类

In [72]:
# Drop the two singleton dimensions left by pooling: (batch, 1, 1, filters) -> (batch, filters).
squeezed_pool = tf.squeeze(pool, [1, 2]) 
# Optional hidden fully connected layer, currently disabled:
# fc = tf.layers.dense(pool, num_fc_hidden, activation=tf.nn.tanh)
# Two output units, one logit per class.
raw_output = tf.layers.dense(squeezed_pool, 2)
# Class probabilities, used for prediction; the loss works on the raw logits.
output = tf.nn.softmax(raw_output)

In [74]:
# Expected shape: (batch, 2), one probability per class.
output.shape

TensorShape([Dimension(None), Dimension(2)])

In [76]:
# Mean cross-entropy over the batch. The sparse variant takes integer class
# labels and the unnormalised logits (not the softmax output).
cost = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=raw_output, labels=labels))

### 训练

In [77]:
def evaluate_model(sess, inputs_, labels_, print_matrix=False):
    """Compute and print precision and recall of the positive class (label 1).

    Args:
        sess: TensorFlow session holding the trained variables.
        inputs_: Array of word indices fed to the ``inputs`` placeholder.
        labels_: True labels, one per row of ``inputs_``; assumed to be 0 or 1.
        print_matrix: When true, also print the 2x2 confusion matrix
            (rows are true labels, columns are predictions).

    Prints the metrics and returns nothing. A metric whose denominator is
    zero is reported as nan.
    """
    # The class probabilities depend on the inputs only, so labels are not fed.
    pred_prob = sess.run(output, feed_dict={inputs: inputs_})
    preds = np.asarray((pred_prob[:, 1] > 0.5), dtype=int)
    truth = np.asarray(labels_)

    # Count the four outcomes with numpy. Building tf.confusion_matrix here
    # would add a new op to the graph on every call, and its shape would
    # shrink when a class is absent from the data.
    tn = int(np.sum((truth == 0) & (preds == 0)))
    fp = int(np.sum((truth == 0) & (preds == 1)))
    fn = int(np.sum((truth == 1) & (preds == 0)))
    tp = int(np.sum((truth == 1) & (preds == 1)))
    mat = np.array([[tn, fp], [fn, tp]])

    # Guard the divisions: no predicted (or no actual) positives gives nan
    # instead of a numpy division warning.
    precision = float(tp) / (tp + fp) if (tp + fp) else float('nan')
    recall = float(tp) / (tp + fn) if (tp + fn) else float('nan')
    if print_matrix:
        # A single formatted string prints correctly on Python 2 as well,
        # where print('a', b) would display a tuple.
        print('confusion matrix:\n%s' % (mat,))
    print('precision:% .3f, recall: %.3f' %(precision, recall))

In [84]:
# Training hyper-parameters.
learning_rate = 1
batch_size = 100
epochs = 3000
print_cost_every = 100  # report cost and metrics every this many epochs

# Full-dataset feeds, used only for the periodic cost reports.
feed_train = {inputs: inputs_train, labels: labels_train}
feed_test = {inputs: inputs_test, labels: labels_test}

# Create the training op BEFORE initialising variables, so that any variable
# an optimizer might create is covered by the initializer as well.
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

costs_train = []
costs_test = []
start_time = time.time()

num_inputs = len(labels_train)
order = np.arange(num_inputs)

try:
    for i in range(epochs):
        if i % print_cost_every == 0:
            cost_train = sess.run(cost, feed_dict=feed_train)
            cost_test = sess.run(cost, feed_dict=feed_test)
            print('Epoch %d cost: train: %s / test: %s' %(i, cost_train, cost_test))
            costs_train.append(cost_train)
            costs_test.append(cost_test)
            evaluate_model(sess, inputs_test, labels_test)
        # Reshuffle at the start of every epoch so mini-batches differ between
        # epochs instead of replaying one fixed order.
        np.random.shuffle(order)
        for j in range(0, num_inputs, batch_size):
            batch_index = order[j: j + batch_size]
            batch_inputs = inputs_train[batch_index]
            batch_labels = labels_train[batch_index]
            batch_feed = {inputs: batch_inputs, labels: batch_labels}
            sess.run(train_step, feed_dict=batch_feed)
except KeyboardInterrupt:
    # Allow stopping a long run by hand while keeping the session and costs.
    print('Interrupted')
finally:
    end_time = time.time()
    print('\ntime: %s' % (end_time - start_time))

Epoch 0 cost: train: 0.84746 / test: 0.849917
precision: 0.469, recall: 0.983
Epoch 100 cost: train: 0.0132941 / test: 0.917752
precision: 0.885, recall: 0.882
Epoch 200 cost: train: 0.0108253 / test: 1.04141
precision: 0.886, recall: 0.888
Epoch 300 cost: train: 0.00827338 / test: 1.09468
precision: 0.888, recall: 0.884
Epoch 400 cost: train: 0.00807151 / test: 1.13787
precision: 0.889, recall: 0.884
Epoch 500 cost: train: 0.00729369 / test: 1.17622
precision: 0.890, recall: 0.883
Epoch 600 cost: train: 0.00712899 / test: 1.21371
precision: 0.890, recall: 0.881
Epoch 700 cost: train: 0.0064577 / test: 1.22829
precision: 0.890, recall: 0.879
Epoch 800 cost: train: 0.00636252 / test: 1.24298
precision: 0.892, recall: 0.880
Epoch 900 cost: train: 0.00632875 / test: 1.24938
precision: 0.891, recall: 0.881
Epoch 1000 cost: train: 0.0062647 / test: 1.25138
precision: 0.891, recall: 0.881
Epoch 1100 cost: train: 0.00611953 / test: 1.248
precision: 0.891, recall: 0.881
Epoch 1200 cost: train: