### 作业1

测试你的 Language Model 在 batch_size = 1 VS batch_size = 125 时，预测完一定数量样本所需要的时间比（模型算出哪个词概率最高即可）

In [1]:
# 示例代码运行环境
%load_ext watermark
%watermark -p tensorflow,numpy -v -m

CPython 2.7.6
IPython 5.1.0

tensorflow 1.0.1
numpy 1.12.1

compiler   : GCC 4.8.4
system     : Linux
release    : 4.4.0-21-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 4
interpreter: 64bit


In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import time
from collections import Counter
import tensorflow as tf
import jieba
import jieba.posseg as pseg

## 读取

In [3]:
IGNORE = ' \n' # 忽略的字符
SEN_LENGTH = 20 # 句子预设长度
#PADDING = '<PD>' # 句子长度不足时的占位符
LIMIT = 3 # 构建词表时，选择词汇的最低频率

In [4]:
def cutset(s):
    words = []
    for w,f in pseg.cut(s.strip()):
        if f == 'x' and f == 'm' and (w in IGNORE):
            continue
        words.append(w)
    return words

In [5]:
def read_file(file):
    with open(file, 'r') as f:
        dataset = []
        rawdata = f.read().decode('utf-8')
        data = rawdata.split('\n')
        for w in data:
            dataset.append(w.split('\t'))
    
        sensetcut = []
        for i in range(len(dataset)-1):
            tmp = [cutset(dataset[i][0]), int(dataset[i][1])]
            sensetcut.append(tmp)
    
    return sensetcut

In [6]:
def get_vocab(sensetcut):
    words = []
    for s in sensetcut:
        for w in s[0]:
            words.append(w)
    word_cnt = Counter(words)
    #vocab = [i[0] for i in word_cnt.most_common(vocab_size - 1)] # 采用固定长度 vocab_size，也可以采用 word_cnt 不低于某值
    vocab = [i[0] for i in word_cnt.most_common() if (i[1] > LIMIT) ]
    vocab.insert(0, 'UNK')
    return vocab

In [7]:
def get_data(sensetcut, vocab):

    train_ids = []
    label_ids = []
    inputs = np.zeros(len(sensetcut)*SEN_LENGTH).reshape(len(sensetcut), SEN_LENGTH)
    labels = np.zeros(len(sensetcut))
    for i in range(len(sensetcut)):
        if len(sensetcut[i][0]) < 1:
            continue
        tmp1 = np.array([[vocab.index(word) if (word in vocab) else 0 for word in sensetcut[i][0]]])
        if tmp1.shape[1] < SEN_LENGTH:
            # 也可以用 extand：docs[i].extend([PADDING] * (DOC_LENGTH - len(docs[i])))
            tmp2 = np.array([np.zeros(SEN_LENGTH-tmp1.shape[1])])
            tmp = np.hstack((tmp1, tmp2))
        else:
            tmp = tmp1[0, 0:SEN_LENGTH]
        inputs[i] = tmp
        labels[i] = sensetcut[i][1]
    return inputs, labels

In [8]:
sensetcut_train = read_file('../w5_CNN/train_shuffle.txt')
sensetcut_test = read_file('../w5_CNN/test_shuffle.txt')

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.364 seconds.
Prefix dict has been built succesfully.


In [9]:
vocab_train = get_vocab(sensetcut_train)

In [84]:
inputs_train, labels_train = get_data(sensetcut_train, vocab_train)
inputs_test, labels_test = get_data(sensetcut_test, vocab_train)

In [85]:
inputs_train.shape, labels_train.shape, inputs_test.shape, labels_test.shape

((24586, 20), (24586,), (10538, 20), (10538,))

In [47]:
#labels_train = labels_train.reshape(len(labels_train),1)
#labels_test = labels_test.reshape(len(labels_test),1)

## 训练

In [12]:
vocab_size = len(vocab_train)
word_embed_size = 32
filter_num = 64
window_size = 3
num_fc_hidden = 10
num_labels = 2

In [76]:
tf.reset_default_graph()

In [77]:
W = tf.Variable(
                tf.random_uniform([vocab_size, word_embed_size], -1.0, 1.0),
                name="W")

inputs = tf.placeholder(tf.int32, shape=[None, SEN_LENGTH], name='inputs')
labels = tf.placeholder(tf.int32, shape=[None], name='labels')

In [78]:
embeds = tf.nn.embedding_lookup(W, inputs)
embeds_expand = tf.expand_dims(embeds, -1)

In [79]:
# max_pool
with tf.name_scope("conv-maxpool"):

    filter_shape = [window_size, word_embed_size, 1, filter_num]
    # W 和 b 是卷积的参数
    W = tf.Variable(tf.random_uniform(filter_shape, -1.0, 1.0), name="W")
    # bias 和 filter_num 个数是一样的
    b = tf.Variable(tf.constant(0.0, shape=[filter_num]), name="b")
    # 步长为1，这里不做 Padding，因此句子太短的话可能要丢掉
    # 原始语料已经做了 Padding
    conv = tf.nn.conv2d(
                    embeds_expand,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
    # 卷积出来的结果加上 bias
    conv_hidden = tf.nn.tanh(tf.add(conv, b), name="tanh")

    # 因为没有 padding，出来的结果个数是 sequence_length - window_size + 1，如果加了 padding 这里要对应更改。
    pool = tf.nn.max_pool(
                    conv_hidden,
                    ksize=[1, SEN_LENGTH - window_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")

In [80]:
fc = tf.layers.dense(pool, num_fc_hidden, activation=tf.nn.tanh)

squeezed_pool = tf.squeeze(fc, [1, 2]) 
raw_output = tf.layers.dense(squeezed_pool, num_labels)

#raw_output = tf.layers.dense(fc, num_labels, name='output')
output = tf.nn.softmax(raw_output)

In [81]:
cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=raw_output, labels=labels))

In [82]:
def evaluate_model(sess, inputs_, labels_, print_matrix=False):
    """评估模型指标，并打印输出"""
    pred_prob = sess.run(output, feed_dict={inputs:inputs_, labels:labels_})
    preds = np.asarray((pred_prob[:, 1] > 0.5), dtype=int)
    mat = sess.run(tf.confusion_matrix(labels_, preds))
    tn, fp, fn,  tp = mat.reshape(4)
    precision = np.float(tp) / (tp + fp)
    recall = np.float(tp) / (tp + fn)
    if print_matrix:
        print('confusion matrix:\n', mat)
    print('precision:% .3f, recall: %.3f' %(precision, recall))

In [86]:
learning_rate = 1
batch_size = 125
epochs = 3000
print_cost_every = 125

feed_train = {inputs: inputs_train, labels: labels_train}
feed_test = {inputs: inputs_test, labels: labels_test}

sess = tf.Session()
sess.run(tf.global_variables_initializer())
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

costs_train = []
costs_test = []
start_time = time.time()

num_inputs = len(labels_train)
order = np.arange(num_inputs)
np.random.shuffle(order)

try:
    for i in range(epochs):
        if i % print_cost_every == 0:
            cost_train = sess.run(cost, feed_dict=feed_train)
            cost_test = sess.run(cost, feed_dict=feed_test)
            print('Epoch %d cost: train: %s / test: %s' %(i, cost_train, cost_test))
            costs_train.append(cost_train)
            costs_test.append(cost_test)
            evaluate_model(sess, inputs_test, labels_test)
        for j in range(0, num_inputs, batch_size):
            batch_index = order[j: j + batch_size]
            batch_inputs = inputs_train[batch_index]
            batch_labels = labels_train[batch_index]
            batch_feed = {inputs: batch_inputs, labels: batch_labels}
            sess.run(train_step, feed_dict=batch_feed)
except KeyboardInterrupt:
    print('Interrupted')
finally:
    end_time = time.time()
    print('\ntime: %s' % (end_time - start_time))

Epoch 0 cost: train: 0.797152 / test: 0.796957
precision: 0.471, recall: 0.999
Epoch 125 cost: train: 0.033436 / test: 0.588153
precision: 0.893, recall: 0.884
Epoch 250 cost: train: 0.0218082 / test: 0.570705
precision: 0.905, recall: 0.880
Epoch 375 cost: train: 0.00498927 / test: 0.740818
precision: 0.914, recall: 0.885
Epoch 500 cost: train: 0.00424016 / test: 0.811011
precision: 0.907, recall: 0.888
Epoch 625 cost: train: 0.00401673 / test: 0.857259
precision: 0.900, recall: 0.892
Epoch 750 cost: train: 0.00396796 / test: 0.8719
precision: 0.903, recall: 0.892
Epoch 875 cost: train: 0.00394944 / test: 0.891875
precision: 0.904, recall: 0.892
Epoch 1000 cost: train: 0.00393985 / test: 0.903336
precision: 0.903, recall: 0.893
Epoch 1125 cost: train: 0.00393795 / test: 0.910647
precision: 0.902, recall: 0.894
Epoch 1250 cost: train: 0.00393274 / test: 0.924075
precision: 0.902, recall: 0.895
Epoch 1375 cost: train: 0.0039272 / test: 0.937047
precision: 0.902, recall: 0.896
Epoch 1500

### 调整 batch

In [None]:
learning_rate = 1
batch_size = 1
epochs = 3000
print_cost_every = 100

feed_train = {inputs: inputs_train, labels: labels_train}
feed_test = {inputs: inputs_test, labels: labels_test}

sess = tf.Session()
sess.run(tf.global_variables_initializer())
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

costs_train = []
costs_test = []
start_time = time.time()

num_inputs = len(labels_train)
order = np.arange(num_inputs)
np.random.shuffle(order)

try:
    for i in range(epochs):
        if i % print_cost_every == 0:
            cost_train = sess.run(cost, feed_dict=feed_train)
            cost_test = sess.run(cost, feed_dict=feed_test)
            print('Epoch %d cost: train: %s / test: %s' %(i, cost_train, cost_test))
            costs_train.append(cost_train)
            costs_test.append(cost_test)
            evaluate_model(sess, inputs_test, labels_test)
        for j in range(0, num_inputs, batch_size):
            batch_index = order[j: j + batch_size]
            batch_inputs = inputs_train[batch_index]
            batch_labels = labels_train[batch_index]
            batch_feed = {inputs: batch_inputs, labels: batch_labels}
            sess.run(train_step, feed_dict=batch_feed)
except KeyboardInterrupt:
    print('Interrupted')
finally:
    end_time = time.time()
    print('\ntime: %s' % (end_time - start_time))

Epoch 0 cost: train: 0.872736 / test: 0.872346
precision: 0.471, recall: 1.000
Epoch 100 cost: train: 5.7949 / test: 5.79462
precision: 0.471, recall: 1.000
Epoch 200 cost: train: 5.79602 / test: 5.79575
precision: 0.471, recall: 1.000
Epoch 300 cost: train: 5.79667 / test: 5.79619
precision: 0.471, recall: 1.000
Epoch 400 cost: train: 5.79682 / test: 5.79655
precision: 0.471, recall: 1.000
Epoch 500 cost: train: 5.79685 / test: 5.79665
precision: 0.471, recall: 1.000
Epoch 600 cost: train: 5.79685 / test: 5.79668
precision: 0.471, recall: 1.000
Epoch 700 cost: train: 5.79685 / test: 5.79669
precision: 0.471, recall: 1.000
Epoch 800 cost: train: 5.79725 / test: 5.79719
precision: 0.471, recall: 1.000
Epoch 900 cost: train: 5.79725 / test: 5.79719
precision: 0.471, recall: 1.000
Epoch 1000 cost: train: 5.79725 / test: 5.79719
precision: 0.471, recall: 1.000
Epoch 1100 cost: train: 5.79725 / test: 5.79719
precision: 0.471, recall: 1.000
Epoch 1200 cost: train: 5.79725 / test: 5.79719
pre

### softmax 优化

In [91]:
cost = tf.reduce_mean(tf.nn.nce_loss(raw_output, labels=labels))

TypeError: nce_loss() takes at least 6 arguments (2 given)

In [None]:
learning_rate = 1
batch_size = 125
epochs = 3000
print_cost_every = 125

feed_train = {inputs: inputs_train, labels: labels_train}
feed_test = {inputs: inputs_test, labels: labels_test}

sess = tf.Session()
sess.run(tf.global_variables_initializer())
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

costs_train = []
costs_test = []
start_time = time.time()

num_inputs = len(labels_train)
order = np.arange(num_inputs)
np.random.shuffle(order)

try:
    for i in range(epochs):
        if i % print_cost_every == 0:
            cost_train = sess.run(cost, feed_dict=feed_train)
            cost_test = sess.run(cost, feed_dict=feed_test)
            print('Epoch %d cost: train: %s / test: %s' %(i, cost_train, cost_test))
            costs_train.append(cost_train)
            costs_test.append(cost_test)
            evaluate_model(sess, inputs_test, labels_test)
        for j in range(0, num_inputs, batch_size):
            batch_index = order[j: j + batch_size]
            batch_inputs = inputs_train[batch_index]
            batch_labels = labels_train[batch_index]
            batch_feed = {inputs: batch_inputs, labels: batch_labels}
            sess.run(train_step, feed_dict=batch_feed)
except KeyboardInterrupt:
    print('Interrupted')
finally:
    end_time = time.time()
    print('\ntime: %s' % (end_time - start_time))