In [1]:
%matplotlib inline
from collections import Counter
import jieba
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
print(tf.__version__)

1.0.1


### 定义参数

In [78]:
# Start from an empty default graph so that re-running the graph-building cells below
# does not pile up duplicate ops from earlier executions.
tf.reset_default_graph()

In [79]:
# Id reserved for out-of-vocabulary ("unknown") words.
UNK_IDX = 0
# Width of every word embedding, i.e. the number of hidden-layer units.
word_embedding_dim = 30
# Vocabulary size: 400 regular words plus one slot reserved for the unknown word.
vocab_size = 1 + 400

### 定义变量

In [80]:
# Embedding table: one row of `word_embedding_dim` values per vocabulary id,
# initialised with tf.random_uniform's default range.
initial_embeddings = tf.random_uniform(shape=[vocab_size, word_embedding_dim])
word_embedding = tf.Variable(initial_value=initial_embeddings)
# Every example is a pair of word ids (shape [batch, 2]).
input_data = tf.placeholder(dtype=tf.int32, shape=[None, 2], name='input_data')
# Replace each id by its embedding row.
input_embeds = tf.nn.embedding_lookup(params=word_embedding, ids=input_data)

### 词向量相加

In [81]:
# Add up the embeddings along axis 1 (the two ids of each example), leaving a single
# combined vector per example.
context_embeds = tf.reduce_sum(input_embeds, axis=1)

### 映射到 N 个词的概率分布

In [82]:
# Project the summed context vector onto the vocabulary: `raw_output` holds one
# unnormalised score (logit) per vocabulary word for every example. These logits,
# not a one-hot vector, are what the cost compares against the labels.
raw_output = tf.layers.dense(inputs=context_embeds, units=vocab_size)
# Softmax turns the logits into a probability distribution over the vocabulary;
# the word the model considers most likely receives the largest probability.
output = tf.nn.softmax(logits=raw_output)

### cost

In [83]:
# The target word ids are fed through their own placeholder, one id per example.
labels = tf.placeholder(dtype=tf.int32, shape=[None], name='labels')

# Each example has exactly one correct word id, so the sparse cross-entropy is used:
# building a dense distribution just to compute the loss and its gradient would be wasteful.
# If the labels were full probability distributions over the vocabulary,
# tf.nn.softmax_cross_entropy_with_logits would be the op to use instead.
cost = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=raw_output)

In [84]:
# Summary ops that record `cost` and `context_embeds`; nothing is recorded until
# they are actually run in a session.
# A scalar summary accepts only a single value, so histograms are used here.
cost_summary = tf.summary.histogram(name='cost', values=cost)
context_embeds_summary = tf.summary.histogram(name='context_embeds', values=context_embeds)

# merge_all gathers every summary op registered on the graph into one op.
merged = tf.summary.merge_all()

### 读取语料，生成训练数据

In [9]:
# `jieba` itself is already imported in the first cell; only the POS tagger is needed here.
import jieba.posseg as pseg

# Corpus location and how many of its leading lines are used for training.
CORPUS_PATH = '../AssisantEvaluate/xiyouji.txt'
MAX_LINES = 500

line_no = 0
words = []
# Read bytes and decode explicitly: this behaves the same on Python 2 and Python 3.
with open(CORPUS_PATH, 'rb') as corpus_file:
    # Iterate over the file lazily instead of loading every line with readlines();
    # only the first MAX_LINES lines are consumed.
    for line_no, raw_line in enumerate(corpus_file, 1):
        if line_no > MAX_LINES:
            break
        # strip() removes the trailing newline before decoding.
        text = raw_line.strip().decode('utf-8')
        # The loop variables no longer reuse the name of the open file handle.
        for token, flag in pseg.cut(text):
            # Tokens tagged 'x' are skipped (presumably punctuation / non-words;
            # confirm against jieba's tag set).
            if flag == 'x':
                continue
            words.append(token)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.394 seconds.
Prefix dict has been built succesfully.


In [10]:
# Count how many times each token occurs in the corpus.
word_cnt = Counter(words)

In [11]:
# Frequency cut-off: keep only the (vocab_size - 1) most frequent tokens.
vocab = [token for token, _count in word_cnt.most_common(vocab_size - 1)]

In [12]:
# Put the out-of-vocabulary token at position UNK_IDX.
# NOTE: list.insert mutates `vocab` in place, so re-running this cell without first
# re-running the cell that builds `vocab` inserts another 'UNK' entry.
vocab.insert(UNK_IDX, 'UNK')

In [13]:
# Sanity check: number of vocabulary entries, including the 'UNK' entry.
len(vocab)

401

In [14]:
# Map every corpus token to its vocabulary id; tokens outside the vocabulary get UNK_IDX.
# A dict lookup replaces the two linear scans of `vocab` (`in` + `.index`) per token.
word_to_id = {}
for idx, token in enumerate(vocab):
    # setdefault keeps the first position of a duplicated entry, matching list.index().
    word_to_id.setdefault(token, idx)
word_ids = [word_to_id.get(word, UNK_IDX) for word in words]

In [15]:
# Build the training set: for every interior position the input is the pair
# (left neighbour id, right neighbour id) and the label is the id at that position.
left_ids = word_ids[:-2]
right_ids = word_ids[2:]
inputs_train = np.asarray([[left, right] for left, right in zip(left_ids, right_ids)])
labels_train = np.asarray(word_ids[1:-1])

In [16]:
# Sanity check: one row per training example, two context word ids per row.
inputs_train.shape

(6152, 2)

In [17]:
# Sanity check: one label per training example.
labels_train.shape

(6152,)

### 训练模型

In [89]:
# Event-file writer used by the training loop to log summaries into ./tf_log2.
writer = tf.summary.FileWriter("./tf_log2")  # no graph is attached to this writer

In [90]:
# Plain gradient descent on the per-example cross-entropy `cost`.
train_step = tf.train.GradientDescentOptimizer(0.0002).minimize(cost)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Full-batch training: every step is fed the complete training set.
    dummy_feed_dict = {input_data: inputs_train,
                       labels: labels_train}
    for i in range(500):
        # One optimisation step; the merged summaries are computed in the same run.
        _, merge_summary = sess.run([train_step, merged], feed_dict=dummy_feed_dict)
        writer.add_summary(merge_summary, i)

        if i % 50 == 0:
            print("Iteration %d" % i)

            # Fetch everything that is inspected below in a single forward pass
            # instead of one separate full-batch evaluation per tensor.
            # Note: `context_embeds` is the summed context vector of each example,
            # not the per-word embedding table (that table is `word_embedding`).
            cost_array, output_array, allwords_embedding, raw_outputs = sess.run(
                [cost, output, context_embeds, raw_output],
                feed_dict=dummy_feed_dict)

            # Per-example cost values.
            print("Cost 矩阵：")
            print(cost_array)
            print("Cost 矩阵 shape：")
            print(cost_array.shape)
            print('---')

            # Predicted probability distributions.
            print("Output 矩阵：")
            print(output_array)
            print("Output 矩阵 shape：")
            print(output_array.shape)
            print(output_array[0].shape)
            print('---')

            # Probability the first example assigns to word id 30.
            print("Probability: %f" % output_array[0, 30])
            print("------")

# Push the buffered summary events to disk; the writer stays open for further use.
writer.flush()

Iteration 0
Cost 矩阵：
[  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   1.80640697e+01
   2.27762260e+01   2.14576494e-06]
Cost 矩阵 shape：
(6152,)
---
Output 矩阵：
[[  1.00000000e+00   5.35232171e-11   8.17373183e-11 ...,   2.96555142e-11
    3.80226232e-11   7.48546017e-11]
 [  1.00000000e+00   5.35232171e-11   8.17373183e-11 ...,   2.96555142e-11
    3.80226232e-11   7.48546017e-11]
 [  1.00000000e+00   5.35232171e-11   8.17373183e-11 ...,   2.96555142e-11
    3.80226232e-11   7.48546017e-11]
 ..., 
 [  9.99993205e-01   2.19535377e-08   4.17109014e-08 ...,   2.08304751e-08
    1.71025860e-08   2.71710157e-08]
 [  1.00000000e+00   1.26094302e-10   2.40675951e-10 ...,   9.00574545e-11
    1.18625332e-10   1.82135973e-10]
 [  9.99997854e-01   6.56038379e-09   1.71953296e-08 ...,   7.27173166e-09
    5.47694334e-09   6.09611961e-09]]
Output 矩阵 shape：
(6152, 401)
(401,)
---
Probability: 0.000000
------
Iteration 50
Cost 矩阵：
[ 0.61527818  0.61527818  0.61527818 ...,  6.82514906  6.397

In [91]:
# Export the graph definition to the same log directory so it can be viewed in TensorBoard.
graph_writer = tf.summary.FileWriter("./tf_log2", graph=tf.get_default_graph())
# close() flushes the pending graph event to disk and releases the event file;
# without it the event may stay in the writer's buffer.
graph_writer.close()
graph_writer

<tensorflow.python.summary.writer.writer.FileWriter at 0x7eff273016d0>