In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import tensorflow as tf
from collections import Counter
import jieba
import jieba.posseg as pseg
print(tf.__version__)

1.0.1


# 作业1

- 用神经网络建模，处理之前的情感分类问题

## 获得词向量

### 定义参数

In [103]:
# 决定了 embedding 的维度 （隐层节点数）
word_embedding_dim = 128
# 决定了词表数量, 预留一个未登录词
vocab_size = 10000 + 1
UNK_IDX = 0

In [104]:
word_embedding = tf.Variable(tf.random_uniform([vocab_size, word_embedding_dim]))
input_data = tf.placeholder(tf.int32, shape=[None, 833], name='input_data')
input_embeds = tf.nn.embedding_lookup(word_embedding, input_data)
context_embeds = tf.reduce_sum(input_embeds, axis=1)

In [105]:
input_embeds

<tf.Tensor 'embedding_lookup_1:0' shape=(?, 833, 128) dtype=float32>

In [106]:
word_embedding

<tensorflow.python.ops.variables.Variable at 0x7f09beec2110>

In [107]:
# raw_output 是一个 vocab_size 维的数据，对比 labels 计算 cost
# 假设输入一组（也就是 两个词），输出因为词向量相加过了，所以就是一个词的词向量：one-hot？
raw_output = tf.layers.dense(context_embeds, vocab_size)
# 如果输入一组，输出的 softmax 是预测的 one-hot 的概率分布？最可能的那个输出词概率最大？
output = tf.nn.softmax(raw_output)

In [108]:
# 样本的 labels 也需要用 placeholder 放置
labels = tf.placeholder(tf.int32, shape=[None], name='labels')

# 因为我们每个样本的 label 只有一个，使用稠密的 softmax 算 cost 及求导太浪费了。这里使用 sparse 版本即可。
# 如果你的 label 是完整的 N 个词上的概率分布，这时候可以使用 tf.nn.softmax_cross_entropy_with_logits
cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=raw_output, labels=labels)

### 训练集

In [22]:
SEN_LENGTH = 50
vocab_size = 10001
LIMIT = 3

In [119]:
def cutset(s):
    words = []
    for w,f in pseg.cut(s.strip()):
        if f == 'x':
            continue
        words.append(w)
    return words

In [120]:
def read_file(file):
    with open(file, 'r') as f:
        dataset = []
        rawdata = f.read().decode('utf-8')
        data = rawdata.split('\n')
        for w in data:
            dataset.append(w.split('\t'))
    
        sensetcut = []
        for i in range(len(dataset)-1):
            tmp = [cutset(dataset[i][0]), int(dataset[i][1])]
            sensetcut.append(tmp)
    
    return sensetcut

In [121]:
def get_vocab(sensetcut):
    words = []
    for s in sensetcut:
        for w in s[0]:
            words.append(w)
    word_cnt = Counter(words)
    #vocab = [i[0] for i in word_cnt.most_common(vocab_size - 1)] # 采用固定长度 vocab_size，也可以采用 word_cnt 不低于某值
    vocab = [i[0] for i in word_cnt.most_common() if (i[1] > LIMIT) ]
    vocab.insert(0, 'UNK')
    return vocab

In [122]:
def get_data(sensetcut, vocab):

    train_ids = []
    label_ids = []
    inputs = np.zeros(len(sensetcut)*SEN_LENGTH).reshape(len(sensetcut), SEN_LENGTH)
    labels = np.zeros(len(sensetcut))
    for i in range(len(sensetcut)):
        if len(sensetcut[i][0]) < 1:
            continue
        tmp1 = np.array([[vocab.index(word) if (word in vocab) else 0 for word in sensetcut[i][0]]])
        if tmp1.shape[1] < SEN_LENGTH:
            tmp2 = np.array([np.zeros(SEN_LENGTH-tmp1.shape[1])])
            tmp = np.hstack((tmp1, tmp2))
        else:
            tmp = tmp1[0, 0:SEN_LENGTH]
        inputs[i] = tmp
        labels[i] = sensetcut[i][1]
    return inputs, labels

In [None]:
sensetcut_train = read_file('./train_shuffle.txt')
sensetcut_test = read_file('./test_shuffle.txt')

vocab_train = get_vocab(sensetcut_train)

In [None]:
inputs_train, labels_train = get_data(sensetcut_train, vocab_train)

In [129]:
inputs_test, labels_test = get_data(sensetcut_test, vocab_train)

In [130]:
inputs_train.shape

(24586, 50)

In [131]:
inputs_test.shape

(10538, 50)

In [138]:
train_data = np.hstack((inputs_train, labels_train.reshape(len(inputs_train),1)))

In [139]:
test_data = np.hstack((inputs_test, labels_test.reshape(len(inputs_test),1)))

In [135]:
from random import shuffle

### 训练模型

In [135]:
shuffle(train_data)

In [141]:
train_step = tf.train.GradientDescentOptimizer(0.00002).minimize(cost)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for i in range(50000):
        #shuffle(train_data)
        temp = np.random.randint(0, len(train_data)-100)
        dummy_feed_dict = {input_data: train_data[temp:temp+10][:,:-1],
                       labels: train_data[temp:temp+10][:,-1]}
#         dummy_feed_dict = {input_data: inputs_trains[0:5],
#                         labels: labels_trains[0:5]}
        sess.run(train_step, feed_dict=dummy_feed_dict)
        if i % 5000 == 0:
            print("Iteration %d" % i)
            print("Cost 矩阵：")
            print(cost.eval(feed_dict=dummy_feed_dict))
            print("cost 长度：%d" % len(cost.eval(feed_dict=dummy_feed_dict)))
            # 查看输出中 ID == 30 的概率
            print("Output 长度: %d" % len(output.eval(feed_dict=dummy_feed_dict)[0]))
            print(output.eval(feed_dict=dummy_feed_dict).shape)
            print(output.eval(feed_dict=dummy_feed_dict)[0].shape)
            print("Output 矩阵：")
            print(output.eval(feed_dict=dummy_feed_dict))
            print("Probability: %f" % output.eval(feed_dict=dummy_feed_dict)[1, 30])
            print("------")
            # 词向量是 context_embeds 吗？
            allwords_embedding = context_embeds.eval(feed_dict=dummy_feed_dict)
            test = raw_output.eval(feed_dict=dummy_feed_dict)

Iteration 0
Cost 矩阵：
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
cost 长度：10
Output 长度: 10001
(10, 10001)
(10001,)
Output 矩阵：
[[ 0.  1.  0. ...,  0.  0.  0.]
 [ 0.  1.  0. ...,  0.  0.  0.]
 [ 0.  1.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  1.  0. ...,  0.  0.  0.]
 [ 0.  1.  0. ...,  0.  0.  0.]
 [ 0.  1.  0. ...,  0.  0.  0.]]
Probability: 0.000000
------
Iteration 5000
Cost 矩阵：
[  6.98424768e+00   5.61241579e+00   0.00000000e+00   5.81346226e+00
   1.10929072e-01   3.57621466e-05   4.76837045e-07   8.81045242e-04
   1.32185787e-01   7.41669655e+00]
cost 长度：10
Output 长度: 10001
(10, 10001)
(10001,)
Output 矩阵：
[[  9.26359906e-04   9.99072909e-01   2.10453657e-13 ...,   4.96110444e-12
    7.34816860e-11   5.46131716e-12]
 [  3.65223526e-03   9.96347845e-01   2.23020366e-15 ...,   7.24525420e-14
    6.90804293e-13   6.98540347e-14]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 ..., 
 [  8.80279986e-04   9.99119341e-01   1.2982114

#### 作业2

- 将 NN Language Model 的 cost 下降情况用 summary + tensorboard 的方式进行可视化
- 收集 Word embedding 的 histogram 并进行可视化

见 task2

#### 作业3

- 使用一个能够平滑图像的二维卷积对图像进行处理
- 应用卷积神经网络实现情感分类
  - 标准化格式，能够读取格式固定的训练、测试数据，跑出 Accuracy 和 Confusion Matrix 结果

见 task3