In [1]:
import tensorflow as tf
import tarfile
import os
import numpy as np
import random
import datetime

print("{}".format(datetime.datetime.now()))

2018-11-28 15:44:35.966320


# 下载文件，解压

In [2]:
URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"


def generate_samples(data_dir, tmp_dir):
  """Unpack the IMDB archive once and return the extracted corpus directory.

  Args:
    data_dir: Path of the downloaded ``.tar.gz`` archive. Despite the name it
      is handed to ``tarfile.open`` as a file path, not a directory.
    tmp_dir: Directory the archive is extracted into.

  Returns:
    ``os.path.join(tmp_dir, "aclImdb")``.

  Raises:
    ValueError: If an archive member name resolves to a location outside
      ``tmp_dir``.
  """
  imdb_dir = os.path.join(tmp_dir, "aclImdb")
  # Extraction is skipped entirely when the target directory already exists.
  if not tf.gfile.Exists(imdb_dir):
    extract_root = os.path.realpath(tmp_dir)
    with tarfile.open(data_dir, "r:gz") as tar:
      # Basic path-traversal guard: reject member names such as "../x" or
      # absolute paths before anything is written to disk.
      for member in tar.getmembers():
        target = os.path.realpath(os.path.join(extract_root, member.name))
        if os.path.commonpath([extract_root, target]) != extract_root:
          raise ValueError(
              "Refusing to extract {!r}: path escapes {!r}".format(
                  member.name, tmp_dir))
      tar.extractall(tmp_dir)
  return imdb_dir
        
  
def doc_generator(imdb_dir, dataset, include_label=False):
  """Yield the review texts of one corpus split, ``pos`` files before ``neg``.

  Args:
    imdb_dir: Root directory of the extracted corpus.
    dataset: Name of the split sub-directory joined under ``imdb_dir``.
    include_label: When true, yield ``(text, label)`` pairs, where ``label``
      is ``True`` for files under ``pos`` and ``False`` for files under
      ``neg``; otherwise yield only the text.

  Yields:
    The whitespace-stripped content of every file, in ``os.listdir`` order.
  """
  labelled_dirs = [
      (os.path.join(imdb_dir, dataset, sentiment), is_positive)
      for sentiment, is_positive in (("pos", True), ("neg", False))
  ]
  for folder, is_positive in labelled_dirs:
    for entry in os.listdir(folder):
      with tf.gfile.Open(os.path.join(folder, entry)) as review_file:
        text = review_file.read().strip()
        if not include_label:
          yield text
        else:
          yield text, is_positive

In [3]:
# Fetch the archive with tf.keras.utils.get_file, naming the local file after
# the last URL path segment, then unpack it under ./tmp.
imdb_file = tf.keras.utils.get_file(URL.split("/")[-1], URL)
doc_dirs = generate_samples(imdb_file, "tmp")
# NOTE(review): in the two comprehensions below max_sent_len slices the raw
# string, so every review is cut to its first 500 *characters* (and then
# lower-cased). The same constant is later passed to data_to_ids as the padded
# *token* width; a 500-character text cannot contain anywhere near 500
# space-separated tokens, so most of each padded row is padding. Confirm that
# sharing one limit for both purposes is intended.
max_sent_len = 500
doc_label_train = [(doc[:max_sent_len].lower(), label) for doc, label in doc_generator(doc_dirs,"train", True)]
doc_label_test = [(doc[:max_sent_len].lower(), label) for doc, label in doc_generator(doc_dirs,"test", True)]

# 生成词典，token id化
主要参考 tf.keras.preprocessing.text.Tokenizer 来进行 ID 化（下面的代码实现了一个接口类似的简化版 Tokenizer）。这个类的资料比较少，不过直接看[源码](https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py)也很容易。另外有一份文档可以参考：[Tokenizer 文档](https://faroit.github.io/keras-docs/1.2.2/preprocessing/text/)

In [4]:
import collections
class Tokenizer:
    """Minimal word-level tokenizer that maps text to integer ids.

    The vocabulary starts with the predefined special tokens (ids 0, 1, ...)
    followed by the most frequent corpus words, capped at ``num_words``
    entries in total.

    Attributes set by ``__init__`` / ``fit_on_texts``:
        num_words, filters, split: the constructor arguments, stored as given.
        predifine_token: private list copy of the special tokens.
        counter: ``collections.Counter`` of word frequencies; it accumulates
            across successive ``fit_on_texts`` calls.
        words: vocabulary as a list, position == id.
        word_index / index_word: word -> id and id -> word dictionaries
            (empty until ``fit_on_texts`` has been called).
    """

    def __init__(self,
                 num_words,
                 filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                 split=' ',
                 predifine_token=('<UNK>', '<GO>', '<PAD>', '<EOS>')):
        """Store the configuration and build the character filter table.

        Args:
            num_words: Maximum vocabulary size, special tokens included.
            filters: Characters that are replaced by ``split`` before
                splitting. The apostrophe is not part of the default set.
            split: Token separator.
            predifine_token: Special tokens placed at the start of the
                vocabulary, in order. Any iterable of strings is accepted.
        """
        self.num_words = num_words
        self.filters = filters
        self.split = split
        self.counter = collections.Counter()
        # Keep private copies: neither the caller's sequence nor the default
        # argument may be aliased by the mutable vocabulary list.
        self.predifine_token = list(predifine_token)
        self.words = list(self.predifine_token)
        self.word_index = {}
        self.index_word = {}
        self.translate_map = str.maketrans({c: split for c in filters})

    def __split_text(self, text):
        """Lower-case ``text``, blank out ``<br />`` and filter characters,
        and return the non-empty tokens."""
        cleaned = text.lower().replace("<br />", " ").translate(self.translate_map)
        return [part for part in cleaned.split(self.split) if part]

    def fit_on_texts(self, texts):
        """Update the word counts with ``texts`` and rebuild the vocabulary.

        Counts accumulate over repeated calls; the vocabulary is rebuilt from
        the accumulated counts every time.

        Args:
            texts: Iterable of strings.
        """
        for doc in texts:
            self.counter.update(self.__split_text(doc))

        self.words = list(self.predifine_token)
        reserved = set(self.words)
        budget = self.num_words - len(self.words)
        if budget > 0:
            # Ask for a few extra candidates so that corpus words colliding
            # with a predefined token can be skipped without shrinking the
            # vocabulary; appending them again would leave holes in
            # index_word.
            for word, _count in self.counter.most_common(budget + len(reserved)):
                if word in reserved:
                    continue
                self.words.append(word)
                if len(self.words) >= self.num_words:
                    break
        self.word_index = {w: i for i, w in enumerate(self.words)}
        self.index_word = {i: w for w, i in self.word_index.items()}

    def texts_to_sequences(self, texts):
        """Convert each text into a list of vocabulary ids.

        Out-of-vocabulary words map to the id of ``"<UNK>"``. When the
        vocabulary has no ``"<UNK>"`` entry such words are dropped, so the
        result never contains ``None``.

        Args:
            texts: Iterable of strings.

        Returns:
            A list with one list of int ids per input text.
        """
        unknown = self.word_index.get("<UNK>")
        sequences = []
        for text in texts:
            ids = (self.word_index.get(word, unknown)
                   for word in self.__split_text(text))
            sequences.append([i for i in ids if i is not None])
        return sequences
        

def Test_Tokenizer():
    """Smoke-test Tokenizer: fit on two tiny corpora, then print the ids of a
    sentence that contains one out-of-vocabulary word."""
    tokenizer_under_test = Tokenizer(500)
    for sample in ("is's is a test. Hass Test", "is's is a test. Hass2 Test"):
        tokenizer_under_test.fit_on_texts([sample])
    print(tokenizer_under_test.texts_to_sequences(["is's is a test. sabs, test"]))


Test_Tokenizer()

[[5, 6, 7, 4, 0, 4]]


In [5]:
# Vocabulary cap (special tokens included); also the number of rows of the
# embedding matrices built further down.
vocab_size = 15000


def init_tokenizer(docs, num_words=None):
  """Create a Tokenizer and fit its vocabulary on ``docs``.

  Args:
    docs: Iterable of raw text strings.
    num_words: Vocabulary cap handed to ``Tokenizer``. Defaults to the
      module-level ``vocab_size`` when omitted.

  Returns:
    The fitted ``Tokenizer`` instance.
  """
  # The Keras equivalent would be
  #   tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token="<UNK>")
  # which offers further helpers (to_json, word_docs, word_counts, ...); see
  # its source for details. The local Tokenizer class is used instead.
  tokenizer = Tokenizer(vocab_size if num_words is None else num_words)
  tokenizer.fit_on_texts(docs)
  return tokenizer

# Fit the vocabulary on the training texts only. The labels are skipped inside
# the generator expression instead of being unpacked into `_`, because binding
# `_` at notebook top level shadows IPython's "last output" variable.
docs = tuple(doc for doc, _label in doc_label_train)
tokenizer = init_tokenizer(docs)
# Show the first ten (id, word) pairs of the fitted vocabulary.
list(tokenizer.index_word.items())[:10]

[(0, '<UNK>'),
 (1, '<GO>'),
 (2, '<PAD>'),
 (3, '<EOS>'),
 (4, 'the'),
 (5, 'a'),
 (6, 'and'),
 (7, 'of'),
 (8, 'to'),
 (9, 'is')]

In [6]:
# Sanity check: show the vocabulary cap stored on the tokenizer.
print(tokenizer.num_words)

15000


# PAD
训练时每个语句的长短不一致，需要将句子补齐到一致的长度


tokenizer.texts_to_sequences 用于把包含多个文本串的list，转换成对应的id化的list

In [7]:
def data_to_ids(doc_label, tokenizer, max_len):
  """Turn (text, label) pairs into padded id rows, one-hot labels and lengths.

  Args:
    doc_label: Non-empty sequence of ``(text, label)`` pairs; a truthy label
      marks a positive example.
    tokenizer: Object providing ``texts_to_sequences``. If it has a
      ``word_index`` dict containing ``"<PAD>"``, that id is used as padding
      value; otherwise the value 2 is used.
    max_len: Width of every padded row.

  Returns:
    A ``zip`` iterator over ``(padded_ids, one_hot_label, length)`` triples.
    ``one_hot_label`` is ``[1, 0]`` for a truthy label and ``[0, 1]``
    otherwise. ``length`` is the number of ids of the text, capped at
    ``max_len`` so it never exceeds the row width.
  """
  docs, labels = zip(*doc_label)

  # Debug output retained from the original cell.
  print(type(docs))
  docs_ids = tokenizer.texts_to_sequences(docs)
  # Look the padding id up instead of hard-coding the position of "<PAD>".
  pad_id = getattr(tokenizer, "word_index", {}).get("<PAD>", 2)
  docs_ids_pad = tf.keras.preprocessing.sequence.pad_sequences(
      docs_ids, maxlen=max_len, dtype="int32", padding="post", value=pad_id)
  labels_ids = [[1, 0] if lbl else [0, 1] for lbl in labels]
  # pad_sequences truncates rows longer than max_len, so the reported length
  # must be capped as well to stay usable as an RNN sequence_length.
  lengths = [min(len(doc), max_len) for doc in docs_ids]
  return zip(docs_ids_pad, labels_ids, lengths)

In [8]:

# Materialise the zip iterators returned by data_to_ids into lists so the
# samples can be indexed, sliced into batches and shuffled by the cells below.
doc_ids_train = list(data_to_ids(doc_label_train, tokenizer, max_sent_len))
doc_ids_test = list(data_to_ids(doc_label_test, tokenizer, max_sent_len))



<class 'tuple'>
<class 'tuple'>


In [9]:
from functools import reduce
# Round-trip check on training sample 11: print the raw text, its ids with the
# padding id (2) removed, and the words those ids map back to.
print(doc_label_train[11][0])
sample_ids = [x for x in doc_ids_train[11][0] if x != 2]
xx = "".join(str(x) + " " for x in sample_ids)
txt = "".join(tokenizer.index_word.get(x) + " " for x in sample_ids)

print(xx)
print(txt)

as much as i have enjoyed the hanzo the razor movies, three is definitely enough: 'who's got the gold?', the final adventure for the japanese lawman with the impressive package, is a fairly enjoyable piece of pinku cinema, but offers little new in terms of ideas whilst taking a big step backwards as far as outrageousness is concerned.<br /><br />the film opens with the appearance of a female ghost, and looks as though it is going to explore supernatural territory, something which might have take
17 79 17 10 24 370 4 7023 4 5315 73 269 9 438 230 0 165 4 1783 846 4 731 955 21 4 882 12348 19 4 1309 5529 9 5 992 687 381 7 0 414 20 1712 117 147 12 1409 7 988 1921 700 5 178 1651 6424 17 229 17 0 9 2216 4 18 2076 19 4 1549 7 5 746 1123 6 324 17 176 13 9 168 8 3548 2533 3373 152 74 270 24 212 
as much as i have enjoyed the hanzo the razor movies three is definitely enough <UNK> got the gold ' the final adventure for the japanese lawman with the impressive package is a fairly enjoyable piece of

# 开始定义网络结构

训练参数

In [10]:
embedding_size = 100    # width of each word-embedding vector
hidden_unit_num = 150   # number of units of the LSTM cell
batch_size = 32         # samples per training / evaluation batch
epoch_count = 50        # passes over the training set
n_classes = 2           # width of the one-hot label / output layer
eval_batch = 500        # evaluate on test data every `eval_batch` batches
show_batch = 50         # print the training loss every `show_batch` batches

# 第一版的实现
不能收敛

In [None]:


# Version 1 of the model, kept as the non-converging baseline:
# embedding lookup -> single LSTM -> fully connected layer -> softmax.
graph = tf.Graph()
with graph.as_default():
    # Token ids, one padded row per sample, and the matching one-hot labels.
    inputX = tf.placeholder(tf.int32, shape = [None, max_sent_len])
    inputY = tf.placeholder(tf.float32, [None, n_classes])
    embeddings = tf.get_variable("word_embeddings", [vocab_size, embedding_size])
    data_embedding = tf.nn.embedding_lookup(embeddings, inputX)
    lstm = tf.nn.rnn_cell.LSTMCell(hidden_unit_num)
    # No sequence_length is passed, so the RNN is unrolled over all
    # max_sent_len positions of every row and rnn_state is the state after
    # the last position, padding included.
    rnn_output, rnn_state = tf.nn.dynamic_rnn(lstm, data_embedding, dtype=tf.float32)
    FC_W = tf.get_variable("fc_w", shape=[hidden_unit_num, n_classes])
    FC_B = tf.get_variable("fc_b", shape=[n_classes])
    # The sigmoid squashes both scores into (0, 1) before the softmax, which
    # confines each class probability to roughly [0.27, 0.73].
    logits = tf.sigmoid(tf.matmul(rnn_state.h, FC_W) + FC_B)
    model_Y = tf.nn.softmax(logits)
    # Hand-written cross-entropy. reduce_mean averages over batch AND class
    # axis, so the value is the per-sample cross-entropy divided by n_classes;
    # a constant 50/50 prediction gives ln(2)/2 ~= 0.3466.
    loss = -tf.reduce_mean(inputY * tf.log(model_Y))
    optimizer = tf.train.AdagradOptimizer(0.3)
    train_op = optimizer.minimize(loss)
    init_op = tf.global_variables_initializer()
    predict_ret = tf.argmax(model_Y, axis=1)
    true_ret = tf.argmax(inputY, axis = 1)
#     print(predict_ret)
#     print(true_ret)
#     int_ret = (predict_ret == true_ret)
    # NOTE(review): accurcy / up_op are not referenced by train_raw, which
    # computes accuracy with numpy instead. tf.metrics ops also rely on local
    # variables, which init_op (global variables only) does not initialise.
    accurcy, up_op = tf.metrics.accuracy(true_ret, predict_ret)

def train_raw(tfgraph, batch_size, epoch_count, doc_ids_train, doc_ids_test):
    """Train the version-1 graph and periodically print loss and accuracy.

    The ops and placeholders (``init_op``, ``train_op``, ``loss``,
    ``predict_ret``, ``inputX``, ``inputY``) are read from module globals, so
    the cell that builds ``tfgraph`` must be the most recently executed
    graph-building cell that defines these names.

    Args:
        tfgraph: The ``tf.Graph`` the global ops belong to.
        batch_size: Number of samples per training step; a trailing partial
            batch is dropped.
        epoch_count: Number of passes over the training data.
        doc_ids_train: List of ``(ids, one_hot_label, length)`` triples.
        doc_ids_test: List of triples of the same shape, sampled for the
            quick accuracy check.

    Neither input list is modified; shuffling happens on private copies.
    """
    eval_sample_size = 10  # size of the quick test-set probe
    train_data = list(doc_ids_train)
    test_pool = list(doc_ids_test)
    batch_count = len(train_data) // batch_size
    # The context manager closes the session (and frees its resources) even
    # when training is interrupted.
    with tf.Session(graph=tfgraph) as sess:
        sess.run(init_op)
        for epoch in range(epoch_count):
            random.shuffle(train_data)
            for batch_index in range(batch_count):
                start = batch_index * batch_size
                inX, inY, _xlen = zip(*train_data[start:start + batch_size])
                _, ret = sess.run([train_op, loss], feed_dict={inputX: inX, inputY: inY})

                if batch_index % show_batch == 0:
                    print("{} \t epoch:{}\t batch_index:{}\t loss:{}".format(datetime.datetime.now(),epoch, batch_index, ret))

                if batch_index % eval_batch == 0:
                    random.shuffle(test_pool)
                    test_inX, test_inY, _test_xlen = zip(*test_pool[:eval_sample_size])
                    # Fetch the tensor directly so `py` is an array, not a
                    # one-element list.
                    py = sess.run(predict_ret, feed_dict={inputX: test_inX})
                    y = np.argmax(test_inY, 1)
                    accuracy = np.mean(py == y)
                    print("{} \t epoch:{} \t batch_index:{} \t accuracy:{}".format(datetime.datetime.now(),epoch, batch_index, accuracy))


train_raw(graph, batch_size, epoch_count, doc_ids_train, doc_ids_test)

2018-11-28 15:44:46.665128 	 epoch:0	 batch_index:0	 loss:0.35230016708374023
2018-11-28 15:44:46.842621 	 epoch:0 	 batch_index:0 	 accuracy:0.6
2018-11-28 15:45:01.899348 	 epoch:0	 batch_index:50	 loss:0.34785908460617065
2018-11-28 15:45:17.093456 	 epoch:0	 batch_index:100	 loss:0.34630095958709717
2018-11-28 15:45:32.678048 	 epoch:0	 batch_index:150	 loss:0.3433159589767456
2018-11-28 15:45:48.093152 	 epoch:0	 batch_index:200	 loss:0.34873127937316895
2018-11-28 15:46:03.630770 	 epoch:0	 batch_index:250	 loss:0.3439143896102905
2018-11-28 15:46:18.830237 	 epoch:0	 batch_index:300	 loss:0.3478662073612213
2018-11-28 15:46:34.646806 	 epoch:0	 batch_index:350	 loss:0.3478981852531433
2018-11-28 15:46:50.126241 	 epoch:0	 batch_index:400	 loss:0.34836164116859436
2018-11-28 15:47:05.593406 	 epoch:0	 batch_index:450	 loss:0.3477240800857544
2018-11-28 15:47:20.987085 	 epoch:0	 batch_index:500	 loss:0.34738168120384216
2018-11-28 15:47:21.137271 	 epoch:0 	 batch_index:500 	 acc

2018-11-28 16:07:55.648590 	 epoch:5	 batch_index:650	 loss:0.3462136685848236
2018-11-28 16:08:10.856084 	 epoch:5	 batch_index:700	 loss:0.3464123606681824
2018-11-28 16:08:25.850326 	 epoch:5	 batch_index:750	 loss:0.34554994106292725
2018-11-28 16:08:35.157023 	 epoch:6	 batch_index:0	 loss:0.34676626324653625
2018-11-28 16:08:35.306441 	 epoch:6 	 batch_index:0 	 accuracy:0.3
2018-11-28 16:08:50.586829 	 epoch:6	 batch_index:50	 loss:0.3465060591697693
2018-11-28 16:09:05.558919 	 epoch:6	 batch_index:100	 loss:0.3470291495323181
2018-11-28 16:09:20.800670 	 epoch:6	 batch_index:150	 loss:0.3464221954345703
2018-11-28 16:09:35.858120 	 epoch:6	 batch_index:200	 loss:0.34614482522010803
2018-11-28 16:09:50.831974 	 epoch:6	 batch_index:250	 loss:0.3495182991027832
2018-11-28 16:10:06.021419 	 epoch:6	 batch_index:300	 loss:0.34661054611206055
2018-11-28 16:10:21.214183 	 epoch:6	 batch_index:350	 loss:0.34819406270980835
2018-11-28 16:10:36.255597 	 epoch:6	 batch_index:400	 loss:0

2018-11-28 16:31:04.597959 	 epoch:11	 batch_index:550	 loss:0.34743890166282654
2018-11-28 16:31:19.721689 	 epoch:11	 batch_index:600	 loss:0.34628498554229736
2018-11-28 16:31:35.112716 	 epoch:11	 batch_index:650	 loss:0.34618765115737915
2018-11-28 16:31:50.491794 	 epoch:11	 batch_index:700	 loss:0.3462604582309723
2018-11-28 16:32:05.739168 	 epoch:11	 batch_index:750	 loss:0.34633582830429077
2018-11-28 16:32:15.075721 	 epoch:12	 batch_index:0	 loss:0.3466365337371826
2018-11-28 16:32:15.217592 	 epoch:12 	 batch_index:0 	 accuracy:0.5
2018-11-28 16:32:30.516915 	 epoch:12	 batch_index:50	 loss:0.3508475720882416
2018-11-28 16:32:45.806698 	 epoch:12	 batch_index:100	 loss:0.347635418176651
2018-11-28 16:33:01.007105 	 epoch:12	 batch_index:150	 loss:0.3465837240219116
2018-11-28 16:33:16.030298 	 epoch:12	 batch_index:200	 loss:0.34658390283584595
2018-11-28 16:33:31.359286 	 epoch:12	 batch_index:250	 loss:0.3462586998939514
2018-11-28 16:33:46.514503 	 epoch:12	 batch_index

# 第二版的实现
第一版的问题：
1. 训练收敛太慢
1. 不收敛

改动
1. 增加了dynamic_rnn的sequence_length参数
1. FC层后的loss采用tf带的函数，不用自己写的
1. 准确率的计算放在graph中计算
1. （可能）vocab 的大小改为 12000，max_sent_len 改为 500，提升速度

上面的代码在运行时是不能收敛的。[RNNs in Tensorflow, a Practical Guide and Undocumented Features](http://www.wildml.com/2016/08/rnns-in-tensorflow-a-practical-guide-and-undocumented-features/) 中有一些 dynamic_rnn 的使用注意事项，我们可以据此修正一下，再看看效果。

*加了之后没有反应呀*

In [None]:
# Version 2 of the model: same layers as version 1, but dynamic_rnn receives
# the true sequence lengths and the loss is TensorFlow's built-in softmax
# cross-entropy on the raw scores.
graph_seqlength = tf.Graph()
with graph_seqlength.as_default():
    inputX = tf.placeholder(tf.int32, shape = [None, max_sent_len])
    inputY = tf.placeholder(tf.float32, [None, n_classes])
    # Number of real (non-padding) tokens of each row of inputX.
    input_xlen = tf.placeholder(tf.int32, shape=[None])
    embeddings = tf.get_variable("word_embeddings", [vocab_size, embedding_size])
    data_embedding = tf.nn.embedding_lookup(embeddings, inputX)
    lstm = tf.nn.rnn_cell.LSTMCell(hidden_unit_num)
    # initial_state = lstm.zero_state(batch_size, tf.float32)
    rnn_output, rnn_state = tf.nn.dynamic_rnn(lstm, data_embedding, dtype=tf.float32,
                                             sequence_length = input_xlen)
    #,                                         initial_state = initial_state)
    FC_W = tf.get_variable("fc_w", shape=[hidden_unit_num, n_classes])
    FC_B = tf.get_variable("fc_b", shape=[n_classes])
    
    # Unnormalised class scores; they are fed to the loss as logits.
    pred = tf.matmul(rnn_state.h, FC_W) + FC_B
    # logits = tf.sigmoid(pred)
    # Why no sigmoid here? With it the model did not converge, so the loss
    # function was replaced as well.
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=inputY))
    
    # model_Y = tf.nn.softmax(logits)
    # loss = -tf.reduce_mean(inputY * tf.log(model_Y))
    # Despite its name, `optimizer` holds the op returned by minimize(), i.e.
    # the training step itself.
    optimizer = tf.train.AdagradOptimizer(0.2).minimize(loss)
    # train_op = optimizer
    init_op = tf.global_variables_initializer()
    # Fraction of rows whose highest-scoring class matches the one-hot label.
    correctPred = tf.equal(tf.argmax(pred, axis=1), tf.argmax(inputY, axis = 1))
    accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

def train_graph_seqlength(tfgraph, batch_size, epoch_count, doc_ids_train, doc_ids_test):
    """Train a sequence-length-aware graph and periodically evaluate it.

    The ops and placeholders (``init_op``, ``optimizer``, ``loss``,
    ``accuracy``, ``inputX``, ``inputY``, ``input_xlen``) are read from module
    globals, so the cell that builds ``tfgraph`` must be the most recently
    executed graph-building cell that defines these names.

    Args:
        tfgraph: The ``tf.Graph`` the global ops belong to.
        batch_size: Samples per training and evaluation batch; trailing
            partial batches are dropped.
        epoch_count: Number of passes over the training data.
        doc_ids_train: List of ``(ids, one_hot_label, length)`` triples.
        doc_ids_test: List of triples of the same shape; every
            ``eval_batch`` steps all of its full batches are evaluated.

    Neither input list is modified; the training data is shuffled on a
    private copy.
    """
    train_data = list(doc_ids_train)
    batch_count = len(train_data) // batch_size
    test_batch_count = len(doc_ids_test) // batch_size
    # The context manager closes the session even if training is interrupted.
    with tf.Session(graph=tfgraph) as sess:
        sess.run(init_op)
        for epoch in range(epoch_count):
            random.shuffle(train_data)
            for batch_index in range(batch_count):
                start = batch_index * batch_size
                inX, inY, xlen = zip(*train_data[start:start + batch_size])
                _, ret, accu = sess.run([optimizer, loss, accuracy], feed_dict={inputX:inX, inputY:inY, input_xlen:xlen})

                if batch_index % show_batch == 0:
                    print("{} \t epoch:{}\t batch_index:{}\t loss:{} \t accuracy:{}".format(datetime.datetime.now(),epoch, batch_index, ret, accu))

                if batch_index % eval_batch == 0:
                    arr_accu = []
                    for test_index in range(test_batch_count):
                        # Step by whole batches: using the batch index itself
                        # as the offset would produce overlapping windows
                        # that only cover the head of the test set.
                        test_start = test_index * batch_size
                        test_data = doc_ids_test[test_start:test_start + batch_size]
                        test_inX, test_inY, test_xlen = zip(*test_data)
                        batch_accu = sess.run(accuracy, feed_dict={inputX:test_inX, inputY:test_inY, input_xlen: test_xlen})
                        arr_accu.append(batch_accu)
                    print("{} \t epoch:{} \t batch_index:{} \t accuracy:{}".format(datetime.datetime.now(),epoch, batch_index, np.mean(arr_accu)))


train_graph_seqlength(graph_seqlength, batch_size, epoch_count, doc_ids_train, doc_ids_test)

# 第三版
第二版的问题：
1. 收敛太慢
1. 测试集上的准确率在80%左右，训练集100%，有过拟合嫌疑，可以通过优化提升
1. 网络结构上可以再调整下，加深深度和改变结构

In [None]:
# Version 3 of the model: version 2 with the LSTM cell wrapped in dropout.
graph_v3 = tf.Graph()
with graph_v3.as_default():
    inputX = tf.placeholder(tf.int32, shape = [None, max_sent_len])
    inputY = tf.placeholder(tf.float32, [None, n_classes])
    # Number of real (non-padding) tokens of each row of inputX.
    input_xlen = tf.placeholder(tf.int32, shape=[None])
    embeddings = tf.get_variable("word_embeddings", [vocab_size, embedding_size])
    data_embedding = tf.nn.embedding_lookup(embeddings, inputX)
    lstm = tf.nn.rnn_cell.LSTMCell(hidden_unit_num)
    # NOTE(review): the keep probabilities are Python constants, so the same
    # dropout is part of the graph whenever it is run - including the
    # evaluation passes in train_graph_seqlength. Confirm this is intended.
    dropcell = tf.nn.rnn_cell.DropoutWrapper(lstm, output_keep_prob = 0.75, state_keep_prob = 0.75)
    rnn_output, rnn_state = tf.nn.dynamic_rnn(dropcell, data_embedding, dtype=tf.float32,
                                             sequence_length = input_xlen)
    FC_W = tf.get_variable("fc_w", shape=[hidden_unit_num, n_classes])
    FC_B = tf.get_variable("fc_b", shape=[n_classes])
    
    # Unnormalised class scores; they are fed to the loss as logits.
    pred = tf.matmul(rnn_state.h, FC_W) + FC_B
    # logits = tf.sigmoid(pred)
    # Why no sigmoid here? With it the model did not converge, so the loss
    # function was replaced as well.
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=inputY))
    
    # model_Y = tf.nn.softmax(logits)
    # loss = -tf.reduce_mean(inputY * tf.log(model_Y))
    # Despite its name, `optimizer` holds the op returned by minimize(), i.e.
    # the training step itself.
    optimizer = tf.train.AdagradOptimizer(0.2).minimize(loss)
    # train_op = optimizer
    init_op = tf.global_variables_initializer()
    # Fraction of rows whose highest-scoring class matches the one-hot label.
    correctPred = tf.equal(tf.argmax(pred, axis=1), tf.argmax(inputY, axis = 1))
    accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

# Reuses the version-2 training loop; it picks up the ops defined just above
# through the module-level names.
train_graph_seqlength(graph_v3, batch_size, epoch_count, doc_ids_train, doc_ids_test)

# 参考资料
* [Perform sentiment analysis with LSTMs, using TensorFlow](https://www.oreilly.com/learning/perform-sentiment-analysis-with-lstms-using-tensorflow)
* [一个dynamic_rnn的实现](https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/dynamic_rnn.py)
* [RNNs in Tensorflow, a Practical Guide and Undocumented Features](https://github.com/dennybritz/tf-rnn)
* 为什么不能用自己实现的交叉熵？因为自己实现的 softmax 可能会有数值溢出问题，例如 np.exp(710) 的结果是 inf，参见 [https://github.com/AliAbbasi/Numerically-Stable-Cross-Entropy-Loss-Function-Tensorflow](https://github.com/AliAbbasi/Numerically-Stable-Cross-Entropy-Loss-Function-Tensorflow)