## 导入相关库
- 得到训练所需的原始数据

In [19]:
import numpy as np
import tensorflow as tf
import os
# Switch to the dataset directory so both files can be opened by bare name.
os.chdir('C:/test/tensorflow/Dataset')
# Read the complete review text and the complete label text into memory.
with open('reviews.txt') as reviews_file, open('labels.txt') as labels_file:
    reviews = reviews_file.read()
    labels = labels_file.read()
    

### 对原始数据进行预处理
- 去除数据中的无关的分隔符，得到更加干净的文本

In [20]:
from string import punctuation

# Delete every character listed in string.punctuation in a single pass;
# str.translate avoids a Python-level membership test per character.
all_text = reviews.translate(str.maketrans('', '', punctuation))
# The raw text holds one review per line.
reviews = all_text.split('\n')
# Re-join with spaces so the last word of one review and the first word of
# the next stay separate, then tokenize on whitespace.
all_text = ' '.join(reviews)
words = all_text.split()

### 建立检索字典

In [21]:
from collections import Counter

# Frequency of every token in the corpus.
counts = Counter(words)
# Vocabulary ordered from the most frequent word to the least frequent one.
vocab = [word for word, _ in counts.most_common()]
# Word ids start at 1 so that 0 remains available as the padding value.
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))
# Encode each review as the list of ids of its whitespace-separated words.
reviews_ints = [
    [vocab_to_int[token] for token in review.split()]
    for review in reviews
]

### 对labels进行处理

In [22]:
# One label per line: 'positive' becomes 1, every other line becomes 0.
labels = labels.split('\n')
labels = np.array([int(label == 'positive') for label in labels])


### 删除文本中空数据
- 删除没有评论内容的样本

In [25]:
# Positions of the reviews that contain at least one word id.
non_zero_idx = [idx for idx, encoded in enumerate(reviews_ints) if encoded]
# Filter reviews and labels with the same positions so they stay aligned.
reviews_ints = [reviews_ints[idx] for idx in non_zero_idx]
labels = np.array([labels[idx] for idx in non_zero_idx])

### 对数据进行padding
- 对长度少于200的评论在左侧用0进行padding, 对长于200的评论只截取前200个词

In [27]:
seq_len = 200
# One row per review, pre-filled with the padding value 0.
features = np.zeros((len(reviews_ints), seq_len), dtype=np.int32)
for i, row in enumerate(reviews_ints):
    # Keep at most the first seq_len ids of the review.
    kept = row[:seq_len]
    n_kept = len(kept)
    # An empty review stays an all-zero row. Without this guard the slice
    # `-0:` would select the whole row and the assignment would fail.
    if n_kept:
        # Right-align the ids so that the zeros end up on the left.
        features[i, seq_len - n_kept:] = kept
    

### 对数据进行切分
- Training, Validation, Test

In [28]:
split_frac = 0.8
# The first 80% of the samples form the training set.
split_idx = int(len(features) * split_frac)
train_x, train_y = features[:split_idx], labels[:split_idx]
# The remainder is divided in half: validation first, test second.
_holdout_x, _holdout_y = features[split_idx:], labels[split_idx:]
test_idx = int(len(_holdout_x) * 0.5)
val_x, val_y = _holdout_x[:test_idx], _holdout_y[:test_idx]
test_x, test_y = _holdout_x[test_idx:], _holdout_y[test_idx:]

## Build the Graph
### 定义超参数和定义Variable

In [45]:
# Hyper-parameters shared by the graph-building and training cells.
lstm_size = 256        # size passed to each BasicLSTMCell
lstm_layers = 1        # number of stacked LSTM layers
batch_size = 500       # samples per batch; also fixes the RNN state shape
learning_rate = 0.001  # step size handed to the Adam optimizer
# One extra slot because word ids start at 1 and id 0 is the padding value.
n_words = len(vocab_to_int) + 1

graph = tf.Graph()
with graph.as_default():
    # Both placeholders are rank 2 with unconstrained dimensions.
    inputs_ = tf.placeholder(dtype=tf.int32, shape=[None, None], name='inputs')
    labels_ = tf.placeholder(dtype=tf.int32, shape=[None, None], name='labels')
    # Scalar keep probability used by the dropout wrapper.
    keep_prob = tf.placeholder(dtype=tf.float32, name='keep_prob')

### Embedding

In [46]:
embed_size = 300
with graph.as_default():
    # Trainable lookup table with one embed_size-wide row per word id,
    # initialised with uniform random values between -1 and 1.
    _embedding_init = tf.random_uniform(
        shape=(n_words, embed_size), minval=-1, maxval=1)
    embedding = tf.Variable(_embedding_init)
    # Replace every word id in the input batch by its embedding row.
    embed = tf.nn.embedding_lookup(embedding, inputs_)
    

### LSTM

In [None]:
with graph.as_default():
    # Build one LSTM + dropout cell per layer. `[drop] * lstm_layers` would
    # put the very same cell object into every layer, which breaks as soon
    # as lstm_layers > 1; a fresh cell per layer keeps the layers separate.
    _layer_cells = []
    for _ in range(lstm_layers):
        lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        # Dropout is applied to the cell output only.
        drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
        _layer_cells.append(drop)
    cell = tf.contrib.rnn.MultiRNNCell(_layer_cells)
    # All-zero starting state for a batch of batch_size sequences.
    initial_state = cell.zero_state(batch_size, tf.float32)

### RNN Forward Pass
- `outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state)`

In [58]:
with graph.as_default():
    # Run the stacked cell over the embedded sequences, starting from
    # initial_state; returns the per-step outputs and the last state.
    outputs, final_state = tf.nn.dynamic_rnn(
        cell, inputs=embed, initial_state=initial_state)
    

### Output
- grab the last output with `outputs[:, -1]`, then calculate the cost from that and labels_

In [59]:
with graph.as_default():
    # Only the output of the final time step is fed to the classifier.
    _last_output = outputs[:, -1]
    # Single sigmoid unit: one score between 0 and 1 per sequence.
    predictions = tf.contrib.layers.fully_connected(
        _last_output, num_outputs=1, activation_fn=tf.sigmoid)
    # Mean squared error between the labels and the sigmoid scores.
    cost = tf.losses.mean_squared_error(labels=labels_, predictions=predictions)
    # `optimizer` is the training op: running it applies one Adam step.
    _adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
    optimizer = _adam.minimize(cost)

In [60]:
with graph.as_default():
    # Round the sigmoid score to a hard 0/1 decision and compare with labels.
    _rounded = tf.cast(tf.round(predictions), tf.int32)
    correct_pred = tf.equal(_rounded, labels_)
    # Mean of the 0/1 hits, i.e. the fraction of correct predictions.
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [63]:
def get_batches(x, y, batch_size=batch_size):
    """Yield consecutive ``(x, y)`` slices of exactly ``batch_size`` items.

    Only full batches are produced: trailing samples that do not fill a
    complete batch are dropped.
    """
    full_batches = len(x) // batch_size
    for batch_no in range(full_batches):
        start = batch_no * batch_size
        stop = start + batch_size
        yield x[start:stop], y[start:stop]
        

### Training

In [None]:
epochs = 10
with graph.as_default():
    saver = tf.train.Saver()
# Make sure the checkpoint directory exists before the first save.
os.makedirs('checkpoints', exist_ok=True)
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        # Every epoch starts again from the all-zero RNN state.
        state = sess.run(initial_state)
        for x, y in get_batches(train_x, train_y):
            feed = {inputs_: x,
                    # labels_ is rank 2, so add a trailing axis to the 1-D labels.
                    labels_: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            # The final state of this batch is fed as the start state of the next.
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            if iteration % 5 == 0:
                print('Epoch:{}/{}'.format(e, epochs),
                      'Iteration:{}'.format(iteration),
                      'Train loss:{:.3f}'.format(loss))
            if iteration % 25 == 0:
                val_acc = []
                # Reuse the existing zero-state tensors instead of calling
                # cell.zero_state() here, which would add new ops to the
                # graph on every validation pass.
                val_state = sess.run(initial_state)
                # Separate names so the training batch (x, y) is not rebound.
                for val_batch_x, val_batch_y in get_batches(val_x, val_y, batch_size):
                    val_feed = {inputs_: val_batch_x,
                                labels_: val_batch_y[:, None],
                                keep_prob: 1,  # no dropout during evaluation
                                initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state],
                                                    feed_dict=val_feed)
                    val_acc.append(batch_acc)
                print('Val acc: {:.3f}'.format(np.mean(val_acc)))
            iteration += 1
        # Checkpoint once at the end of every epoch.
        saver.save(sess, 'checkpoints/setiment.ckpt')

Epoch:0/10 Iteration:5 Train loss:0.245
Epoch:0/10 Iteration:10 Train loss:0.238
Epoch:0/10 Iteration:15 Train loss:0.220
Epoch:0/10 Iteration:20 Train loss:0.189
Epoch:0/10 Iteration:25 Train loss:0.221
Val acc: 0.695
Epoch:0/10 Iteration:30 Train loss:0.205
Epoch:0/10 Iteration:35 Train loss:0.182


### Testing

In [None]:
test_acc = []
with tf.Session(graph=graph) as sess:
    # Load the most recent checkpoint written by the training cell.
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    # Reuse the graph's zero-state tensors rather than adding new ops.
    test_state = sess.run(initial_state)
    for x, y in get_batches(test_x, test_y, batch_size):
        feed = {inputs_: x,
                # labels_ is a rank-2 placeholder and accuracy compares it with
                # (batch, 1) predictions, so the 1-D labels need a trailing
                # axis, exactly as in training and validation.
                labels_: y[:, None],
                keep_prob: 1,  # no dropout during evaluation
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print('Test accuracy: {:.3f}'.format(np.mean(test_acc)))