In [2]:
import time
from collections import namedtuple
import numpy as np
import tensorflow as tf

In [6]:
with open('anna.txt', 'r') as file:
    text = file.read()
vocab = set(text)
vocab2int = {c:i for i,c in enumerate(vocab)}
int2vocab = dict(enumerate(vocab))
encoded = np.array([vocab2int[c] for c in text], dtype=np.int32)

In [7]:
text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

In [8]:
encoded[:100]

array([47, 64, 65, 80, 45, 46, 36, 73, 20, 72, 72, 72, 75, 65, 80, 80, 78,
       73, 56, 65, 69, 81, 19, 81, 46, 41, 73, 65, 36, 46, 73, 65, 19, 19,
       73, 65, 19, 81, 55, 46, 58, 73, 46, 34, 46, 36, 78, 73, 70, 14, 64,
       65, 80, 80, 78, 73, 56, 65, 69, 81, 19, 78, 73, 81, 41, 73, 70, 14,
       64, 65, 80, 80, 78, 73, 81, 14, 73, 81, 45, 41, 73, 11, 74, 14, 72,
       74, 65, 78,  2, 72, 72,  3, 34, 46, 36, 78, 45, 64, 81, 14], dtype=int32)

In [9]:
len(vocab)

83

In [10]:
def get_batches(arr, n_seqs, num_steps):
    '''Create a generator that returns batches of size
       n_seqs x n_steps from arr.
       
       Arguments
       ---------
       arr: Array you want to make batches from
       n_seqs: Batch size, the number of sequences per batch
       n_steps: Number of sequence length(steps) per batch
    '''
    # Get the batch size and number of batches we can make
    batch_size = n_seqs*num_steps
    n_batches = len(arr)//batch_size
    
    # Keep only enough characters to make full batches
    arr = arr[:batch_size*n_batches]
    
    # Reshape into n_seqs rows
    arr = np.reshape(arr, [n_seqs,n_batches*num_steps])
    
    for n in range(0, arr.shape[1], num_steps):
        # The features
        x = arr[:n_seqs, 0+n*num_steps:num_steps+n*num_steps]
        # The targets, shifted by one
        y = np.zeros_like(x)
        y[:,:-1], y[:,-1] = x[:,1:], x[:,0]
        yield x, y

In [11]:
batches = get_batches(encoded, 10, 50)
x, y = next(batches)

In [12]:
print('x\n', x[:10, :10])
print('\ny\n', y[:10, :10])

x
 [[47 64 65 80 45 46 36 73 20 72]
 [73 65 69 73 14 11 45 73 26 11]
 [34 81 14  2 72 72 22 12 46 41]
 [14 73 21 70 36 81 14 26 73 64]
 [73 81 45 73 81 41 43 73 41 81]
 [73 44 45 73 74 65 41 72 11 14]
 [64 46 14 73  7 11 69 46 73 56]
 [58 73  8 70 45 73 14 11 74 73]
 [45 73 81 41 14 42 45  2 73 57]
 [73 41 65 81 21 73 45 11 73 64]]

y
 [[64 65 80 45 46 36 73 20 72 72]
 [65 69 73 14 11 45 73 26 11 81]
 [81 14  2 72 72 22 12 46 41 43]
 [73 21 70 36 81 14 26 73 64 81]
 [81 45 73 81 41 43 73 41 81 36]
 [44 45 73 74 65 41 72 11 14 19]
 [46 14 73  7 11 69 46 73 56 11]
 [73  8 70 45 73 14 11 74 73 41]
 [73 81 41 14 42 45  2 73 57 64]
 [41 65 81 21 73 45 11 73 64 46]]


# Building the model
<br>
# RNN Input

In [22]:
def build_inputs(batch_size, num_steps):
    ''' Define placeholders for inputs, targets, and dropout 
    
        Arguments
        ---------
        batch_size: Batch size, number of sequences per batch
        num_steps: Number of sequence steps in a batch
        
    '''
    # Declare placeholders we'll feed into the graph
    inputs = tf.placeholder(tf.int32, shape=(batch_size,num_steps), name='inputs')
    targets = tf.placeholder(tf.int32, shape=(batch_size,num_steps), name='targets')
    
    # Keep probability placeholder for drop out layers
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    
    return inputs, targets, keep_prob

# LSTM Cell

In [25]:
def build_lstm(lstm_size, num_layers, batch_size, keep_prob):
    ''' Build LSTM cell.
    
        Arguments
        ---------
        keep_prob: Scalar tensor (tf.placeholder) for the dropout keep probability
        lstm_size: Size of the hidden layers in the LSTM cells
        num_layers: Number of LSTM layers
        batch_size: Batch size 用来生成initial_state, 此例中详见下图

    '''
    ### Build the LSTM Cell
    # Use a basic LSTM cell
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    
    # Add dropout to the cell outputs
    drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob = keep_prob)
    
    # Stack up multiple LSTM layers, for deep learning
    cell = tf.contrib.rnn.MultiRNNCell([drop for _ in range(num_layers)])
    
    # batch_size: 一个batch的总字符数
    initial_state = cell.zero_state(batch_size, tf.float32) 
    
    return cell, initial_state

## RNN Output

In [109]:
def build_output(lstm_output, in_size, out_size):
    ''' Build a softmax layer, return the softmax output and logits.
    
        Arguments
        ---------
        
        lstm_output: List of output tensors from the LSTM layer
        # outputs: [batch_size, max_time(n_steps), hidden units(lstm_size)]
        in_size: Size of the input tensor, for example, size of the LSTM cells 
        # lstm_size 
        out_size: Size of this softmax layer
        # softmax layer应该和text中出现的字母种类数保持一致, num_classes
    '''

    # Reshape output so it's a bunch of rows, one row for each step for each sequence.
    # Concatenate lstm_output over axis 1 (the columns)
    print('\n lstm_output ==>\n',lstm_output)
    seq_output = tf.concat(lstm_output, axis=1)
    print('\n seq_output ==>\n',seq_output)
    # Reshape seq_output to a 2D tensor with lstm_size columns
    # reshape中的-1表示推断, 即给出其中一个维度参数的情况下, 推断出另一个维度应该是多少
    # in_size表示LSTM cell的个数, 这一步的输出应该是每层cell的输出列成一列
    x = tf.reshape(seq_output, [-1,in_size]) 
    print('\n seq_output.reshape\n',x)
    # Connect the RNN outputs to a softmax layer
    with tf.variable_scope('softmax'):
        # Create the weight and bias variables here
        softmax_w = tf.Variable(tf.truncated_normal([in_size,out_size], stddev=0.1, dtype=tf.float32))
        softmax_b = tf.Variable(tf.zeros(out_size))
    
    # Since output is a bunch of rows of RNN cell outputs, logits will be a bunch
    # of rows of logit outputs, one for each step and sequence
    logits = tf.matmul(x, softmax_w) + softmax_b
    
    # Use softmax to get the probabilities for predicted characters
    out = tf.nn.softmax(logits, name='prediction')
    
    return out, logits

# Training loss

In [63]:
def build_loss(logits, targets, lstm_size, num_classes):
    ''' Calculate the loss from the logits and the targets.
    
        Arguments
        ---------
        logits: Logits from final fully connected layer
        targets: Targets for supervised learning
        lstm_size: Number of LSTM hidden units
        num_classes: Number of classes in targets
        
    '''
    
    # One-hot encode targets and reshape to match logits, one row per sequence per step
    y_one_hot = tf.one_hot(targets, num_classes)
    print('\ny_one_hot ==>\n',y_one_hot)
    y_reshaped = tf.reshape(y_one_hot, shape=(logits.shape))
    print('\ny_reshaped ==>\n',y_reshaped)
    # Softmax cross entropy loss
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_reshaped)
    print('\nloss\n ==>',loss)
    loss = tf.reduce_mean(loss)
    print('\nloss_mean\n ==>',loss)
    return loss

# Optimizer

In [101]:
def build_optimizer(loss, learning_rate, grad_clip):
    ''' Build optmizer for training, using gradient clipping.
    
        Arguments:
        loss: Network loss
        learning_rate: Learning rate for optimizer
    
    '''
    
    # Optimizer for training, using gradient clipping to control exploding gradients
    
    # 只要Variable变量的 trainable = True(默认为True), 则tf.trainable_variables()会收集所有Variable类的实例(变量).
    tvars = tf.trainable_variables()
#     print('\ntf.gradients(loss, tvars)\n',tf.gradients(loss, tvars))
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), grad_clip)
#     print('\ngrads\n',grads)
    train_op = tf.train.AdamOptimizer(learning_rate)
#     print('\ntrain_op\n',train_op)
    optimizer = train_op.apply_gradients(zip(grads, tvars))
    
    return optimizer

# Build the Network

In [85]:
class CharRNN:
    
    def __init__(self, num_classes, batch_size=64, num_steps=50, 
                       lstm_size=128, num_layers=2, learning_rate=0.001, 
                       grad_clip=5, sampling=False):
    
        # When we're using this network for sampling later, we'll be passing in
        # one character at a time, so providing an option for that
        if sampling == True:
            batch_size, num_steps = 1, 1
        else:
            batch_size, num_steps = batch_size, num_steps
            
        # Reset graph    
        tf.reset_default_graph()
        
        #build input placeholder
        self.inputs, self.targets, self.keep_prob = build_inputs(batch_size, num_steps) 
        
        # Build the LSTM cell
        self.lstm, self.initial_state = build_lstm(lstm_size, num_layers, batch_size, self.keep_prob)
        
        ### Run the data through the RNN layer
        # one-hot encode the input token
        x_one_hot = tf.one_hot(self.inputs, num_classes)
        self.x_one_hot = x_one_hot
        
        # Run each sequence step through RNN with tf.nn.dynamic_run
        outputs, state = tf.nn.dynamic_rnn(self.lstm, x_one_hot, initial_state=self.initial_state)
        self.final_state = state
        self.outputs = outputs
        
        # Get softmax prediction and logits
        self.prediction, self.logits = build_output(outputs, lstm_size, num_classes)
        
        # Loss and optimizer (with gradient clipping)
        self.loss =  build_loss(self.logits, self.targets, lstm_size, num_classes)
        self.optimizer = build_optimizer(self.loss, learning_rate, grad_clip)

# Hyperparameters

In [30]:
batch_size = 10
num_steps = 50
lstm_size = 128
num_layers = 2
learning_rate = 0.01
keep_prob = 0.5

# Time for training

In [110]:
epochs = 20
# Save every N iterations
save_every_n = 200

model = CharRNN(len(vocab), batch_size=batch_size, num_steps=num_steps,
                lstm_size=lstm_size, num_layers=num_layers, 
                learning_rate=learning_rate)

saver = tf.train.Saver(max_to_keep=100)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    # Use the line below to load a checkpoint and resume training
#     saver.restore(sess, 'checkpoints/i78200_l128.ckpt')
    counter = 0
    for e in range(epochs):
        # Train network
        new_state = sess.run(model.initial_state)
        loss = 0
        for x, y in get_batches(encoded, batch_size, num_steps):
            counter += 1
            start = time.time()
            feed = {model.inputs: x,
                    model.targets: y,
                    model.keep_prob: keep_prob,
                    model.initial_state: new_state}
            batch_loss, new_state, _ = sess.run([model.loss, 
                                                 model.final_state, 
                                                 model.optimizer], 
                                                 feed_dict=feed)
            
            end = time.time()
            print('Epoch: {}/{}... '.format(e+1, epochs),
                  'Training Step: {}... '.format(counter),
                  'Training loss: {:.4f}... '.format(batch_loss),
                  '{:.4f} sec/batch'.format((end-start)))
        
            if (counter % save_every_n == 0):
                saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))
    
    saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))


 lstm_output ==>
 Tensor("rnn/transpose:0", shape=(10, 50, 128), dtype=float32)

 seq_output ==>
 Tensor("concat:0", shape=(10, 50, 128), dtype=float32)

 seq_output.reshape
 Tensor("Reshape:0", shape=(500, 128), dtype=float32)
y_one_hot ==> Tensor("one_hot_1:0", shape=(10, 50, 83), dtype=float32)

new_state ==>
 (LSTMStateTuple(c=array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32), h=array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32)), LSTMStateTuple(c=array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  