## LSTM (character-wise) Language modeling  

##### Author: Wangjin Lee, Seoul National University. jinsamdol@snu.ac.kr  
http://nlpway.wordpress.com  

##### References  
* Tomas Mikolov et al. "Recurrent neural network based language model". INTERSPEECH 2010.  
* Martin Sundermeyer, Ralf Schluter, and Hermann Ney. "LSTM Neural networks for language modeling"  
* Andrej Karpathy. "The unreasonable effectiveness of recurrent neural networks". 2015.  
* Andrej Karpathy. char-rnn. https://github.com/karpathy/char-rnn#tips-and-tricks  
  
This code is a simple character-level LSTM language model that, given a three-character input sequence, predicts the three-character sequence shifted two positions ahead (e.g., `abc` → `cde`).  
The vocabulary is a set of lowercase letters, which are fed to the graph as one-hot encoded sequences.  
  
Some notable points are as follows:  
* the source and target (x, y) must have the same length.  
* the loss is the cross-entropy between the predicted distribution (computed from the logits) and the real target value.  
* gradient clipping is used to prevent exploding gradients.  

In [62]:
import tensorflow as tf

In [63]:
# Training corpus: every 5-character sliding window over the letters 'a'..'r'
# ('abcde', 'bcdef', ..., 'nopqr'), 14 windows in total.
source = ['abcdefghijklmnopqr'[start:start + 5] for start in range(14)]

In [64]:
# Vocabulary: the 18 lowercase letters 'a'..'r', one list entry per character.
voca = list('abcdefghijklmnopqr')

In [66]:
# Character -> integer id, using each character's position in `voca`.
vocab_to_int = {ch: idx for idx, ch in enumerate(voca)}
# Integer id -> character: the inverse of the mapping above.
int_to_vocab = {idx: ch for ch, idx in vocab_to_int.items()}

In [67]:
import numpy as np
# Map every character of every source string to its integer id.
encoded_rows = [[vocab_to_int[ch] for ch in word] for word in source]
print(encoded_rows)

# One row per source string, one column per character position.
encoded_source = np.array(encoded_rows, dtype=np.int32)
print(encoded_source)

[[0, 1, 2, 3, 4], [1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [3, 4, 5, 6, 7], [4, 5, 6, 7, 8], [5, 6, 7, 8, 9], [6, 7, 8, 9, 10], [7, 8, 9, 10, 11], [8, 9, 10, 11, 12], [9, 10, 11, 12, 13], [10, 11, 12, 13, 14], [11, 12, 13, 14, 15], [12, 13, 14, 15, 16], [13, 14, 15, 16, 17]]
[[ 0  1  2  3  4]
 [ 1  2  3  4  5]
 [ 2  3  4  5  6]
 [ 3  4  5  6  7]
 [ 4  5  6  7  8]
 [ 5  6  7  8  9]
 [ 6  7  8  9 10]
 [ 7  8  9 10 11]
 [ 8  9 10 11 12]
 [ 9 10 11 12 13]
 [10 11 12 13 14]
 [11 12 13 14 15]
 [12 13 14 15 16]
 [13 14 15 16 17]]


In [68]:
# Hyperparameters consumed by the graph-construction cell.
batch_size = 1   # first dimension of the input/output placeholders and of the initial LSTM state
rnn_size = 20    # number of units in each BasicLSTMCell
kp = 0.6         # keep probability given to DropoutWrapper(output_keep_prob=...)
grad_clip = 2    # clip_norm passed to tf.clip_by_global_norm

In [69]:
tf.reset_default_graph()
graph = tf.Graph()
with graph.as_default():
    # Integer character ids, shape [batch_size, time]. The time axis is left
    # dynamic (None) so sequences of any length can be fed.
    x = tf.placeholder(tf.int32, [batch_size, None], name='input')
    y = tf.placeholder(tf.int32, [batch_size, None], name='output')

    # Dropout keep probability. It defaults to `kp`, so running the graph
    # without feeding it behaves exactly as a hard-coded `kp` would; feeding
    # 1.0 switches dropout off (e.g. when predicting from a restored graph).
    keep_prob = tf.placeholder_with_default(
        tf.constant(kp, dtype=tf.float32), shape=[], name='keep_prob')

    # [batch_size, time] ids -> [batch_size, time, len(voca)] one-hot vectors.
    x_one_hot = tf.one_hot(x, len(voca))

    def build_cell(rnn_size, kp):
        """Return a BasicLSTMCell with `rnn_size` units wrapped in output dropout.

        `kp` is the output keep probability; it may be a Python float or a
        scalar float tensor.
        """
        lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
        return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=kp)

    # Hidden layer: two stacked LSTM cells unrolled over the time axis.
    with tf.variable_scope('lstm_cell'):
        cell = tf.contrib.rnn.MultiRNNCell([build_cell(rnn_size, keep_prob) for _ in range(2)])
        initial_state = cell.zero_state(batch_size, tf.float32)

        # `outputs` is a single tensor of shape [batch_size, time, rnn_size].
        outputs, final_state = tf.nn.dynamic_rnn(cell, x_one_hot, initial_state=initial_state)

    # Flatten to one row per (sequence, time step) so a single matmul scores
    # every step. `outputs` is already one tensor, so no concatenation is needed.
    seq_output = tf.reshape(outputs, [-1, rnn_size])

    # Prediction: linear projection to vocabulary size followed by softmax.
    with tf.variable_scope('prediction'):
        softmax_w = tf.Variable(tf.truncated_normal((rnn_size, len(voca)), stddev=0.1))
        softmax_b = tf.Variable(tf.zeros(len(voca)))

        logits = tf.matmul(seq_output, softmax_w) + softmax_b
        # Named so it can be fetched as 'prediction/my_softmax:0' after restoring.
        prediction = tf.nn.softmax(logits, name='my_softmax')

    # Loss: mean cross-entropy between the logits and the one-hot targets.
    with tf.variable_scope('loss'):
        y_one_hot = tf.one_hot(y, len(voca))
        # One row per sequence per step, matching `logits`. The dynamic shape
        # tf.shape(logits) must be used here: the static logits.get_shape()
        # has an unknown first dimension and makes tf.reshape fail.
        y_reshape = tf.reshape(y_one_hot, tf.shape(logits))

        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_reshape))

    # Training op: Adam applied to gradients clipped by their global norm.
    with tf.variable_scope('optimizer'):
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), grad_clip)
        optimizer = tf.train.AdamOptimizer(learning_rate=0.001).apply_gradients(zip(grads, tvars))
        
        

In [72]:
print_step = 100  # report the mean epoch loss every `print_step` epochs
with tf.Session(graph = graph) as sess:
    sess.run(tf.global_variables_initializer())
    for e in range(2001):
        epoch_losses = []
        for single_source in encoded_source:
            # First three ids are the input, last three ids are the target
            # (each wrapped in a list to form a batch of one).
            # Distinct names are used so the global `source` list of strings
            # is not overwritten and the encoding cell stays re-runnable.
            input_ids = [single_source[:3]]
            target_ids = [single_source[2:]]
            # Fetch the loss and run one optimisation step; the train op
            # itself returns nothing useful.
            batch_loss, _ = sess.run([loss, optimizer], feed_dict={x: input_ids, y: target_ids})

            epoch_losses.append(batch_loss)
        if e % print_step == 0:
            print('Epoch: {}, loss: {}'.format(e, str(np.mean(epoch_losses))))

    # Persist the graph definition and the trained weights as ./lstm-LM.*
    saver = tf.train.Saver()
    saver.save(sess, './lstm-LM')
            

Epoch: 0, loss: 2.8926013
Epoch: 100, loss: 0.99215025
Epoch: 200, loss: 0.5175403
Epoch: 300, loss: 0.25197208
Epoch: 400, loss: 0.22179493
Epoch: 500, loss: 0.1121848
Epoch: 600, loss: 0.05812817
Epoch: 700, loss: 0.050513845
Epoch: 800, loss: 0.0913099
Epoch: 900, loss: 0.07065218
Epoch: 1000, loss: 0.03791935
Epoch: 1100, loss: 0.05409404
Epoch: 1200, loss: 0.06788659
Epoch: 1300, loss: 0.110768475
Epoch: 1400, loss: 0.07385433
Epoch: 1500, loss: 0.08666229
Epoch: 1600, loss: 0.03506421
Epoch: 1700, loss: 0.10255342
Epoch: 1800, loss: 0.04191437
Epoch: 1900, loss: 0.045337975
Epoch: 2000, loss: 0.043649424


In [74]:
tf.reset_default_graph()
restored_graph = tf.Graph()
with restored_graph.as_default():
    with tf.Session(graph = restored_graph) as sess:

        # Rebuild the graph from the saved meta file and load the trained weights.
        loader = tf.train.import_meta_graph('./lstm-LM.meta')
        loader.restore(sess, './lstm-LM')

        # Look the tensors up by name. Cell-local names are used so the
        # training graph's `x` placeholder and the global `source` list are
        # not overwritten (keeps the earlier cells re-runnable).
        input_tensor = restored_graph.get_tensor_by_name('input:0')
        my_prediction = restored_graph.get_tensor_by_name('prediction/my_softmax:0')

        # Dropout should be inactive while predicting. If the restored graph
        # has a 'keep_prob' tensor, feed 1.0 to it; a graph saved without one
        # is run unchanged.
        base_feed = {}
        try:
            base_feed[restored_graph.get_tensor_by_name('keep_prob:0')] = 1.0
        except KeyError:
            pass

        for single_source in encoded_source:
            input_ids = single_source[:3]
            feed = dict(base_feed)
            feed[input_tensor] = [input_ids]  # batch of one sequence
            # One row of vocabulary probabilities per input time step.
            probs = sess.run(my_prediction, feed_dict=feed)

            print("input: " + ''.join(int_to_vocab[i] for i in input_ids))
            # Most probable character at every step.
            predicted_ids = np.argmax(probs, axis=1)
            print("output: " + ''.join(str(int_to_vocab[i]) for i in predicted_ids))
                
        

INFO:tensorflow:Restoring parameters from ./lstm-LM
input: abc
output: cde
input: bcd
output: def
input: cde
output: efg
input: def
output: fgh
input: efg
output: ghi
input: fgh
output: hij
input: ghi
output: ijk
input: hij
output: jkl
input: ijk
output: klm
input: jkl
output: lmn
input: klm
output: mno
input: lmn
output: nop
input: mno
output: opq
input: nop
output: pqr
