In [None]:
import tensorflow as tf
import numpy as np
import collections
import os
import argparse
import datetime as dt

from collections import Counter
from random import random
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu

In [3]:
def _load_sentences(path):
    """Return the stripped, non-empty lines of the text file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the lines have been read.
    """
    with open(path) as handle:
        stripped = [line.strip() for line in handle]
    return [line for line in stripped if line]


# Penn Treebank splits: one sentence per line.
train_sentences = _load_sentences("LSTM/LSTM/simple-examples/data/ptb.train.txt")
val_sentences = _load_sentences("LSTM/LSTM/simple-examples/data/ptb.valid.txt")
test_sentences = _load_sentences("LSTM/LSTM/simple-examples/data/ptb.test.txt")

# Number of sentences in each split.
for split in (train_sentences, val_sentences, test_sentences):
    print(len(split))

# First three training sentences, as a quick sanity check of the raw text.
for example in train_sentences[:3]:
    print(example)

42068
3370
3761
aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter
pierre <unk> N years old will join the board as a nonexecutive director nov. N
mr. <unk> is chairman of <unk> n.v. the dutch publishing group


In [4]:
# Lower-case each training sentence, tokenize it with NLTK and wrap the
# resulting tokens in <SOS> ... <EOS> markers.
sentences = []
for raw_sentence in train_sentences:
    tokens = word_tokenize(raw_sentence.lower())
    sentences.append(["<SOS>"] + tokens + ["<EOS>"])
print("done")

In [5]:
# After tokenisation each "<unk>" shows up as the three tokens "<", "unk", ">"
# (see the printed sentence below). Drop the "<" and "unk" pieces so that a lone
# ">" token is what remains for every unknown word.
#
# The token lists are filtered in place, so every occurrence is removed no
# matter how many a sentence contains, and no exception has to be swallowed.
_UNK_FRAGMENTS = ("<", "unk")
for sen in sentences:
    sen[:] = [tok for tok in sen if tok not in _UNK_FRAGMENTS]

print(sentences[2])

['<SOS>', 'mr.', '>', 'is', 'chairman', 'of', '>', 'n.v.', 'the', 'dutch', 'publishing', 'group', '<EOS>']


In [6]:
# Character-level vocabulary. The position of each symbol in this list is its
# index in the one-hot encoding, so the order must not change.
vocabulary = (
    ["<SOS>"]
    + list("abcdefghijklmnopqrstuvwxyz")
    + ["<EOW>", "<EOS>"]
    + list(">-.'")
    + list("0123456789")
    + list("&<$#/,|@%^\\*(){}:;")
)

vocabulary_size = len(vocabulary)

# Lookup tables in both directions, plus one one-hot row per symbol.
token2index = dict(zip(vocabulary, range(vocabulary_size)))
index2token = dict(enumerate(vocabulary))
one_hot_embeddings = np.identity(vocabulary_size)

# Spot checks: index of "z", symbol at index 1, one-hot row of the backslash.
print(token2index.get("z"))
print(index2token.get(1))
print(one_hot_embeddings[token2index.get("\\")])

26
a
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.
  0.  0.  0.  0.  0.  0.  0.]


In [7]:
# Scan the tokenised sentences for the one with the most tokens.
# `max_word_length` is the token count of that sentence and `maxid` is its
# index; on ties the earliest sentence wins.
max_word_length, maxid = 0, 0
for idx, tokens in enumerate(sentences):
    n_tokens = len(tokens)
    if n_tokens > max_word_length:
        max_word_length, maxid = n_tokens, idx

print(max_word_length)
print(maxid)

84
29099


In [20]:
def convert_tensor(arg):
    """Convert ``arg`` to a TensorFlow tensor with dtype ``tf.int32``."""
    int_tensor = tf.convert_to_tensor(arg, dtype=tf.int32)
    return int_tensor

def embed_producer(sentences, max_char_len=494):
    """Encode tokenised sentences as padded character-level one-hot arrays.

    Each sentence is laid out as a sequence of character slots:

    * ``"<SOS>"`` / ``"<EOS>"`` occupy one slot holding their own one-hot row;
    * ``">"`` occupies one slot for ``">"`` followed by one ``"<EOW>"`` slot;
    * any other word occupies one slot per character followed by one
      ``"<EOW>"`` slot.

    Unused slots stay all-zero.

    Reads the notebook globals ``vocabulary_size``, ``one_hot_embeddings``,
    ``token2index`` and ``max_word_length``.

    Args:
        sentences: sequence of token lists.
        max_char_len: number of character slots per sentence (default 494,
            the value used throughout the notebook).

    Returns:
        A tuple ``(s_tensor, word_loc_all, eow_loc_all)`` of float arrays:

        * ``s_tensor`` -- shape ``(len(sentences), max_char_len,
          vocabulary_size)``, the one-hot rows.
        * ``word_loc_all`` -- shape ``(len(sentences), max_word_length)``;
          entry ``k`` is the running character count after token ``k``, where
          special tokens and ``">"`` count as 1 and ``"<EOW>"`` slots are not
          counted. Entries past the end of the sentence are 0.
        * ``eow_loc_all`` -- shape ``(len(sentences), max_char_len)``; holds a
          1 at the slot index immediately after the last slot of each token
          (i.e. the slot where the next token starts).

    Raises:
        KeyError: a word contains a character that is not in ``token2index``.
        IndexError: a sentence has more tokens than ``max_word_length`` or
            needs more slots than ``max_char_len`` provides.
    """
    sentence_markers = ("<SOS>", "<EOS>")
    eow_row = one_hot_embeddings[token2index["<EOW>"]]

    n_sentences = len(sentences)
    s_tensor = np.zeros((n_sentences, max_char_len, vocabulary_size))
    word_loc_all = np.zeros((n_sentences, max_word_length))
    eow_loc_all = np.zeros((n_sentences, max_char_len))

    for i, sentence in enumerate(sentences):
        pos = 0           # next free character slot of this sentence
        chars_so_far = 0  # running character count, <EOW> slots excluded

        for k, word in enumerate(sentence):
            if word in sentence_markers:
                # Sentence markers are a single symbol with no <EOW> after them.
                s_tensor[i, pos] = one_hot_embeddings[token2index[word]]
                pos += 1
                width = 1
            elif word == ">":
                # The unknown-word marker is one symbol followed by <EOW>.
                s_tensor[i, pos] = one_hot_embeddings[token2index[word]]
                s_tensor[i, pos + 1] = eow_row
                pos += 2
                width = 1
            else:
                # Ordinary word: one slot per character, then <EOW>.
                for ch in word:
                    s_tensor[i, pos] = one_hot_embeddings[token2index[ch]]
                    pos += 1
                s_tensor[i, pos] = eow_row
                pos += 1
                width = len(word)

            # Flag the slot right after this token's last slot.
            eow_loc_all[i, pos] = 1

            # Cumulative word-end location (see Returns for what is counted).
            chars_so_far += width
            word_loc_all[i, k] = chars_so_far

    return s_tensor, word_loc_all, eow_loc_all
        

In [21]:
# Encode every tokenised training sentence: one-hot character tensor,
# cumulative word locations, and the per-slot word-boundary flags.
data, word_loc_all, eow_loc_all = embed_producer(sentences=sentences)
print("done")

done


In [22]:
# Show one encoded sentence next to its word-location and boundary-flag rows.
sample_idx = 4607
print(sentences[sample_idx])
print(word_loc_all[sample_idx])
print(eow_loc_all[sample_idx])

['<SOS>', 'downgraded', 'by', 'moody', "'s", 'were', 'houston', 'lighting', "'s", '>', 'bonds', 'and', 'secured', '>', 'bonds', 'to', 'single-a-3', 'from', 'single-a-2', 'unsecured', '>', 'bonds', 'to', '>', 'from', 'single-a-3', 'preferred', 'stock', 'to', 'single-a-3', 'from', 'single-a-2', 'a', 'shelf', 'registration', 'for', 'preferred', 'stock', 'to', 'a', 'preliminary', 'rating', 'of', 'single-a-3', 'from', 'a', 'preliminary', 'rating', 'of', 'single-a-2', 'two', 'shelf', '>', 'for', 'collateralized', 'debt', 'securities', 'to', 'a', 'preliminary', 'rating', 'of', 'single-a-3', 'from', 'a', 'preliminary', 'rating', 'of', 'single-a-2', 'and', 'the', 'unit', "'s", 'rating', 'for', 'commercial', 'paper', 'to', '>', 'from', '>', '<EOS>']
[   1.   11.   13.   18.   20.   24.   31.   39.   41.   42.   47.   50.
   57.   58.   63.   65.   75.   79.   89.   98.   99.  104.  106.  107.
  111.  121.  130.  135.  137.  147.  151.  161.  162.  167.  179.  182.
  191.  196.  198.  199.  210. 

In [None]:
# NOTE: this whole cell is a bare triple-quoted string, so none of the code in
# it runs. It holds a disabled experiment that built a per-slot mask `eow_pos`
# from `word_loc_all` and searched for the row with the largest word location.
'''
max_char_len=494
eow_pos = np.zeros((len(sentences),max_char_len))

for i in range(len(sentences)):
    for j in range(max_word_length):
        eow_pos[i,int(word_loc_all[i,j])] = 1
        
print(word_loc_all[29099])
print(eow_pos[29099])

maxN = 0
maxid = 0
for i in range(len(word_loc_all)):
    if max(word_loc_all[i]) > maxN:
        maxN = max(word_loc_all[i])
        print(maxN)
        maxid = i
    
print(maxN)
print(maxid)
'''

In [63]:
# Start from an empty default graph so re-running this cell does not stack
# duplicate ops and variables on top of the previous run.
tf.reset_default_graph()
max_char_len = 494
batch_size = 52
# One input feature per vocabulary symbol (the last axis of the encoded data).
input_size = vocabulary_size
hidden_size = 20

# The fed batch is [batch_size, max_char_len, input_size]. It is transposed to
# time-major [max_char_len, batch_size, input_size] and unstacked into a
# TensorArray holding one [batch_size, input_size] slice per time step.
inputs = tf.placeholder(tf.float32, [batch_size, max_char_len, input_size])
inputs_t = tf.transpose(inputs, perm=[1, 0, 2])
_inputs_ta = tf.TensorArray(dtype=tf.float32, size=max_char_len, name='char_array')
_inputs_ta = _inputs_ta.unstack(inputs_t)

cell = tf.contrib.rnn.LSTMCell(hidden_size)

# Per-time-step accumulators written by loop_fn: sampled state, mean, and
# log-variance.
output_ta = tf.TensorArray(size=max_char_len, dtype=tf.float32, name='word_array')
mean_ta = tf.TensorArray(size=max_char_len, dtype=tf.float32, name='mean_array')
sigma_ta = tf.TensorArray(size=max_char_len, dtype=tf.float32, name='sigma_array')

# Per-slot word-boundary flags for the batch (fed with rows of eow_loc_all).
word_pos = tf.placeholder(tf.float32, [batch_size, max_char_len])
# NOTE(review): word_pos is already a float32 tensor, so this conversion looks
# redundant -- confirm before removing.
word_pos = tf.convert_to_tensor(word_pos, dtype=tf.float32)

# create loop_fn for raw_rnn
def loop_fn(time, cell_output, cell_state, loop_state):
    """Step callback passed to ``tf.nn.raw_rnn``.

    On the initial call (``cell_output is None``) it supplies the cell's zero
    state and the three empty TensorArrays (sample, mean, log-variance) as the
    loop state.

    On every later call it:

    1. takes column ``time - 1`` of ``word_pos`` and repeats it across the
       hidden units to get a ``[batch_size, hidden_size]`` mask;
    2. masks ``cell_output`` with it and feeds the result through a fully
       connected layer of width ``2 * hidden_size``, split into a mean half
       and a log-variance half;
    3. draws ``z_sample = mean + sqrt(exp(log_var)) * eps`` with standard
       normal ``eps``;
    4. masks sample, mean and log-variance and writes them at index
       ``time - 1`` of the loop-state TensorArrays;
    5. builds the next LSTM state: the ``c`` part is the masked sample plus
       the previous ``c`` wherever the mask is zero, the ``h`` part is
       ``cell_output``.

    Reads the notebook globals ``cell``, ``batch_size``, ``hidden_size``,
    ``max_char_len``, ``word_pos``, ``_inputs_ta``, ``output_ta``, ``mean_ta``
    and ``sigma_ta``.

    Returns:
        ``(elements_finished, next_input, next_cell_state, emit_output,
        next_loop_state)`` as expected by ``tf.nn.raw_rnn``.
    """
    emit_output = cell_output  # None on the initial call (time == 0)

    if cell_output is None:
        next_cell_state = cell.zero_state(batch_size, tf.float32)
        next_loop_state = (output_ta, mean_ta, sigma_ta)

    else:
        # Boundary flags of the step that produced cell_output, repeated across
        # the hidden units: [batch_size] -> [batch_size, hidden_size].
        boundary_flags = tf.expand_dims(word_pos[:, time - 1], 1)
        word_slice = tf.tile(boundary_flags, [1, hidden_size])
        next_sampled_input = tf.multiply(cell_output, word_slice)

        # reparametrization
        # NOTE(review): fully_connected is called with its default activation
        # (ReLU according to the tf.contrib.layers docs), which would make both
        # the mean and the log-variance non-negative -- confirm this is intended.
        z_concat = tf.contrib.layers.fully_connected(next_sampled_input, 2 * hidden_size)
        z_mean = z_concat[:, :hidden_size]
        z_log_sigma_sq = z_concat[:, hidden_size:2 * hidden_size]
        eps = tf.random_normal((batch_size, hidden_size), 0, 1, dtype=tf.float32)
        z_sample = tf.add(z_mean, tf.multiply(tf.sqrt(tf.exp(z_log_sigma_sq)), eps))

        # Zero everything for batch rows whose flag is 0 at this step.
        z_sample = tf.multiply(z_sample, word_slice)
        z_mean = tf.multiply(z_mean, word_slice)
        z_log_sigma_sq = tf.multiply(z_log_sigma_sq, word_slice)

        next_loop_state = (
            loop_state[0].write(time - 1, z_sample),
            loop_state[1].write(time - 1, z_mean),
            loop_state[2].write(time - 1, z_log_sigma_sq),
        )

        # Complement of the mask: 1 where the flag is 0, so those rows keep
        # their previous cell state instead of the (zeroed) sample.
        keep_mask = tf.cast(tf.logical_not(tf.cast(word_slice, dtype=tf.bool)),
                            dtype=tf.float32)
        next_c = z_sample + tf.multiply(cell_state[0], keep_mask)
        next_cell_state = tf.contrib.rnn.LSTMStateTuple(next_c, cell_output)

    # Stop once `time` reaches the last slot; the input for `time` is still
    # read here because raw_rnn expects a next_input on every call.
    elements_finished = (time >= max_char_len - 1)
    next_input = _inputs_ta.read(time)

    return (elements_finished, next_input, next_cell_state, emit_output, next_loop_state)

# Build the recurrent graph with the custom step function, then stack each
# TensorArray (sample, mean, log-variance, emitted outputs) into one tensor.
outputs_ta, final_state, word_state = tf.nn.raw_rnn(cell, loop_fn)
word_state_out, mean_state_out, sigma_state_out = [
    state_ta.stack() for state_ta in word_state
]
outputs = outputs_ta.stack()


In [64]:
# Run configuration. batch_size is set before num_batches is derived from it so
# the cell does not depend on the value left over from an earlier cell.
batch_size = 52
max_char_len = 494
hidden_size = 20
learning_rate = 0.01
input_size = vocabulary_size
num_batches = len(data) // batch_size

init_op = tf.global_variables_initializer()

# Graph tensors to evaluate. They are collected once, and the results below are
# stored under different names (`outputs_val`, `final_state_val`), so that the
# graph tensors `outputs` and `final_state` are never overwritten by NumPy
# results. Otherwise a second batch, or re-running this cell, would pass arrays
# to sess.run as fetches.
fetches = [outputs, final_state, word_state_out, mean_state_out, sigma_state_out]

with tf.Session() as sess:
    sess.run([init_op])
    for epoch in range(1):
        epoch_error = 0

        # Only the first batch is evaluated for now.
        for bt in range(1):
            batch_rows = slice(bt * batch_size, (bt + 1) * batch_size)
            x = data[batch_rows]
            word_pos_batch = eow_loc_all[batch_rows]
            (outputs_val, final_state_val,
             word_state, mean_state, sigma_state) = sess.run(
                fetches,
                feed_dict={inputs: x, word_pos: word_pos_batch})
            
        

In [60]:
#print(word_state.shape)
#print(mean_state.shape)
#print(sigma_state.shape)
#word_state = word_state[1]
#print(word_state.shape)
for i in range(20):
    #print(word_state[6,:,i])
    #print(mean_state[2,:,i])
    #print(sigma_state[3,:,i])
    # Every statement in the loop body is commented out; `pass` keeps the cell
    # syntactically valid so it no longer fails with an IndentationError.
    pass

#print(sigma_state[1][8])

[ 0.          0.          0.          0.          0.13253187  0.          0.
  0.          0.17314994  0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.        ]
[ 0.          0.          0.          0.          0.03053938  0.
  0.14000218  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          