In [1]:
import tensorflow as tf
import numpy as np
import collections
import os
import argparse
import datetime as dt

from collections import Counter
from random import random
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu

In [2]:
def _read_nonempty_lines(path):
    """Return the whitespace-stripped, non-empty lines of the text file at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    the lines have been read, instead of being left open for the kernel's
    lifetime.
    """
    with open(path) as handle:
        stripped = (line.strip() for line in handle)
        return [line for line in stripped if line]


# Penn Treebank splits: one sentence per line, blank lines dropped.
train_sentences = _read_nonempty_lines("LSTM/LSTM/simple-examples/data/ptb.train.txt")
val_sentences = _read_nonempty_lines("LSTM/LSTM/simple-examples/data/ptb.valid.txt")
test_sentences = _read_nonempty_lines("LSTM/LSTM/simple-examples/data/ptb.test.txt")

# Split sizes.
print(len(train_sentences))
print(len(val_sentences))
print(len(test_sentences))

# Peek at the first three training sentences.
print(train_sentences[0])
print(train_sentences[1])
print(train_sentences[2])

42068
3370
3761
aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter
pierre <unk> N years old will join the board as a nonexecutive director nov. N
mr. <unk> is chairman of <unk> n.v. the dutch publishing group


In [3]:
# Build the token lists used by the rest of the notebook: every training
# sentence is lower-cased, tokenised with NLTK and wrapped in <SOS>/<EOS>.
sentences = []
for raw_sentence in train_sentences:
    tokens = word_tokenize(raw_sentence.lower())
    sentences.append(["<SOS>"] + tokens + ["<EOS>"])


In [4]:
# The tokeniser splits the corpus marker "<unk>" into "<", "unk" and ">"
# (see the printed sentence below, where only ">" survives). Drop the "<" and
# "unk" pieces in place so that the lone ">" stands in for the unknown word.
#
# Explicit membership checks replace the former bare `except: pass`, so
# unrelated errors are no longer swallowed silently. At most 20 markers are
# removed per sentence, exactly as before.
for sen in sentences:
    for _ in range(20):
        if "<" not in sen:
            # Nothing left to clean in this sentence.
            break
        sen.remove("<")
        if "unk" in sen:
            sen.remove("unk")


print(sentences[2])

['<SOS>', 'mr.', '>', 'is', 'chairman', 'of', '>', 'n.v.', 'the', 'dutch', 'publishing', 'group', '<EOS>']


In [5]:
# Character-level vocabulary. The position in this list is the symbol's id:
# start-of-sentence marker, the lower-case letters, the end-of-word and
# end-of-sentence markers, the ">" stand-in, then punctuation and digits.
vocabulary = (
    ["<SOS>"]
    + list("abcdefghijklmnopqrstuvwxyz")
    + ["<EOW>", "<EOS>", ">"]
    + list("-.'")
    + list("0123456789")
    + list("&<$#/,|@%^\\*(){}:;")
)

vocabulary_size = len(vocabulary)

# Lookup tables in both directions, plus one one-hot row per symbol.
token2index = dict(zip(vocabulary, range(vocabulary_size)))
index2token = dict(enumerate(vocabulary))
one_hot_embeddings = np.identity(vocabulary_size)

# Sanity checks: id of "z", symbol with id 1, one-hot row of the backslash.
print(token2index["z"])
print(index2token[1])
print(one_hot_embeddings[token2index["\\"]])

26
a
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.
  0.  0.  0.  0.  0.  0.  0.]


In [6]:
# Locate the sentence with the most tokens. Despite its name,
# `max_word_length` holds the largest token count of any sentence, and
# `maxid` the index of the first sentence reaching it.
max_word_length = 0
maxid = 0
for position, tokens in enumerate(sentences):
    token_count = len(tokens)
    if token_count > max_word_length:
        maxid, max_word_length = position, token_count


print(max_word_length)
print(maxid)

84
29099


In [71]:
def convert_tensor(arg):
    """Return `arg` converted to a TensorFlow tensor of dtype int32."""
    as_int32 = tf.convert_to_tensor(arg, dtype=tf.int32)
    return as_int32

def embed_producer(sentences, max_char_len=486):
    """Encode tokenised sentences as zero-padded one-hot character tensors.

    Layout of each sentence along the character axis:
      * the special tokens "<SOS>", "<EOS>" and ">" occupy one slot each,
        encoded with their own vocabulary id;
      * every other token contributes one slot per character followed by one
        "<EOW>" slot.
    Unused trailing slots stay all-zero.

    Reads the module-level `token2index`, `vocabulary_size` and
    `max_word_length`.

    Args:
        sentences: sequence of token lists.
        max_char_len: number of character slots per sentence (default 486,
            the value previously hard-coded here).

    Returns:
        (s_tensor, word_loc_all) where
          * s_tensor is a float64 array of shape
            (len(sentences), max_char_len, vocabulary_size). It is dense, so
            it grows quickly with the number of sentences.
          * word_loc_all is a float64 array of shape
            (len(sentences), max_word_length). Entry [i, k] is the running
            sum of the first k+1 token lengths of sentence i, where a special
            token counts 1 and any other token counts len(token). Entries past
            the sentence's last token are 0.

    Raises:
        ValueError: a token contains a character missing from `token2index`
            (previously this surfaced as an opaque broadcasting error).
        IndexError: a sentence has more tokens than `max_word_length` or needs
            more than `max_char_len` slots.

    NOTE(review): the running sums in word_loc_all do not count the "<EOW>"
    slot written after each ordinary token, so they fall behind the real slot
    offsets in s_tensor by one per preceding ordinary token. The values are
    kept exactly as before because later cells index with them; confirm
    whether offsets that include the "<EOW>" slots were intended.
    """
    special_tokens = ("<EOS>", "<SOS>", ">")
    eow_index = token2index["<EOW>"]

    s_tensor = np.zeros((len(sentences), max_char_len, vocabulary_size))
    word_loc_all = np.zeros((len(sentences), max_word_length))

    for i, sentence in enumerate(sentences):
        if len(sentence) > max_word_length:
            raise IndexError(
                "sentence %d has %d tokens but max_word_length is %d"
                % (i, len(sentence), max_word_length))

        slot_ids = []       # vocabulary id for every occupied character slot
        token_lengths = []  # per-token length as recorded in word_loc_all
        for word in sentence:
            if word in special_tokens:
                slot_ids.append(token2index[word])
                token_lengths.append(1)
                continue

            for char in word:
                if char not in token2index:
                    raise ValueError(
                        "character %r in token %r of sentence %d is not in the vocabulary"
                        % (char, word, i))
                slot_ids.append(token2index[char])
            slot_ids.append(eow_index)
            # Only the characters are counted here, not the <EOW> slot.
            token_lengths.append(len(word))

        if len(slot_ids) > max_char_len:
            raise IndexError(
                "sentence %d needs %d character slots but max_char_len is %d"
                % (i, len(slot_ids), max_char_len))

        if slot_ids:
            # Set one 1.0 per occupied slot in a single indexed assignment.
            slots = np.arange(len(slot_ids))
            s_tensor[i, slots, np.asarray(slot_ids, dtype=np.intp)] = 1.0
            # Running sum of token lengths, used later to locate word ends.
            word_loc_all[i, :len(token_lengths)] = np.cumsum(token_lengths)

    return s_tensor, word_loc_all
        

In [72]:
# Encode the whole tokenised corpus. `data` receives the one-hot character
# tensor and `word_loc_all` the per-sentence running token lengths returned
# by embed_producer.
data,word_loc_all = embed_producer(sentences)


In [73]:
# Inspect sentence 29099 (the index printed as `maxid` by the token-count
# scan) together with its row of running token lengths.
print(sentences[29099])
print(word_loc_all[29099])

['<SOS>', 'using', 'estimates', 'of', 'the', 'company', "'s", 'future', 'earnings', 'under', 'a', 'variety', 'of', 'scenarios', 'first', 'boston', 'estimated', 'ual', "'s", 'value', 'at', '$', 'n', 'to', '$', 'n', 'a', 'share', 'if', 'its', 'future', 'labor', 'costs', 'conform', 'to', 'wall', 'street', 'projections', '$', 'n', 'to', '$', 'n', 'if', 'the', 'company', 'reaches', 'a', 'settlement', 'with', 'pilots', 'similar', 'to', 'one', 'at', 'nwa', '$', 'n', 'to', '$', 'n', 'under', 'an', 'adverse', 'labor', 'settlement', 'and', '$', 'n', 'to', '$', 'n', 'under', 'a', 'pilot', 'contract', 'imposed', 'by', 'the', 'company', 'following', 'a', 'strike', '<EOS>']
[   1.    6.   15.   17.   20.   27.   29.   35.   43.   48.   49.   56.
   58.   67.   72.   78.   87.   90.   92.   97.   99.  100.  101.  103.
  104.  105.  106.  111.  113.  116.  122.  127.  132.  139.  141.  145.
  151.  162.  163.  164.  166.  167.  168.  170.  173.  180.  187.  188.
  198.  202.  208.  215.  217.  220.  2

In [9]:
# Build a 0/1 mask over character slots: eow_pos[i, p] == 1 when p is one of
# the running token offsets stored in word_loc_all[i].
max_char_len=486
eow_pos = np.zeros((len(sentences),max_char_len))

for i, sen in enumerate(sentences):
    # Only the first len(sen) entries of a word_loc_all row are real offsets;
    # the rest is zero padding. Looping over the padding (as before) marked
    # slot 0 for every sentence shorter than the longest one.
    offsets = word_loc_all[i, :len(sen)].astype(int)
    eow_pos[i, offsets] = 1

print(word_loc_all[29099])
print(eow_pos[29099])

[  1.   7.   8.   9.  14.  17.  21.  25.  28.  33.  35.  36.  48.  56.  60.
  61.  62.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.]
[ 1.  1.  0.  0.  0.  0.  0.  1.  1.  1.  0.  0.  0.  0.  1.  0.  0.  1.
  0.  0.  0.  1.  0.  0.  0.  1.  0.  0.  1.  0.  0.  0.  0.  1.  0.  1.
  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.
  0.  0.  1.  0.  0.  0.  1.  1.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  

In [53]:
# Scan the running-offset table for the largest value (`maxN`) and the first
# row that contains it (`maxid`). Note this rebinds `maxid`.
maxN, maxid = 0, 0
for row_index, row in enumerate(word_loc_all):
    row_peak = max(row)
    if row_peak > maxN:
        maxN, maxid = row_peak, row_index


print(maxN)
print(maxid)

413.0
4607


In [66]:
# Spot-check: print the one-hot row stored at character slot 432 of
# sentence 4607.
#print(sentences[4607])
print(data[4607][432])

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.]


In [48]:
# Show the token list of sentence 29099 for manual inspection.
print(sentences[29099])

['<SOS>', 'using', 'estimates', 'of', 'the', 'company', "'s", 'future', 'earnings', 'under', 'a', 'variety', 'of', 'scenarios', 'first', 'boston', 'estimated', 'ual', "'s", 'value', 'at', '$', 'n', 'to', '$', 'n', 'a', 'share', 'if', 'its', 'future', 'labor', 'costs', 'conform', 'to', 'wall', 'street', 'projections', '$', 'n', 'to', '$', 'n', 'if', 'the', 'company', 'reaches', 'a', 'settlement', 'with', 'pilots', 'similar', 'to', 'one', 'at', 'nwa', '$', 'n', 'to', '$', 'n', 'under', 'an', 'adverse', 'labor', 'settlement', 'and', '$', 'n', 'to', '$', 'n', 'under', 'a', 'pilot', 'contract', 'imposed', 'by', 'the', 'company', 'following', 'a', 'strike', '<EOS>']


In [21]:
# NOTE(review): everything below is a disabled prototype (Xavier initialiser and
# a partial VariationalAutoencoder class) kept inside a bare string literal.
# None of it is executed; the cell merely evaluates the string, which is why
# its repr appears as the cell output. Consider moving it out of the notebook.
'''
network_architecture = dict(n_hidden_recog_1 = 500,
     n_hidden_recog_2 = 500,
     n_input = 20,
     n_z = 20)
     
def xavier_init(fan_in,fan_out,constant=1):
    low = -constant*np.sqrt(6.0/(fan_in+fan_out))
    high = constant*np.sqrt(6.0/(fan_in+fan_out))
    return tf.random_uniform((fan_in,fan_out),minval=low,maxval=high,dtype=tf.float32)
    
    
class VariationalAutoencoder(object):
    
    def __init__(self,network_architecture,transfer_fct=tf.nn.softplus):
        self.network_architecture=network_architecture
        self.transfer_fct=transfer_fct        
        self._create_network()
        
    def _create_network(self):
        
        #initialize weights and biases 
        network_weights = self._initialize_weights(**self.network_architecture)
        
    
    def _initialize_weights(self,n_hidden_recog_1,n_hidden_recog_2, n_input,n_z):
        all_weights = dict()
        all_weights["weights_recog"] = {
            'h1': tf.Variable(xavier_init(n_input,n_hidden_recog_1)),
            'h2': tf.Variable(xavier_init(n_hidden_recog_1,n_hidden_recog_2)),
            'out_mean' : tf.Variable(xavier_init(n_hidden_recog_2,n_z)),
            'out_log_sigma' : tf.Variable(xavier_init(n_hidden_recog_2,n_z))} 

        all_weights["biases_recog"]={
            'b1': tf.Variable(tf.zeros([n_hidden_recog_1],dtype=tf.float32)),
            'b2': tf.Variable(tf.zeros([n_hidden_recog_2],dtype=tf.float32)),
            'out_mean': tf.Variable(tf.zeros([n_z],dtype=tf.float32)),
            'out_log_sigma': tf.Variable(tf.zeros([n_z], dtype=tf.float32))}

        return all_weights
        
    def _recognition_network(self,weights,biases):

        layer_1 = self.transfer_fct(tf.add(tf.matmul(self.x,weights['h1']),biases['b1']))
        layer_2 = self.transfer_fct(tf.add(tf.matmul(layer_1,weights['h2']),biases['b2']))

        z_mean = tf.add(tf.matmul(layer_2,weights['out_mean']),biases['out_mean'])
        z_log_sigma_sq = tf.add(tf.matmul(layer_2,weights['out_log_sigma']),biases['out_log_sigma'])

        return (z_mean,z_log_sigma_sq)
        
'''

'\nnetwork_architecture = dict(n_hidden_recog_1 = 500,\n     n_hidden_recog_2 = 500,\n     n_input = 20,\n     n_z = 20)\n     \ndef xavier_init(fan_in,fan_out,constant=1):\n    low = -constant*np.sqrt(6.0/(fan_in+fan_out))\n    high = constant*np.sqrt(6.0/(fan_in+fan_out))\n    return tf.random_uniform((fan_in,fan_out),minval=low,maxval=high,dtype=tf.float32)\n    \n    \nclass VariationalAutoencoder(object):\n    \n    def __init__(self,network_architecture,transfer_fct=tf.nn.softplus):\n        self.network_architecture=network_architecture\n        self.transfer_fct=transfer_fct        \n        self._create_network()\n        \n    def _create_network(self):\n        \n        #initialize weights and biases \n        network_weights = self._initialize_weights(**self.network_architecture)\n        \n    \n    def _initialize_weights(self,n_hidden_recog_1,n_hidden_recog_2, n_input,n_z):\n        all_weights = dict()\n        all_weights["weights_recog"] = {\n            \'h1\': tf.V

In [242]:
# Start from an empty default graph so re-running this cell does not stack
# duplicate ops on top of a previous build.
tf.reset_default_graph()
# Graph-level sizes. NOTE(review): input_size is hard-coded to 61, which
# matches the vocabulary size computed earlier; confirm they are meant to be
# tied together.
batch_size = 52
input_size = 61
hidden_size=20

# The batch is fed batch-major as [batch_size, max_char_len, input_size] and
# transposed to time-major [max_char_len, batch_size, input_size], so that the
# TensorArray holds one [batch_size, input_size] slice per character slot.
# `max_char_len` comes from an earlier cell.
inputs = tf.placeholder(tf.float32,[batch_size,max_char_len,input_size])
inputs_t = tf.transpose(inputs,perm=[1, 0, 2])
_inputs_ta = tf.TensorArray(dtype=tf.float32, size=max_char_len,name='char_array')
_inputs_ta = _inputs_ta.unstack(inputs_t) 

# Recurrent cell plus three per-time-step accumulators that loop_fn fills with
# the sampled state, the mean and the log-variance respectively.
cell = tf.contrib.rnn.LSTMCell(hidden_size)
output_ta = tf.TensorArray(size=max_char_len, dtype=tf.float32,name='word_array')
mean_ta = tf.TensorArray(size=max_char_len, dtype=tf.float32,name='mean_array')
sigma_ta = tf.TensorArray(size=max_char_len, dtype=tf.float32,name='sigma_array')
# 0/1 mask per character slot, fed from eow_pos at run time.
word_pos = tf.placeholder(tf.float32,[batch_size,max_char_len])
# NOTE(review): word_pos is already a float32 tensor, so this conversion is
# presumably a no-op that only rebinds the name; confirm it can be dropped.
word_pos = tf.convert_to_tensor(word_pos,dtype=tf.float32)

# create loop_fn for raw_rnn
def loop_fn(time, cell_output, cell_state, loop_state):
    """Step function handed to tf.nn.raw_rnn.

    Reads the module-level `cell`, `batch_size`, `hidden_size`, `max_char_len`,
    `word_pos`, `_inputs_ta`, `output_ta`, `mean_ta` and `sigma_ta`.

    On the initial call (`cell_output is None`) it returns the cell's zero
    state and the three empty accumulators. On every later call it:
      1. builds a [batch_size, hidden_size] mask from word_pos[:, time];
      2. feeds the masked cell output through a fully connected layer of
         width 2*hidden_size and splits the result into z_mean and
         z_log_sigma_sq;
      3. draws z_sample = z_mean + sqrt(exp(z_log_sigma_sq)) * eps and masks it;
      4. records z_sample, z_mean and z_log_sigma_sq at index time-1;
      5. forms the next LSTM state: c = z_sample + (1 - mask) * previous c,
         h = cell_output.

    The layer widths and the mask shape are derived from `hidden_size` and
    `batch_size` rather than the literals 20/52/40 used previously, so
    changing those constants no longer breaks the graph.

    Returns:
        (elements_finished, next_input, next_cell_state, emit_output,
        next_loop_state) as expected by raw_rnn.

    NOTE(review): per the TF contrib docs, fully_connected applies a ReLU by
    default, which would keep z_mean and z_log_sigma_sq non-negative; confirm
    whether activation_fn=None was intended.
    NOTE(review): the loop finishes once time reaches max_char_len-1, so the
    last input slot appears to be read but never processed by the cell;
    confirm this is intended.
    """
    emit_output = cell_output  # == None if time = 0

    if cell_output is None:  # time = 0
        next_cell_state = cell.zero_state(batch_size, tf.float32)
        sample_loop_state = output_ta
        mean_loop_state = mean_ta
        sigma_loop_state = sigma_ta
        next_loop_state = (sample_loop_state,mean_loop_state,sigma_loop_state)

    else:
        # Repeat the per-example mask value across the hidden dimension:
        # the result is [batch_size, hidden_size] with row b filled by
        # word_pos[b, time].
        word_slice = tf.tile(word_pos[:,time],[hidden_size])
        word_slice = tf.reshape(word_slice,[hidden_size,batch_size])
        word_slice = tf.transpose(word_slice,perm=[1,0])
        next_sampled_input =  tf.multiply(cell_output,word_slice)

        #reparametrization
        z_concat = tf.contrib.layers.fully_connected(next_sampled_input,2*hidden_size)
        z_mean = z_concat[:,:hidden_size]
        z_log_sigma_sq =  z_concat[:,hidden_size:2*hidden_size]
        eps = tf.random_normal((batch_size,hidden_size),0,1,dtype=tf.float32)
        z_sample = tf.add(z_mean,tf.multiply(tf.sqrt(tf.exp(z_log_sigma_sq)),eps))

        # Keep the sample only where the mask is set.
        z_sample = tf.multiply(z_sample,word_slice)
        #z_mean = tf.multiply(z_mean,word_slice)
        #z_log_sigma_sq = tf.multiply(z_log_sigma_sq,word_slice)

        next_cell_state = z_sample
        sample_loop_state = loop_state[0].write(time - 1, next_cell_state)
        mean_loop_state = loop_state[1].write(time - 1, z_mean)
        sigma_loop_state = loop_state[2].write(time - 1, z_log_sigma_sq)
        next_loop_state = (sample_loop_state,mean_loop_state,sigma_loop_state)

        # Invert the mask so the previous cell state passes through wherever
        # the mask is 0.
        word_slice = tf.logical_not(tf.cast(word_slice,dtype=tf.bool))
        word_slice = tf.cast(word_slice,dtype=tf.float32)
        next_cell_state = next_cell_state + tf.multiply(cell_state[0],word_slice)
        next_cell_state = tf.contrib.rnn.LSTMStateTuple(next_cell_state,cell_output)

    elements_finished = (time >= max_char_len-1)
    next_input = _inputs_ta.read(time)

    return (elements_finished, next_input, next_cell_state, emit_output, next_loop_state)

# Unroll the recurrence with the custom loop_fn. `word_state` is the loop
# state returned by raw_rnn, i.e. the (sample, mean, sigma) accumulators
# threaded through loop_fn.
outputs_ta, final_state, word_state = tf.nn.raw_rnn(cell, loop_fn)
# Stack each accumulator, and the emitted cell outputs, into a single tensor
# along the time axis.
word_state_out = word_state[0].stack()
mean_state_out = word_state[1].stack()
sigma_state_out = word_state[2].stack()
outputs = outputs_ta.stack()


In [243]:
# Run-time configuration. batch_size is assigned before num_batches so the
# batch count no longer depends on a value left over from an earlier cell.
batch_size = 52
input_size = vocabulary_size
max_char_len = 486
hidden_size   = 20
learning_rate = 0.01
num_batches = len(data) // batch_size

init_op = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run([init_op])
    for epoch in range(1):
        epoch_error = 0

        for bt in range(1):
            # Slice one batch of inputs and the matching word-end masks.
            x = data[bt*batch_size:(bt+1)*batch_size]
            word_pos_batch = eow_pos[bt*batch_size:(bt+1)*batch_size]
            # The fetched arrays for `outputs` and `final_state` get their own
            # names. Previously they were assigned back onto the graph
            # tensors, so a second batch, a second epoch or a re-run of this
            # cell would have tried to fetch NumPy arrays.
            # word_state / mean_state / sigma_state keep their names because
            # the inspection cell below reads them.
            (outputs_val, final_state_val,
             word_state, mean_state, sigma_state) = sess.run(
                [outputs, final_state, word_state_out,
                 mean_state_out, sigma_state_out],
                feed_dict={inputs:x,word_pos:word_pos_batch})
            
        

In [244]:
# Inspect the fetched log-variance array: for each of the first 20 indices
# along its last axis, print the values stored at index 0 of the first axis
# across the whole second axis.
# (word_state and mean_state can be inspected the same way.)
first_step_sigma = sigma_state[0]
for unit in range(20):
    print(first_step_sigma[:, unit])

[ 0.02326339  0.02326339  0.02326339  0.02326339  0.02326339  0.02326339
  0.02326339  0.02326339  0.02326339  0.02326339  0.02326339  0.02326339
  0.02326339  0.02326339  0.02326339  0.02326339  0.02326339  0.02326339
  0.02326339  0.02326339  0.02326339  0.02326339  0.02326339  0.02326339
  0.02326339  0.02326339  0.02326339  0.02326339  0.02326339  0.02326339
  0.02326339  0.02326339  0.02326339  0.02326339  0.02326339  0.02326339
  0.02326339  0.02326339  0.02326339  0.02326339  0.02326339  0.02326339
  0.02326339  0.02326339  0.02326339  0.02326339  0.02326339  0.02326339
  0.02326339  0.02326339  0.02326339  0.02326339]
[ 0.01731425  0.01731425  0.01731425  0.01731425  0.01731425  0.01731425
  0.01731425  0.01731425  0.01731425  0.01731425  0.01731425  0.01731425
  0.01731425  0.01731425  0.01731425  0.01731425  0.01731425  0.01731425
  0.01731425  0.01731425  0.01731425  0.01731425  0.01731425  0.01731425
  0.01731425  0.01731425  0.01731425  0.01731425  0.01731425  0.01731425
 