In [1]:
import tensorflow as tf
import numpy as np
import collections
import os
import argparse
import datetime as dt

from collections import Counter
from random import random
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu

In [2]:
train_sentences = [line.strip() for line in open("LSTM/LSTM/simple-examples/data/ptb.train.txt").readlines()]
val_sentences = [line.strip() for line in open("LSTM/LSTM/simple-examples/data/ptb.valid.txt").readlines()]
test_sentences = [line.strip() for line in open("LSTM/LSTM/simple-examples/data/ptb.test.txt").readlines()]

train_sentences = [x for x in train_sentences if x] 
val_sentences = [x for x in val_sentences if x] 
test_sentences = [x for x in test_sentences if x] 
print(len(train_sentences))
print(len(val_sentences))
print(len(test_sentences))

print(train_sentences[0])
print(train_sentences[1])
print(train_sentences[2])

42068
3370
3761
aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter
pierre <unk> N years old will join the board as a nonexecutive director nov. N
mr. <unk> is chairman of <unk> n.v. the dutch publishing group


In [3]:
sentences = train_sentences

# Lower-case the sentence, tokenize them and add <SOS> and <EOS> tokens
sentences = [["<SOS>"] + word_tokenize(sentence.lower()) + ["<EOS>"] for sentence in sentences]


In [4]:
for ind,sen in enumerate(sentences):
    for i in range(20):
        try:
            sen.remove("<")
            sen.remove("unk")
        except:
            pass
        

print(sentences[2])

['<SOS>', 'mr.', '>', 'is', 'chairman', 'of', '>', 'n.v.', 'the', 'dutch', 'publishing', 'group', '<EOS>']


In [5]:
vocabulary = ["<SOS>"] + ["a"] + ["b"] + ["c"] + ["d"] + ["e"] + ["f"] + \
["g"] + ["h"] + ["i"] + ["j"] + ["k"] + ["l"] + ["m"] + ["n"] + ["o"] + \
["p"] + ["q"] + ["r"] + ["s"] + ["t"] + ["u"] + ["v"] + ["w"] + \
["x"] + ["y"] + ["z"] + ["<EOW>"] + ["<EOS>"] + [">"] + ["-"] + ["."] + ["'"] + ["0"] + ["1"] + ["2"] + ["3"] + \
["4"] + ["5"] + ["6"] + ["7"] + ["8"] + ["9"] + ["&"] + ["<"] + ["$"] + ["#"] + ["/"] + [","] + ["|"] + \
["@"] + ["%"] + ["^"] + ["\\"] + ["*"] + ["("] + [")"] + ["{"] + ["}"] + [":"] + [";"] 


vocabulary_size = len(vocabulary)

token2index = {token:index for index,token in enumerate(vocabulary)}
index2token = {index:token for index,token in enumerate(vocabulary)}
one_hot_embeddings = np.eye(vocabulary_size)
print(token2index.get("z"))
print(index2token.get(1))
print(one_hot_embeddings[token2index.get("\\")])

26
a
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.
  0.  0.  0.  0.  0.  0.  0.]


In [6]:
max_word_length = 0
maxid = 0
for i in range(len(sentences)):
    l = len(sentences[i])
    if l > max_word_length:
        maxid = i
        max_word_length = l
        

print(max_word_length) 
print(maxid)

84
29099


In [7]:
def convert_tensor(arg):
    return tf.convert_to_tensor(arg,dtype=tf.int32)

def embed_producer(sentences):
    max_char_len = 486
    s_tensor = np.empty((len(sentences),max_char_len,vocabulary_size))
    word_loc_all = np.zeros((len(sentences),max_word_length))
    for i in range(len(sentences)):
        s = sentences[i]
        embed = np.zeros((max_char_len,vocabulary_size))
        word_loc = np.zeros(max_word_length)
        prev = 0
        #print(i)
        for k in range(len(s)):
            w = s[k]
            #print(w)
            for id,token in enumerate(w):
                
                if (w == "<EOS>") | (w == "<SOS>") | (w == ">"):
                    break
                else:
                    #print(prev + id)
                    #print(token)
                    embed[prev + id,:] = np.squeeze(one_hot_embeddings[token2index.get(token)])
                
            if (w == "<EOS>") | (w == "<SOS>") | (w == ">"):
                word_loc[k] = id + 1
                #print(prev)
                embed[prev,:] = one_hot_embeddings[token2index.get(w)]
                prev = prev + id + 1 
                
            else: 
                prev = prev + id + 1
                word_loc[k] = id + 1 
                #print(prev)
                embed[prev,:] = one_hot_embeddings[token2index.get("<EOW>")]
                prev = prev + 1
                
            
        s_tensor[i,:,:] = embed
        
        
        #to get word end locations to retrieve hidden states later 
        word_loc_all[i,0] = word_loc[0]
        for j in range(1,len(s)):
            word_loc_all[i,j] = word_loc_all[i,j-1] + word_loc[j]
            
        
    return s_tensor,word_loc_all 
        

In [8]:
data,word_loc_all = embed_producer(sentences)
print(data[0][1])
print(word_loc_all[29099])

[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.]
[   1.    6.   15.   17.   20.   27.   29.   35.   43.   48.   49.   56.
   58.   67.   72.   78.   87.   90.   92.   97.   99.  100.  101.  103.
  104.  105.  106.  111.  113.  116.  122.  127.  132.  139.  141.  145.
  151.  162.  163.  164.  166.  167.  168.  170.  173.  180.  187.  188.
  198.  202.  208.  215.  217.  220.  222.  225.  226.  227.  229.  230.
  231.  236.  238.  245.  250.  260.  263.  264.  265.  267.  268.  269.
  274.  275.  280.  288.  295.  297.  300.  307.  316.  317.  323.  324.]


In [79]:
eow_pos = np.zeros((len(sentences),max_char_len))

for i in range(len(sentences)):
    for j in range(max_word_length):
        eow_pos[i,int(word_loc_all[i,j])] = 1
        
print(word_loc_all[1])
print(eow_pos[1])

[  1.   7.   8.   9.  14.  17.  21.  25.  28.  33.  35.  36.  48.  56.  60.
  61.  62.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.]
[ 1.  1.  0.  0.  0.  0.  0.  1.  1.  1.  0.  0.  0.  0.  1.  0.  0.  1.
  0.  0.  0.  1.  0.  0.  0.  1.  0.  0.  1.  0.  0.  0.  0.  1.  0.  1.
  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.
  0.  0.  1.  0.  0.  0.  1.  1.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  

In [119]:
input_size = vocabulary_size
batch_size = 52
max_char_len = 486
hidden_size   = 20
learning_rate = 0.01

tf.reset_default_graph() 

def _get_raw_rnn_graph(inputs,word_pos):
    
    _inputs_ta = tf.TensorArray(dtype=tf.float32, size=max_char_len)
    # our [486, 52, 61] tensor becomes [[52, 61], [52, 61], ...]
    _inputs_ta = _inputs_ta.unstack(inputs) 

    cell = tf.contrib.rnn.LSTMCell(hidden_size)

    output_ta = tf.TensorArray(size=max_word_length, dtype=tf.int32)
    
    # create loop_fn for raw_rnn
    def loop_fn(time, cell_output, cell_state, loop_state):
        emit_output = cell_output  # == None if time = 0
        
        print(cell_state)
        if cell_output is None:  # time = 0
            next_cell_state = cell.zero_state(batch_size, tf.float32)
            next_loop_state = output_ta
            
        else:
            
            next_sampled_input =  tf.multiply(cell_state,word_pos[:,time])
            #reparametrization
            #z_mean
            #z_var 
            eps = tf.random_normal((batch_size,hidden_size),0,1,dtype=tf.float32)
            z_sample = tf.add(z_mean,tf.multiply(tf.sqrt(tf.exp(z_log_sigma_sq)),eps))
            
            next_cell_state = z_sample
            next_loop_state = loop_state.write(time - 1, next_cell_state)
            next_cell_state = next_cell_state + tf.multiply(cell_state,(~word_pos[:,time]))
            
                
            
        elements_finished = (time >= max_char_len)
        finished = tf.reduce_all(elements_finished)
        
        next_input = tf.cond(finished,
                             lambda: tf.zeros([batch_size, input_size], dtype=tf.float32),                                                   
                             lambda: _inputs_ta.read(time))

        #print("hi")

        return (elements_finished, next_input, next_cell_state, emit_output, next_loop_state)

    outputs_ta, final_state, word_state = tf.nn.raw_rnn(cell, loop_fn)
    outputs = outputs_ta.stack()
    return outputs, final_state, word_state

In [120]:
num_batches = len(data) // batch_size

for epoch in range(1):
    epoch_error = 0
    
    for bt in range(1):
        x = data[bt*batch_size:(bt+1)*batch_size]
        word_pos_batch = eow_pos[bt*batch_size:(bt+1)*batch_size]
        x = tf.transpose(x,perm=[1, 0, 2])
        #print(word_pos_batch[:,2])
        outputs, final_state,word_state = _get_raw_rnn_graph(x,word_pos_batch)
        

None
LSTMStateTuple(c=<tf.Tensor 'rnn/while/lstm_cell/add_1:0' shape=(52, 20) dtype=float32>, h=<tf.Tensor 'rnn/while/lstm_cell/mul_2:0' shape=(52, 20) dtype=float32>)


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices