Deep Learning
=============

Assignment 6
------------

After training a skip-gram model in `5_word2vec.ipynb`, the goal of this notebook is to train a LSTM character model over [Text8](http://mattmahoney.net/dc/textdata) data.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified %s' % filename)
    else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename

filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [3]:
def read_data(filename):
    with zipfile.ZipFile(filename) as f:
        name = f.namelist()[0]
        data = tf.compat.as_str(f.read(name))
    return data
  
text = read_data(filename)
print('Data size %d' % len(text))

Data size 100000000


In [4]:
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


In [5]:
vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
    if char in string.ascii_lowercase:
        return ord(char) - first_letter + 1
    elif char == ' ':
        return 0
    else:
        print('Unexpected character: %s' % char)
        return 0
    
def id2char(dictid):
    if dictid > 0:
        return chr(dictid + first_letter - 1)
    else:
        return ' '

print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
print(id2char(1), id2char(26), id2char(0))

Unexpected character: ï
1 26 0 0
a z  


Function to generate a training batch for the LSTM model.

In [6]:
batch_size=64
num_unrollings=10

class BatchGenerator(object):
    def __init__(self, text, batch_size, num_unrollings):
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        segment = self._text_size // batch_size
        self._cursor = [ offset * segment for offset in range(batch_size)]
        self._last_batch = self._next_batch()
  
    def _next_batch(self):
        """Generate a single batch from the current cursor position in the data."""
        batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)
        for b in range(self._batch_size):
            batch[b, char2id(self._text[self._cursor[b]])] = 1.0
            self._cursor[b] = (self._cursor[b] + 1) % self._text_size
        return batch
    
    def next(self):
        """Generate the next array of batches from the data. The array consists of
        the last batch of the previous array, followed by num_unrollings new ones.
        """
        batches = [self._last_batch]
        for step in range(self._num_unrollings):
            batches.append(self._next_batch())
        self._last_batch = batches[-1]
        return batches

def characters(probabilities):
    """Turn a 1-hot encoding or a probability distribution over the possible
    characters back into its (most likely) character representation."""
    return [id2char(c) for c in np.argmax(probabilities, 1)]

def batches2string(batches):
    """Convert a sequence of batches back into their (most likely) string
    representation."""
    s = [''] * batches[0].shape[0]
    for b in batches:
        s = [''.join(x) for x in zip(s, characters(b))]
    return s

train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

print(batches2string(train_batches.next()))
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))

['ons anarchi', 'when milita', 'lleria arch', ' abbeys and', 'married urr', 'hel and ric', 'y and litur', 'ay opened f', 'tion from t', 'migration t', 'new york ot', 'he boeing s', 'e listed wi', 'eber has pr', 'o be made t', 'yer who rec', 'ore signifi', 'a fierce cr', ' two six ei', 'aristotle s', 'ity can be ', ' and intrac', 'tion of the', 'dy to pass ', 'f certain d', 'at it will ', 'e convince ', 'ent told hi', 'ampaign and', 'rver side s', 'ious texts ', 'o capitaliz', 'a duplicate', 'gh ann es d', 'ine january', 'ross zero t', 'cal theorie', 'ast instanc', ' dimensiona', 'most holy m', 't s support', 'u is still ', 'e oscillati', 'o eight sub', 'of italy la', 's the tower', 'klahoma pre', 'erprise lin', 'ws becomes ', 'et in a naz', 'the fabian ', 'etchy to re', ' sharman ne', 'ised empero', 'ting in pol', 'd neo latin', 'th risky ri', 'encyclopedi', 'fense the a', 'duating fro', 'treet grid ', 'ations more', 'appeal of d', 'si have mad']
['ists advoca', 'ary governm', 'hes nat

In [7]:
def logprob(predictions, labels):
  """Log-probability of the true labels in a predicted batch."""
  predictions[predictions < 1e-10] = 1e-10
  return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]

def sample_distribution(distribution):
  """Sample one element from a distribution assumed to be an array of normalized
  probabilities.
  """
  r = random.uniform(0, 1)
  s = 0
  for i in range(len(distribution)):
    s += distribution[i]
    if s >= r:
      return i
  return len(distribution) - 1

def sample(prediction):
  """Turn a (column) prediction into 1-hot encoded samples."""
  p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Generate a random column of probabilities."""
  b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  return b/np.sum(b, 1)[:,None]

In [8]:

num_neurons = 200
num_layers = 3

graph = tf.Graph()
with graph.as_default():
    
    dropout = tf.placeholder(tf.float32, name='dropout')

    cell = tf.nn.rnn_cell.LSTMCell(num_neurons)
    cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=dropout)
    cell = tf.nn.rnn_cell.MultiRNNCell([cell]*num_layers)

    max_length = 10

    data = tf.placeholder(tf.float32, [None, max_length, 27], name='data')
    output, state = tf.nn.dynamic_rnn(cell, data, dtype=tf.float32)

    target_size = 27
    target = tf.placeholder(tf.float32, [None, max_length, target_size], name='target')
    out_size = int(target.get_shape()[2])

    class_w = tf.Variable(tf.truncated_normal([num_neurons, out_size], -0.1, 0.1))
    class_b = tf.Variable(tf.zeros([out_size]))

    output = tf.reshape(output, [-1, num_neurons])
    logits = tf.matmul(output, class_w) + class_b
    train_pred = tf.nn.softmax(logits)
#     train_pred = tf.reshape(train_pred, [-1, max_length, out_size])

    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
        logits=logits, labels=target))

    learn_rate = 0.1
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learn_rate)
    gradients, v = zip(*optimizer.compute_gradients(cross_entropy))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(zip(gradients, v))

In [9]:
t = train_batches.next()

In [10]:
def batch_to_data(batch):
    train_data = []
    labels = []
    batch = np.array(batch).swapaxes(0, 1)
    for seq in batch:
        train_data.append(seq[:-1])
        labels.append(seq[1:])
    return np.array(train_data), np.array(labels)

In [11]:
train_data_, labels_ = batch_to_data(t)

In [12]:
num_steps = 7001
summary_freq = 100
with tf.Session(graph=graph) as sess:
    tf.global_variables_initializer().run()
    
    print('initialized')
    mean_loss = 0
    for step in range(num_steps):
        batch = train_batches.next()
        train_data, train_labels = batch_to_data(batch)
        feed_dict = dict()
        feed_dict[data] = train_data
        feed_dict[target] = train_labels
        feed_dict[dropout] = 0.6
        _, loss, train_predictions = sess.run([optimizer, cross_entropy, train_pred],
                                             feed_dict=feed_dict)
        mean_loss += loss
        if step % summary_freq == 0:
            if step > 0:
                mean_loss = mean_loss / summary_freq
            print('Average loss at step %d: %f' % (step, mean_loss))
            mean_loss = 0
            
            labels = np.concatenate(list(batch)[1:])
#             print(labels.shape)
#             print(train_predictions.shape)
            print('Minibatch perplexity: %.2f' % float(np.exp(logprob(
                        train_predictions.reshape((num_unrollings*batch_size, 27)), labels))))

initialized
Average loss at step 0: 3.295295
Minibatch perplexity: 27.00
Average loss at step 100: 3.048484
Minibatch perplexity: 18.24


KeyboardInterrupt: 

Deep Learning
=============

Assignment 6
------------

After training a skip-gram model in `5_word2vec.ipynb`, the goal of this notebook is to train a LSTM character model over [Text8](http://mattmahoney.net/dc/textdata) data.

In [13]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

In [14]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified %s' % filename)
    else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename

filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [15]:
def read_data(filename):
    with zipfile.ZipFile(filename) as f:
        name = f.namelist()[0]
        data = tf.compat.as_str(f.read(name))
    return data
  
text = read_data(filename)
print('Data size %d' % len(text))

Data size 100000000


In [16]:
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


In [51]:
vocabulary_size = len(string.ascii_lowercase) + 1 + 1 # [a-z] + ' ' + '<EOS>'
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
    # Adding EOS to vocabulary
    if char == '.':
        return 27
    if char in string.ascii_lowercase:
        return ord(char) - first_letter + 1
    elif char == ' ':
        return 0
    else:
        print('Unexpected character: %s' % char)
        return 0
    
def id2char(dictid):
    if dictid == 27:
        return '.'
    if dictid > 0:
        return chr(dictid + first_letter - 1)
    else:
        return ' '

print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'), char2id('.'))
print(id2char(1), id2char(26), id2char(0), id2char(27))

Unexpected character: ï
1 26 0 0 27
a z   .


Function to generate a training batch for the LSTM model.

In [295]:
batch_size=20
num_unrollings=5
letter_choices = np.arange(vocabulary_size-1)
class BatchGenerator(object):
    def __init__(self, text, batch_size, num_unrollings):
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        segment = self._text_size // batch_size
        self._cursor = [ offset * segment for offset in range(batch_size)]
        self._last_batch = self._next_batch()
  
    def _next_batch(self):
        """Generate a single batch from the current cursor position in the data."""
        batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)
        for b in range(self._batch_size):
#             batch[b, char2id(self._text[self._cursor[b]])] = 1.0
            batch[b, np.random.choice(letter_choices)] = 1.0
#             self._cursor[b] = (self._cursor[b] + 1) % self._text_size
        return batch
    
    def _eos_batch(self):
        """Generate a single batch of <EOS>"""
        batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)
        batch[:, 27] = 1.0
        return batch
    
    def next(self):
        """Generate the next array of batches from the data. The array consists of
        the last batch of the previous array, followed by num_unrollings new ones.
        """
        batches = []
        for step in range(self._num_unrollings):
            batches.append(self._next_batch())
        self._last_batch = batches[-1]
        return batches # + [self._eos_batch() ]

def characters(probabilities):
    """Turn a 1-hot encoding or a probability distribution over the possible
    characters back into its (most likely) character representation."""
    return [id2char(c) for c in np.argmax(probabilities, 1)]

def batches2string(batches):
    """Convert a sequence of batches back into their (most likely) string
    representation."""
    s = [''] * batches[0].shape[0]
    for b in batches:
        s = [''.join(x) for x in zip(s, characters(b))]
    return s

train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

print(batches2string(train_batches.next()))
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))

['khldh', 'lzhig', 'ucldb', 'itwqa', 'rfvpz', 'bobfx', 'kndia', 'iydbb', 'v dim', 'xuafy', 'sfuxl', 'ij zd', 'kzrnf', 'rjlig', 'nnjif', 'esclt', 'mzvtj', 'xdljr', ' ufri', 'crfbw']
['dfklb', ' tbjd', 'isuee', 'cgklu', 'drrfx', 'styyu', 'rtysw', 'wmryb', 'zoiou', 'mgyra', 'cezrq', 'a pld', 'pq xd', 'mnc k', 'gbzw ', 'ymupo', 'ncbb ', 'cmhin', ' vwlx', 'mrqfp']
['k']
['l']


In [296]:
def logprob(predictions, labels):
  """Log-probability of the true labels in a predicted batch."""
  predictions[predictions < 1e-10] = 1e-10
  return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]

def sample_distribution(distribution):
  """Sample one element from a distribution assumed to be an array of normalized
  probabilities.
  """
  r = random.uniform(0, 1)
  s = 0
  for i in range(len(distribution)):
    s += distribution[i]
    if s >= r:
      return i
  return len(distribution) - 1

def sample(prediction):
  """Turn a (column) prediction into 1-hot encoded samples."""
  p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Generate a random column of probabilities."""
  b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  return b/np.sum(b, 1)[:,None]

In [297]:
def lstm_cell(i, o, state, ifco_iw, ifco_ow, ifco_b, num_nodes):
#     print('i\t', i.get_shape())
#     print('ifco_iw\t', ifco_iw.get_shape())
#     print('o\t',o.get_shape())
#     print('ifco_ow\t', ifco_ow.get_shape())
#     print('ifco_b\t', ifco_b.get_shape())
    ifco = tf.matmul(i, ifco_iw) + tf.matmul(o, ifco_ow) + ifco_b
#     print('ifco\t', ifco.get_shape())
#     print('state\t', state.get_shape())
    state = tf.sigmoid(ifco[:, 0:num_nodes]) * tf.tanh(ifco[:, 2*num_nodes:3*num_nodes]) + \
            tf.sigmoid(ifco[:, num_nodes:2*num_nodes]) * state
#     print('state\t', state.get_shape())
    return tf.sigmoid(ifco[:, 3*num_nodes:4*num_nodes]) * tf.tanh(state), state

In [298]:
def multi_lstm_cell(i, o, state, ifco_iw, ifco_ow, ifco_b, num_nodes):
    cellout = None
    cellstate = None
    outs, states = [], []
    for idx in range(len(num_nodes)):

        if idx == 0:
            cellout, cellstate = lstm_cell(i, o[idx], state[idx], ifco_iw[idx], ifco_ow[idx], ifco_b[idx],
                                           num_nodes[idx])
        else:
            cellout, cellstate = lstm_cell(cellout, o[idx], state[idx], ifco_iw[idx], ifco_ow[idx], ifco_b[idx],
                                          num_nodes[idx])
        outs.append(cellout)
        states.append(cellstate)
    return outs, states

In [317]:
encoder_neurons = 256
encoder_layers = 1

decoder_neurons = 256
decoder_layers = 1
input_max_length = num_unrollings
output_max_length = num_unrollings
target_size = vocabulary_size

graph = tf.Graph()
with graph.as_default():
    
    data = tf.placeholder(tf.float32, [batch_size, input_max_length, target_size], name='data')
    _eos_markers_np = np.zeros([batch_size, target_size], dtype=np.float32)
    _eos_markers_np[:, 27] = 1.0
    eos_markers = tf.constant(value=_eos_markers_np, dtype=tf.float32, name='EOSes')
    dropout = tf.placeholder(tf.float32, name='dropout')
    
    target = tf.placeholder(tf.float32, [batch_size, output_max_length, target_size], name='target')
    out_size = int(target.get_shape()[2])
#     cell = tf.nn.rnn_cell.LSTMCell(num_neurons)
#     cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=dropout)
#     cell = tf.nn.rnn_cell.MultiRNNCell([cell]*num_layers)
    #################################################################################
    # Defining the encoder LSTM RNN                                                 #
    #################################################################################
    input_sizes = [target_size] + [encoder_neurons]*(encoder_layers-1)
    enc_ifco_iws = [tf.Variable(tf.truncated_normal([insize, encoder_neurons*4], -0.07, 0.07))
             for insize in input_sizes]
    enc_ifco_ows = [tf.Variable(tf.truncated_normal([encoder_neurons, encoder_neurons*4], -0.07, 0.07))
             for insize in input_sizes]
    enc_ifco_bs = [tf.Variable(tf.ones([encoder_neurons*4])) for i in range(encoder_layers)]
    
    #################################################################################
    # Running the encoder LSTM RNN                                                  #
    #################################################################################
#     output, state = tf.nn.dynamic_rnn(cell, data, dtype=tf.float32)
    saved_output = [tf.Variable(tf.zeros([batch_size, encoder_neurons]), trainable=False) for i in range(encoder_layers)]
    saved_state = [tf.Variable(tf.zeros([batch_size, encoder_neurons]), trainable=False) for i in range(encoder_layers)]
    output = saved_output
    state = saved_state
    
    enc_inputs = [data[:, i, :] for i in range(input_max_length)]  # feed the input vectors
    enc_outputs = []
    for idx, i in enumerate(enc_inputs):
        output, state = multi_lstm_cell(i, output, state, enc_ifco_iws, enc_ifco_ows, enc_ifco_bs, [encoder_neurons]*encoder_layers)
#         enc_outputs.append(output[-1])
        
#     print(state, '\n')
#     print(state[-1], '\n')
#     print(state[-1][0], '\n')
#     c = state[-1][0]
#     c.set_shape((batch_size, c.get_shape()[1]))
    c = state

    #################################################################################
    # Defining the decoder LSTM RNN                                                 #
    #################################################################################
    
    input_sizes = [target_size] + [decoder_neurons]*(decoder_layers-1)
    ifco_iws = [tf.Variable(tf.truncated_normal([insize, decoder_neurons*4], -0.07, 0.07))
             for insize in input_sizes]
    ifco_ows = [tf.Variable(tf.truncated_normal([decoder_neurons, decoder_neurons*4], -0.07, 0.07))
             for insize in input_sizes]
    ifco_bs = [tf.Variable(tf.ones([decoder_neurons*4])) for i in range(decoder_layers)]
    
    saved_output = [tf.Variable(tf.zeros([batch_size, decoder_neurons]), trainable=False) for i in range(decoder_layers)]
#     saved_state = [tf.Variable(tf.zeros([batch_size, decoder_neurons]), trainable=False) for i in range(decoder_layers)]
    output = saved_output
#     state = saved_state
#     print(c.get_shape())
#     print(tf.zeros(c.get_shape()))
#     zero_input = tf.Variable(tf.zeros(c.get_shape()), trainable=False)

    #################################################################################
    # Running the decoder LSTM RNN                                                  #
    #################################################################################

    dec_inputs = [eos_markers] + [target[:, i, :] for i in range(output_max_length-1)]  # feed the lagged goal vectors as inputs
    dec_outputs = []
    for idx, i in enumerate(dec_inputs):
        if idx == 0:
            output, state = multi_lstm_cell(i, output, c, ifco_iws, ifco_ows, ifco_bs, [decoder_neurons]*decoder_layers)
        else:
            output, state = multi_lstm_cell(i, output, state, ifco_iws, ifco_ows, ifco_bs, [decoder_neurons]*decoder_layers)
        dec_outputs.append(output[-1])
        
    #################################################################################
    # Classification step                                                           #
    #################################################################################

    class_w = tf.Variable(tf.truncated_normal([decoder_neurons, out_size], -0.07, 0.07))
#     class_b = tf.Variable(tf.truncated_normal([out_size], -0.1, 0.1))
    class_b = tf.Variable(tf.zeros([out_size], dtype=tf.float32))

    output = tf.reshape(dec_outputs, [-1, decoder_neurons])
    logits = tf.matmul(output, class_w) + class_b
    train_pred = tf.nn.softmax(logits)

    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
        logits=logits, labels=target))

    learn_rate = 0.1
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learn_rate)
    gradients, v = zip(*optimizer.compute_gradients(cross_entropy))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(zip(gradients, v))
#     optimizer = tf.train.AdamOptimizer().minimize(cross_entropy, )
#     optimizer.

In [318]:
t = train_batches.next()

In [319]:
def batch_to_data(batch):
    """Format the batch batch of sequences to a batch of (forward, backward) 
       sequence pairs"""
    train_data = []
    labels = []
    batch = np.array(batch).swapaxes(0, 1)
    for seq in batch:
        train_data.append(seq[:])
        labels.append(seq[::-1])
    return np.array(train_data), np.array(labels)

In [320]:
train_data_, labels_ = batch_to_data(t)

In [321]:
train_data_.shape

(20, 5, 28)

In [322]:
def seq2string(vec):
    return ''.join(characters(vec))
def batch2string(batch):
    return batches2string(batch)

In [323]:
seq2string(train_data_[0])

'wqpxu'

In [324]:
seq2string(labels_[0])

'uxpqw'

In [325]:
num_steps = 7001
summary_freq = 100
with tf.Session(graph=graph) as sess:
    tf.global_variables_initializer().run()
    
    print('initialized')
    mean_loss = 0
    for step in range(num_steps):
        batch = train_batches.next()
        train_data, train_labels = batch_to_data(batch)
        feed_dict = dict()
        feed_dict[data] = train_data
        feed_dict[target] = train_labels
        feed_dict[dropout] = 1.0
        _, loss, train_predictions, embedding = sess.run([optimizer, cross_entropy, train_pred, c],
                                             feed_dict=feed_dict)
        mean_loss += loss
        if step % summary_freq == 0:
            if step > 0:
                mean_loss = mean_loss / summary_freq
            print('Average loss at step %d: %f' % (step, mean_loss))
            mean_loss = 0
            
            labels = np.concatenate(list(batch)[::-1])

            print('Minibatch perplexity: %.2f' % float(np.exp(logprob(
                        train_predictions, labels))))
            inputs, labels = batch_to_data(batch)
            print(seq2string(inputs[0]))
            print(seq2string(labels[0]))
            train_predictions = train_predictions.reshape((batch_size, (num_unrollings), vocabulary_size))
            saved_train_predictions = train_predictions

            print(seq2string(train_predictions[0]))
#             print(embedding)
#             print()
#             print(class_w.eval())
#             print()
#             print(class_b.eval())
#             print()
#             print(train_predictions)
            
#             print(labels.shape)
#             print(train_predictions.shape)
            

initialized
Average loss at step 0: 3.359133
Minibatch perplexity: 28.97
lhc z
z chl
ppppp
Average loss at step 100: 3.321066
Minibatch perplexity: 27.14
vlpzm
mzplv
sssss
Average loss at step 200: 3.316106
Minibatch perplexity: 27.04
tfcia
aicft
aaaaa
Average loss at step 300: 3.312999
Minibatch perplexity: 27.26
euwxe
exwue
jjjjj
Average loss at step 400: 3.312339
Minibatch perplexity: 26.99
jknps
spnkj
ooooo
Average loss at step 500: 3.312395
Minibatch perplexity: 27.61
aflap
palfa
fffff
Average loss at step 600: 3.309717
Minibatch perplexity: 26.73
fevot
tovef
hhhhh
Average loss at step 700: 3.308902
Minibatch perplexity: 26.84
yeldl
ldley
uuuuu
Average loss at step 800: 3.308402
Minibatch perplexity: 27.29
gieto
oteig
rrrrr
Average loss at step 900: 3.308772
Minibatch perplexity: 27.09
vpjxd
dxjpv
xxxxx
Average loss at step 1000: 3.307965
Minibatch perplexity: 26.88
euucd
dcuue
sssss
Average loss at step 1100: 3.308100
Minibatch perplexity: 26.88
iy ht
th yi
qqqqq
Average loss at 