# REDREAMER
This project generates novel dreams based on a corpus of dreams.

In [246]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
import helper

data_dir = './data/dreams1.txt'
text = helper.load_data(data_dir)
# Ignore notice, since we don't use it for analysing the data
text = text[0:]

## Explore the Data
Play around with `view_sentence_range` to view different parts of the data.

In [248]:
view_sentence_range = (1, 50)

"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
import numpy as np

print('Dataset Stats')
embedding_dim = len({word: None for word in text.split()})
print('Roughly the number of unique words: {}'.format(len({word: None for word in text.split()})))
scenes = text.split('\n\n')
print('Number of scenes: {}'.format(len(scenes)))
sentence_count_scene = [scene.count('\n') for scene in scenes]
print('Average number of sentences in each scene: {}'.format(np.average(sentence_count_scene)))

sentences = [sentence for scene in scenes for sentence in scene.split('\n')]
print('Number of lines: {}'.format(len(sentences)))
word_count_sentence = [len(sentence.split()) for sentence in sentences]
print('Average number of words in each line: {}'.format(np.average(word_count_sentence)))

print()
print('The sentences {} to {}:'.format(*view_sentence_range))
print('\n'.join(text.split('\n')[view_sentence_range[0]:view_sentence_range[1]]))

Dataset Stats
Roughly the number of unique words: 5309
Number of scenes: 254
Average number of sentences in each scene: 1.8425196850393701
Number of lines: 722
Average number of words in each line: 38.692520775623265

The sentences 1 to 50:
Dream of being on the food game show Chopped


The basket ingredients included lettuce and some kind of oyster or clam. 


After the first round I fell asleep in the kitchen while waiting for rice to cook


And failed to create any kind of dish 


But my meal as a whole was called The Canon


The judges said that The Canon lacked cohesion and they had to chop me


________________
Dream


There are two young girls that are related somehow, but separated by a world that’s warring. There are two major countries or tribes. One is something like Asgaard(ians?) while the other is more technologically advanced or interested? 


War scene where the first group uses giant (dinosaur-like?) animals and the latter uses war machines that look futuristic, almost

## Implement Preprocessing Functions
The first thing to do to any dataset is preprocessing.  Implement the following preprocessing functions below:
- Lookup Table
- Tokenize Punctuation

### Lookup Table
To create a word embedding, you first need to transform the words to ids.  In this function, create two dictionaries:
- Dictionary to go from the words to an id, we'll call `vocab_to_int`
- Dictionary to go from the id to word, we'll call `int_to_vocab`

Return these dictionaries in the following tuple `(vocab_to_int, int_to_vocab)`

In [249]:
import numpy as np
import problem_unittests as tests
from collections import Counter
import tensorflow as tf

def create_lookup_tables(text):
    vocab_to_int = {}
    int_to_vocab = {}
    for i, word in enumerate(set(text)):
        vocab_to_int[word] = i
        int_to_vocab[i] = word
    return vocab_to_int, int_to_vocab
        

"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_create_lookup_tables(create_lookup_tables)

Tests Passed


### Tokenize Punctuation
Punctuation makes it difficult for the neural network to deal with situations like "bye" and "bye!" so we tokenize punctuation separately from words.

Implement the function `token_lookup` to return a dict that will be used to tokenize symbols like "!" into "||Exclamation_Mark||".  Create a dictionary for the following symbols where the symbol is the key and value is the token:
- Period ( . )
- Comma ( , )
- Quotation Mark ( " )
- Semicolon ( ; )
- Exclamation mark ( ! )
- Question mark ( ? )
- Left Parentheses ( ( )
- Right Parentheses ( ) )
- Dash ( -- )
- Return ( \n )

This dictionary will be used to token the symbols and add the delimiter (space) around it.  This separates the symbols as it's own word, making it easier for the neural network to predict on the next word. Make sure you don't use a token that could be confused as a word. Instead of using the token "dash", try using something like "||dash||".

In [250]:

def token_lookup():
    token_dict = {".":"||period||", ",":"||comma||", ";":"||semicolon||", "?":"||questionmark||", "(":"||leftparen||", ")":"||rightparen||", "--":"||dash||", "\n":"||return||", '!':'||exclam||', '"':'||quot||'}
    # TODO: Implement Function
    return token_dict

"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_tokenize(token_lookup)

Tests Passed


## Preprocess all the data and save it
Running the code cell below will preprocess all the data and save it to file.

In [251]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
# Preprocess Training, Validation, and Testing Data
helper.preprocess_and_save_data(data_dir, token_lookup, create_lookup_tables)

# Check Point
This is your first checkpoint. If you ever decide to come back to this notebook or have to restart the notebook, you can start from here. The preprocessed data has been saved to disk.

In [252]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
import helper
import numpy as np
import problem_unittests as tests

int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()

## Build the Neural Network

### Check the Version of TensorFlow and Access to GPU

In [253]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
from distutils.version import LooseVersion
import warnings
import tensorflow as tf

# Check TensorFlow Version
assert LooseVersion(tf.__version__) >= LooseVersion('1.0'), 'Please use TensorFlow version 1.0 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU
if not tf.test.gpu_device_name():
    warnings.warn('This network will train very slowly without a GPU.')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

TensorFlow Version: 1.0.1
Default GPU Device: /gpu:0


### Input


In [254]:
def get_inputs():
    input = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    learning_rate = tf.placeholder(tf.float32, name='learning_rate') 
    
    return input, targets, learning_rate


"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_get_inputs(get_inputs)

Tests Passed


### Build RNN Cell and Initialize


In [255]:
def get_init_cell(batch_size, rnn_size):

    num_layers = 2
    cell = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    #cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=0.75)
    cell = tf.contrib.rnn.MultiRNNCell([cell] * num_layers)
    initial_state = cell.zero_state(batch_size, tf.float32)
    initial_state = tf.identity(initial_state, name="initial_state")
    
    return cell, initial_state


"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_get_init_cell(get_init_cell)

Tests Passed


### Word Embedding


In [256]:
def get_embed(input_data, vocab_size, embed_dim):
    embedding = tf.Variable(tf.random_uniform((vocab_size, embed_dim), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, input_data)
    return embed


"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_get_embed(get_embed)

Tests Passed


### Build RNN


In [257]:
def build_rnn(cell, inputs):
    outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
    final_state = tf.identity(final_state, name="final_state")
    return outputs, final_state


"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_build_rnn(build_rnn)

Tests Passed


### Build the Neural Network


In [258]:
def build_nn(cell, rnn_size, input_data, vocab_size, embed_dim):
    embed = get_embed(input_data, vocab_size, rnn_size)
    output, final_state = build_rnn(cell, embed)
    logits = tf.contrib.layers.fully_connected(output, vocab_size, activation_fn=None)
    return logits, final_state


"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_build_nn(build_nn)

Tests Passed


### Batching


In [259]:
def get_batches(int_text, batch_size, seq_length):

    # TODO: Implement Function
    n_batches = int(len(int_text) / (batch_size*seq_length))
    xdata = np.array(int_text[:n_batches*batch_size*seq_length])
    ydata = np.roll(xdata,-1)
    xbatches = np.split(xdata.reshape(batch_size, -1), n_batches, 1)
    ybatches = np.split(ydata.reshape(batch_size, -1), n_batches, 1)
    
    return np.array(list(zip(xbatches, ybatches)))
        




"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_get_batches(get_batches)

Tests Passed


## Neural Network Training
### Hyperparameters
Tune the following parameters:

- Set `num_epochs` to the number of epochs.
- Set `batch_size` to the batch size.
- Set `rnn_size` to the size of the RNNs.
- Set `embed_dim` to the size of the embedding. This should equal the number of unique words in the corpus.
- Set `seq_length` to the length of sequence. This should match the length of the average sentence.
- Set `learning_rate` to the learning rate.
- Set `show_every_n_batches` to the number of batches the neural network should print progress.

In [275]:
# Number of Epochs
num_epochs = 300
# Batch Size
batch_size = 64
# RNN Size
rnn_size = 512

# Embedding Dimension Size
embed_dim = 5661
# Sequence Length
seq_length = 32
# Learning Rate
learning_rate = 0.001
# Show stats for every n number of batches
show_every_n_batches = 20

"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
save_dir = './save-dreams'

### Build the Graph
Build the graph using the neural network you implemented.

In [276]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
from tensorflow.contrib import seq2seq

train_graph = tf.Graph()
with train_graph.as_default():
    vocab_size = len(int_to_vocab)
    input_text, targets, lr = get_inputs()
    input_data_shape = tf.shape(input_text)
    cell, initial_state = get_init_cell(input_data_shape[0], rnn_size)
    logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size, embed_dim)

    # Probabilities for generating words
    probs = tf.nn.softmax(logits, name='probs')

    # Loss function
    cost = seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_data_shape[0], input_data_shape[1]]))

    # Optimizer
    optimizer = tf.train.RMSPropOptimizer(lr)

    # Gradient Clipping
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

## Train
Train the neural network on the preprocessed data.  If you have a hard time getting a good loss, check the [forums](https://discussions.udacity.com/) to see if anyone is having the same problem.

In [277]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
batches = get_batches(int_text, batch_size, seq_length)

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(num_epochs):
        state = sess.run(initial_state, {input_text: batches[0][0]})

        for batch_i, (x, y) in enumerate(batches):
            feed = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate}
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed)

            # Show every <show_every_n_batches> batches
            if (epoch_i * len(batches) + batch_i) % show_every_n_batches == 0:
                print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.format(
                    epoch_i,
                    batch_i,
                    len(batches),
                    train_loss))

    # Save Model
    saver = tf.train.Saver()
    saver.save(sess, save_dir)
    print('Model Trained and Saved')

Epoch   0 Batch    0/15   train_loss = 8.231
Epoch   1 Batch    5/15   train_loss = 8.229
Epoch   2 Batch   10/15   train_loss = 8.228
Epoch   4 Batch    0/15   train_loss = 8.221
Epoch   5 Batch    5/15   train_loss = 8.205
Epoch   6 Batch   10/15   train_loss = 8.148
Epoch   8 Batch    0/15   train_loss = 6.628
Epoch   9 Batch    5/15   train_loss = 6.185
Epoch  10 Batch   10/15   train_loss = 6.070
Epoch  12 Batch    0/15   train_loss = 5.952
Epoch  13 Batch    5/15   train_loss = 5.744
Epoch  14 Batch   10/15   train_loss = 5.385
Epoch  16 Batch    0/15   train_loss = 5.099
Epoch  17 Batch    5/15   train_loss = 4.876
Epoch  18 Batch   10/15   train_loss = 4.684
Epoch  20 Batch    0/15   train_loss = 4.532
Epoch  21 Batch    5/15   train_loss = 4.398
Epoch  22 Batch   10/15   train_loss = 4.186
Epoch  24 Batch    0/15   train_loss = 4.037
Epoch  25 Batch    5/15   train_loss = 3.875
Epoch  26 Batch   10/15   train_loss = 3.769
Epoch  28 Batch    0/15   train_loss = 3.590
Epoch  29 

Epoch 244 Batch    0/15   train_loss = 0.079
Epoch 245 Batch    5/15   train_loss = 0.060
Epoch 246 Batch   10/15   train_loss = 0.058
Epoch 248 Batch    0/15   train_loss = 0.064
Epoch 249 Batch    5/15   train_loss = 0.062
Epoch 250 Batch   10/15   train_loss = 0.059
Epoch 252 Batch    0/15   train_loss = 0.064
Epoch 253 Batch    5/15   train_loss = 0.060
Epoch 254 Batch   10/15   train_loss = 0.058
Epoch 256 Batch    0/15   train_loss = 0.066
Epoch 257 Batch    5/15   train_loss = 0.061
Epoch 258 Batch   10/15   train_loss = 0.062
Epoch 260 Batch    0/15   train_loss = 0.064
Epoch 261 Batch    5/15   train_loss = 0.059
Epoch 262 Batch   10/15   train_loss = 0.060
Epoch 264 Batch    0/15   train_loss = 0.063
Epoch 265 Batch    5/15   train_loss = 0.060
Epoch 266 Batch   10/15   train_loss = 0.062
Epoch 268 Batch    0/15   train_loss = 0.064
Epoch 269 Batch    5/15   train_loss = 0.061
Epoch 270 Batch   10/15   train_loss = 0.058
Epoch 272 Batch    0/15   train_loss = 0.063
Epoch 273 

## Save Parameters
Save `seq_length` and `save_dir` for generating a new dream.

In [278]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
# Save parameters for checkpoint
helper.save_params((seq_length, save_dir))

# Checkpoint

In [279]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
import tensorflow as tf
import numpy as np
import helper
import problem_unittests as tests

_, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()
seq_length, load_dir = helper.load_params()

## Implement Generate Functions
### Get Tensors
Get tensors from `loaded_graph` using the function [`get_tensor_by_name()`](https://www.tensorflow.org/api_docs/python/tf/Graph#get_tensor_by_name).  Get the tensors using the following names:
- "input:0"
- "initial_state:0"
- "final_state:0"
- "probs:0"

Return the tensors in the following tuple `(InputTensor, InitialStateTensor, FinalStateTensor, ProbsTensor)` 

In [280]:
def get_tensors(loaded_graph):
    """
    Get input, initial state, final state, and probabilities tensor from <loaded_graph>
    :param loaded_graph: TensorFlow graph loaded from file
    :return: Tuple (InputTensor, InitialStateTensor, FinalStateTensor, ProbsTensor)
    """
    input = loaded_graph.get_tensor_by_name('input:0')
    initial_state = loaded_graph.get_tensor_by_name('initial_state:0')
    final_state = loaded_graph.get_tensor_by_name('final_state:0')
    probs = loaded_graph.get_tensor_by_name('probs:0')
    
    return input, initial_state, final_state, probs


"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_get_tensors(get_tensors)

Tests Passed


### Choose Word
Implement the `pick_word()` function to select the next word using `probabilities`.

In [281]:
def pick_word(probabilities, int_to_vocab):
    """
    Pick the next word in the generated text
    :param probabilities: Probabilites of the next word
    :param int_to_vocab: Dictionary of word ids as the keys and words as the values
    :return: String of the predicted word
    """
    # TODO: Implement Function
    idx = np.random.choice(len(int_to_vocab),p=probabilities)
    return int_to_vocab[idx]


"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_pick_word(pick_word)

Tests Passed


## Generate Dream

Set `gen_length` to the length of dream desired (in words).
Set `prime_word` to any word that exists in the embedding in order to start the text.

In [313]:
gen_length = 7
# this word 
prime_word = "dream"
"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(load_dir + '.meta')
    loader.restore(sess, load_dir)

    # Get Tensors from loaded model
    input_text, initial_state, final_state, probs = get_tensors(loaded_graph)

    # Sentences generation setup
    gen_sentences = [prime_word]
    prev_state = sess.run(initial_state, {input_text: np.array([[1]])})

    # Generate sentences
    for n in range(gen_length):
        # Dynamic Input
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])

        # Get Prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})
        
        pred_word = pick_word(probabilities[dyn_seq_length-1], int_to_vocab)

        gen_sentences.append(pred_word)
    
    # Remove tokens
    dream = ' '.join(gen_sentences)
    for key, token in token_dict.items():
        ending = ' ' if key in ['\n', '(', '"'] else ''
        dream = dream.replace(' ' + token.lower(), key)
    dream = dream.replace('\n ', '\n')
    dream = dream.replace('( ', '(')

        
    print(dream)

dream where(colin?) calls me


# The Dream Doesn't Make Sense
Do Dreams ever make sense?