In [None]:
# Importing Libraries
import numpy as np
import random
import json
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
# Parse the raw dataset; the file handle is released as soon as decoding finishes.
with open('dinosaurs.json') as json_data:
    d = json.load(json_data)

# Show the parsed structure so its layout can be inspected.
print(d)

In [None]:
# Pull the value stored under the 'dinosaurs' key out of the parsed JSON.
# A later cell iterates over it and writes each element to a text file, one per line.
data = d['dinosaurs']

In [None]:
# Open in write mode (truncate) rather than append: with 'a', every re-run of the
# notebook would add the whole name list to the file again and inflate the corpus.
file = open('dinotext.txt', 'w')

In [None]:
# Emit every name followed by a newline, then release the handle opened in the previous cell.
file.writelines(name + '\n' for name in data)
file.close()

In [None]:
# Read the whole corpus back; the context manager guarantees the handle is closed
# (the bare open(...).read() form left it open until garbage collection).
with open('dinotext.txt', 'r') as corpus_file:
    data = corpus_file.read()

# Work in lower case only, then derive the vocabulary as the set of distinct characters.
data = data.lower()
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('There are %d total characters and %d unique characters in your data.' % (data_size, vocab_size))

In [None]:
# Build both lookup tables from a single sorted copy of the vocabulary so the
# character -> index and index -> character mappings agree on the ordering.
sorted_chars = sorted(chars)
char_to_ix = dict(zip(sorted_chars, range(len(sorted_chars))))
ix_to_char = dict(enumerate(sorted_chars))
print(ix_to_char)

In [None]:
# Clips the gradients' values between minimum and maximum.
def clip(gradients, maxValue):
    """Clamp every gradient entry to the range [-maxValue, maxValue].

    Arguments:
    gradients -- dictionary holding the arrays "dWaa", "dWax", "dWya", "db", "dby"
    maxValue -- entries above this become maxValue, entries below -maxValue become -maxValue

    Returns:
    A new dictionary with exactly those five keys. The arrays themselves are
    clipped in place, so the caller's arrays are modified as well.
    """
    names = ("dWaa", "dWax", "dWya", "db", "dby")

    # Look everything up first so a missing key fails before any array is touched.
    arrays = [gradients[name] for name in names]

    # Clipping writes back into each array (out=...) to mitigate exploding gradients.
    for grad in arrays:
        np.clip(grad, -maxValue, maxValue, out=grad)

    return dict(zip(names, arrays))

In [None]:
# testing clip()
np.random.seed(3)
# Draw order (dWax, dWaa, dWya, db, dby) is kept so the seeded values are unchanged.
dWax, dWaa, dWya = np.random.randn(5, 3) * 10, np.random.randn(5, 5) * 10, np.random.randn(2, 5) * 10
db, dby = np.random.randn(5, 1) * 10, np.random.randn(2, 1) * 10
gradients = clip(dict(dWax=dWax, dWaa=dWaa, dWya=dWya, db=db, dby=dby), 10)

# Spot-check one entry of each clipped gradient.
for label, value in [
    ("gradients[\"dWaa\"][1][2] =", gradients["dWaa"][1][2]),
    ("gradients[\"dWax\"][3][1] =", gradients["dWax"][3][1]),
    ("gradients[\"dWya\"][1][2] =", gradients["dWya"][1][2]),
    ("gradients[\"db\"][4] =", gradients["db"][4]),
    ("gradients[\"dby\"][1] =", gradients["dby"][1]),
]:
    print(label, value)

In [None]:
# Compute softmax values for each sets of scores in x
def softmax(x):
    """Column-wise softmax of the scores in x (normalised along axis 0).

    Arguments:
    x -- array of scores; each column (or the whole vector, if 1-D) is one distribution

    Returns:
    Array of the same shape whose entries along axis 0 are positive and sum to 1.
    """
    # Subtracting the per-column maximum leaves the result mathematically unchanged
    # but keeps np.exp from overflowing to inf (which would produce NaN) on large scores.
    shifted = x - np.max(x, axis=0, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=0)

In [None]:
# Sample a sequence of characters according to a sequence of probability distributions output of the RNN

def sample(parameters, char_to_ix, seed):
    """Generate one sequence of character indices from the RNN's output distributions.

    Arguments:
    parameters -- dictionary containing the parameters Waa, Wax, Wya, by, and b
    char_to_ix -- dictionary mapping each character to an index (must contain '\\n')
    seed -- base value for reseeding NumPy's global RNG at every step (for reproducibility)

    Returns:
    indices -- list with the indices of the sampled characters. Sampling stops at the
               first newline or after 50 steps; when all 50 steps were used a newline
               index is appended.
    """
    Waa, Wax, Wya, by, b = (parameters[key] for key in ("Waa", "Wax", "Wya", "by", "b"))
    vocab_size = by.shape[0]
    n_a = Waa.shape[1]
    max_steps = 50  # hard cap that prevents an endless loop with an untrained model

    # Start from an all-zero input vector and an all-zero hidden state.
    x = np.zeros((vocab_size, 1))
    hidden = np.zeros((n_a, 1))

    indices = []
    newline_ix = char_to_ix['\n']
    candidates = list(range(vocab_size))
    steps_taken = 0

    for step in range(max_steps):
        # One forward step: new hidden state, then the distribution over characters.
        hidden = np.tanh(np.dot(Wax, x) + np.dot(Waa, hidden) + b)
        probs = softmax(np.dot(Wya, hidden) + by)

        # The seed advances by two per step (step counter plus the incremented seed).
        np.random.seed(seed + 2 * step)
        idx = np.random.choice(candidates, p=probs.ravel())
        indices.append(idx)

        # Feed the sampled character back in as the next one-hot input.
        x = np.zeros((vocab_size, 1))
        x[idx] = 1

        steps_taken = step + 1
        if idx == newline_ix:
            break

    # If the cap was reached, terminate the name with a newline.
    if steps_taken == max_steps:
        indices.append(newline_ix)

    return indices

In [None]:
# testing sample()
np.random.seed(2)
_, n_a = 20, 100
# Random parameters, drawn in the order Wax, Waa, Wya, b, by.
Wax = np.random.randn(n_a, vocab_size)
Waa = np.random.randn(n_a, n_a)
Wya = np.random.randn(vocab_size, n_a)
b = np.random.randn(n_a, 1)
by = np.random.randn(vocab_size, 1)
parameters = dict(Wax=Wax, Waa=Waa, Wya=Wya, b=b, by=by)

# Sample one sequence and show it both as indices and as characters.
indices = sample(parameters, char_to_ix, 0)
print("Sampling:")
print("list of sampled indices:", indices)
print("list of sampled characters:", [ix_to_char[i] for i in indices])

In [None]:
# Implements a single forward step of the RNN-cell

def rnn_cell_forward(xt, a_prev, parameters):
    """Run a single forward step of the RNN cell.

    Arguments:
    xt -- input data at timestep "t", numpy array of shape (n_x, m)
    a_prev -- hidden state at timestep "t-1", numpy array of shape (n_a, m)
    parameters -- python dictionary containing:
                       Wax -- weight matrix multiplying the input, shape (n_a, n_x)
                       Waa -- weight matrix multiplying the hidden state, shape (n_a, n_a)
                       Wya -- weight matrix from hidden state to output, shape (n_y, n_a)
                       ba -- hidden bias, shape (n_a, 1)
                       by -- output bias, shape (n_y, 1)

    Returns:
    a_next -- next hidden state, of shape (n_a, m)
    yt_pred -- prediction at timestep "t", of shape (n_y, m)
    cache -- tuple (a_next, a_prev, xt, parameters) needed for the backward pass
    """
    Wax, Waa, Wya, ba, by = (parameters[key] for key in ("Wax", "Waa", "Wya", "ba", "by"))

    # Hidden state: tanh of the combined input and recurrent contributions.
    pre_activation = np.dot(Wax, xt) + np.dot(Waa, a_prev) + ba
    a_next = np.tanh(pre_activation)

    # Output: softmax over the projected hidden state.
    logits = np.dot(Wya, a_next) + by
    yt_pred = softmax(logits)

    return a_next, yt_pred, (a_next, a_prev, xt, parameters)

In [None]:
# testing rnn_cell_forward()
np.random.seed(1)
# Draw order (xt, a_prev, Waa, Wax, Wya, ba, by) is kept so the seeded values are unchanged.
xt, a_prev = np.random.randn(3, 10), np.random.randn(5, 10)
Waa, Wax, Wya = np.random.randn(5, 5), np.random.randn(5, 3), np.random.randn(2, 5)
ba, by = np.random.randn(5, 1), np.random.randn(2, 1)
parameters = dict(Waa=Waa, Wax=Wax, Wya=Wya, ba=ba, by=by)

a_next, yt_pred, cache = rnn_cell_forward(xt, a_prev, parameters)
for label, value in [
    ("a_next[4] = ", a_next[4]),
    ("a_next.shape = ", a_next.shape),
    ("yt_pred[1] =", yt_pred[1]),
    ("yt_pred.shape = ", yt_pred.shape),
]:
    print(label, value)

In [None]:
# Implement the forward propagation of the recurrent neural network

def rnn_forward(x, a0, parameters):
    """Run the RNN cell forward over every time-step of the input.

    Arguments:
    x -- input data for every time-step, of shape (n_x, m, T_x)
    a0 -- initial hidden state, of shape (n_a, m)
    parameters -- python dictionary containing:
                       Waa -- weight matrix multiplying the hidden state, shape (n_a, n_a)
                       Wax -- weight matrix multiplying the input, shape (n_a, n_x)
                       Wya -- weight matrix from hidden state to output, shape (n_y, n_a)
                       ba -- hidden bias, shape (n_a, 1)
                       by -- output bias, shape (n_y, 1)

    Returns:
    a -- hidden states for every time-step, of shape (n_a, m, T_x)
    y_pred -- predictions for every time-step, of shape (n_y, m, T_x)
    caches -- tuple (list of per-step caches, x) needed for the backward pass
    """
    # Dimensions come from the input and from the output weight matrix.
    n_x, m, T_x = x.shape
    n_y, n_a = parameters["Wya"].shape

    a = np.zeros((n_a, m, T_x))
    y_pred = np.zeros((n_y, m, T_x))
    step_caches = []

    hidden = a0
    for t in range(T_x):
        # Advance one step, then record the hidden state, the prediction and the cache.
        hidden, prediction, step_cache = rnn_cell_forward(x[:, :, t], hidden, parameters)
        a[:, :, t] = hidden
        y_pred[:, :, t] = prediction
        step_caches.append(step_cache)

    return a, y_pred, (step_caches, x)

In [None]:
# testing rnn_forward()

np.random.seed(1)
# Draw order (x, a0, Waa, Wax, Wya, ba, by) is kept so the seeded values are unchanged.
x, a0 = np.random.randn(3, 10, 4), np.random.randn(5, 10)
Waa, Wax, Wya = np.random.randn(5, 5), np.random.randn(5, 3), np.random.randn(2, 5)
ba, by = np.random.randn(5, 1), np.random.randn(2, 1)
parameters = dict(Waa=Waa, Wax=Wax, Wya=Wya, ba=ba, by=by)

a, y_pred, caches = rnn_forward(x, a0, parameters)
for label, value in [
    ("a[4][1] = ", a[4][1]),
    ("a.shape = ", a.shape),
    ("y_pred[1][3] =", y_pred[1][3]),
    ("y_pred.shape = ", y_pred.shape),
    ("caches[1][1][3] =", caches[1][1][3]),
    ("len(caches) = ", len(caches)),
]:
    print(label, value)

In [None]:
# Implements the backward pass for the RNN-cell (single time-step)
def rnn_cell_backward(da_next, cache):
    """Backward pass for a single RNN-cell time-step.

    Arguments:
    da_next -- gradient of the loss with respect to the next hidden state, shape (n_a, m)
    cache -- tuple (a_next, a_prev, xt, parameters) as returned by rnn_cell_forward();
             only parameters["Wax"] and parameters["Waa"] are read here

    Returns:
    gradients -- python dictionary containing:
                        dxt -- gradient w.r.t. the input at this step, shape (n_x, m)
                        da_prev -- gradient w.r.t. the previous hidden state, shape (n_a, m)
                        dWax -- gradient w.r.t. the input-to-hidden weights, shape (n_a, n_x)
                        dWaa -- gradient w.r.t. the hidden-to-hidden weights, shape (n_a, n_a)
                        dba -- gradient w.r.t. the hidden bias, shape (n_a, 1)
    """
    (a_next, a_prev, xt, parameters) = cache

    # Only the two matrices feeding the tanh are needed; Wya, ba and by play no role
    # in this step, so they are not looked up (and need not be present).
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]

    # Back through tanh: d/dz tanh(z) = 1 - tanh(z)^2, and a_next is tanh(z).
    dtanh = (1 - a_next * a_next) * da_next

    # Gradients w.r.t. the input and the input weights.
    dxt = np.dot(Wax.T, dtanh)
    dWax = np.dot(dtanh, xt.T)

    # Gradients w.r.t. the previous hidden state and the recurrent weights.
    da_prev = np.dot(Waa.T, dtanh)
    dWaa = np.dot(dtanh, a_prev.T)

    # The bias gradient sums over the last axis (the examples), keeping a column shape.
    dba = np.sum(dtanh, keepdims=True, axis=-1)

    return {"dxt": dxt, "da_prev": da_prev, "dWax": dWax, "dWaa": dWaa, "dba": dba}

In [None]:
# testing rnn_cell_backward()
np.random.seed(1)
xt = np.random.randn(3,10)
a_prev = np.random.randn(5,10)
Wax = np.random.randn(5,3)
Waa = np.random.randn(5,5)
Wya = np.random.randn(2,5)
# The bias must be bound to `ba`, the name stored in the parameters dict below.
# Binding it to `b` left `ba` holding a stale value from an earlier cell.
ba = np.random.randn(5,1)
by = np.random.randn(2,1)
parameters = {"Wax": Wax, "Waa": Waa, "Wya": Wya, "ba": ba, "by": by}

# Forward once to obtain a cache, then push a random upstream gradient back through the cell.
a_next, yt, cache = rnn_cell_forward(xt, a_prev, parameters)

da_next = np.random.randn(5,10)
gradients = rnn_cell_backward(da_next, cache)


In [None]:
# Implement the backward pass for a RNN over an entire sequence of input data
def rnn_backward(da, caches):
    """Backward pass of the RNN over an entire input sequence.

    Arguments:
    da -- upstream gradients of all hidden states, of shape (n_a, m, T_x)
    caches -- tuple (list of per-step caches, x) as returned by rnn_forward()

    Returns:
    gradients -- python dictionary containing:
                        dx -- gradient w.r.t. the input data, shape (n_x, m, T_x)
                        da0 -- gradient w.r.t. the initial hidden state, shape (n_a, m)
                        dWax -- gradient w.r.t. the input weight matrix, shape (n_a, n_x)
                        dWaa -- gradient w.r.t. the hidden-state weight matrix, shape (n_a, n_a)
                        dba -- gradient w.r.t. the hidden bias, shape (n_a, 1)
    """
    (step_caches, x) = caches
    # Only the input of the first cached step is needed, to read off n_x.
    (_, _, x1, _) = step_caches[0]

    n_a, m, T_x = da.shape
    n_x, m = x1.shape

    # Accumulators, sized to match the quantities they are gradients of.
    dx = np.zeros((n_x, m, T_x))
    dWax = np.zeros((n_a, n_x))
    dWaa = np.zeros((n_a, n_a))
    dba = np.zeros((n_a, 1))
    da_prevt = np.zeros((n_a, m))

    # Walk the sequence backwards; each step receives the upstream gradient for that
    # time-step plus whatever flowed back from the step after it.
    for t in reversed(range(T_x)):
        step = rnn_cell_backward(da[:, :, t] + da_prevt, step_caches[t])
        dx[:, :, t] = step["dxt"]
        da_prevt = step["da_prev"]
        dWax += step["dWax"]
        dWaa += step["dWaa"]
        dba += step["dba"]

    # After the loop, da_prevt is the gradient that reached the initial hidden state.
    # (The original line assigned the undefined name `Non` and raised NameError.)
    da0 = da_prevt

    return {"dx": dx, "da0": da0, "dWax": dWax, "dWaa": dWaa, "dba": dba}

In [None]:
# testing rnn_backward()
np.random.seed(1)
# Draw order (x, a0, Wax, Waa, Wya, ba, by) is kept so the seeded values are unchanged.
x, a0 = np.random.randn(3, 10, 4), np.random.randn(5, 10)
Wax, Waa, Wya = np.random.randn(5, 3), np.random.randn(5, 5), np.random.randn(2, 5)
ba, by = np.random.randn(5, 1), np.random.randn(2, 1)
parameters = dict(Wax=Wax, Waa=Waa, Wya=Wya, ba=ba, by=by)

# Forward over the sequence, then backpropagate a random upstream gradient.
a, y, caches = rnn_forward(x, a0, parameters)
da = np.random.randn(5, 10, 4)
gradients = rnn_backward(da, caches)

In [None]:
# Execute one step of the optimization to train the model.

def _char_rnn_forward(X, Y, a0, parameters):
    """Forward pass over one index sequence; returns (cross-entropy loss, cache)."""
    Wax, Waa, Wya = parameters["Wax"], parameters["Waa"], parameters["Wya"]
    b, by = parameters["b"], parameters["by"]
    vocab_size = Wax.shape[1]

    # Per-step values are kept in dicts keyed by t; a[-1] is the incoming hidden state.
    x, a, y_hat = {}, {-1: np.copy(a0)}, {}
    loss = 0.0
    for t in range(len(X)):
        # One-hot encode the input character; None stands for the all-zero start vector.
        x[t] = np.zeros((vocab_size, 1))
        if X[t] is not None:
            x[t][X[t]] = 1
        a[t] = np.tanh(np.dot(Wax, x[t]) + np.dot(Waa, a[t - 1]) + b)
        y_hat[t] = softmax(np.dot(Wya, a[t]) + by)
        # Cross-entropy: negative log-probability assigned to the target character.
        loss -= np.log(y_hat[t][Y[t], 0])
    return loss, (y_hat, a, x)


def _char_rnn_backward(X, Y, parameters, cache):
    """Backpropagation through time for _char_rnn_forward; returns (gradients, a)."""
    (y_hat, a, x) = cache
    Waa, Wya = parameters["Waa"], parameters["Wya"]
    gradients = {
        "dWax": np.zeros_like(parameters["Wax"]),
        "dWaa": np.zeros_like(Waa),
        "dWya": np.zeros_like(Wya),
        "db": np.zeros_like(parameters["b"]),
        "dby": np.zeros_like(parameters["by"]),
    }
    da_next = np.zeros_like(a[-1])
    for t in reversed(range(len(X))):
        # Gradient of softmax + cross-entropy w.r.t. the logits: y_hat - one_hot(target).
        dy = np.copy(y_hat[t])
        dy[Y[t]] -= 1
        gradients["dWya"] += np.dot(dy, a[t].T)
        gradients["dby"] += dy
        # Hidden-state gradient: from this step's output plus from the following step.
        da = np.dot(Wya.T, dy) + da_next
        daraw = (1 - a[t] * a[t]) * da  # back through tanh
        gradients["db"] += daraw
        gradients["dWax"] += np.dot(daraw, x[t].T)
        gradients["dWaa"] += np.dot(daraw, a[t - 1].T)
        da_next = np.dot(Waa.T, daraw)
    return gradients, a


def _sgd_update(parameters, gradients, learning_rate):
    """Apply one gradient-descent step to the parameter arrays, in place."""
    for name in ("Wax", "Waa", "Wya", "b", "by"):
        parameters[name] -= learning_rate * gradients["d" + name]
    return parameters


def optimize(X, Y, a_prev, parameters, learning_rate = 0.01):
    """Execute one step of the optimization to train the model.

    The previous body called rnn_forward / rnn_backward with argument lists and return
    values that do not match the functions of that name defined in this notebook, and
    called an undefined update_parameters. The character-level forward pass, backward
    pass and update are therefore implemented by the private helpers above.

    Arguments:
    X -- list of integers, each an index into the vocabulary (None = zero input vector)
    Y -- list of integers, the same as X but shifted one index to the left
    a_prev -- previous hidden state, of shape (n_a, 1)
    parameters -- python dictionary containing:
                        Wax -- weight matrix multiplying the input, shape (n_a, n_x)
                        Waa -- weight matrix multiplying the hidden state, shape (n_a, n_a)
                        Wya -- weight matrix from hidden state to output, shape (n_y, n_a)
                        b -- hidden bias, shape (n_a, 1)
                        by -- output bias, shape (n_y, 1)
                  The arrays are updated in place.
    learning_rate -- learning rate for the model

    Returns:
    loss -- value of the loss function (cross-entropy)
    gradients -- python dictionary containing the clipped dWax, dWaa, dWya, db, dby
    a[len(X)-1] -- the last hidden state, of shape (n_a, 1)
    """
    # Forward propagate through time
    loss, cache = _char_rnn_forward(X, Y, a_prev, parameters)

    # Backpropagate through time
    gradients, a = _char_rnn_backward(X, Y, parameters, cache)

    # Clip the gradients between -5 (min) and 5 (max)
    gradients = clip(gradients, 5)

    # Update parameters
    _sgd_update(parameters, gradients, learning_rate)

    return loss, gradients, a[len(X)-1]

In [None]:
# Testing Optimize
np.random.seed(1)
vocab_size, n_a = 27, 100
# Draw order (a_prev, Wax, Waa, Wya, b, by) is kept so the seeded values are unchanged.
a_prev = np.random.randn(n_a, 1)
Wax = np.random.randn(n_a, vocab_size)
Waa = np.random.randn(n_a, n_a)
Wya = np.random.randn(vocab_size, n_a)
b = np.random.randn(n_a, 1)
by = np.random.randn(vocab_size, 1)
parameters = dict(Wax=Wax, Waa=Waa, Wya=Wya, b=b, by=by)
X = [12,3,5,11,22,3]
Y = [4,14,11,22,25, 26]

loss, gradients, a_last = optimize(X, Y, a_prev, parameters, learning_rate = 0.01)
print("Loss =", loss)
print("gradients[\"dWaa\"][1][2] =", gradients["dWaa"][1][2])
print("np.argmax(gradients[\"dWax\"]) =", np.argmax(gradients["dWax"]))
print("gradients[\"dWya\"][1][2] =", gradients["dWya"][1][2])
print("gradients[\"db\"][4] =", gradients["db"][4])
print("gradients[\"dby\"][1] =", gradients["dby"][1])
print("a_last[4] =", a_last[4])

In [None]:
# Trains the model and generates dinosaur names

def _initialize_parameters(n_a, n_x, n_y):
    """Small random weights and zero biases, seeded for reproducibility."""
    np.random.seed(1)
    return {
        "Wax": np.random.randn(n_a, n_x) * 0.01,  # input to hidden
        "Waa": np.random.randn(n_a, n_a) * 0.01,  # hidden to hidden
        "Wya": np.random.randn(n_y, n_a) * 0.01,  # hidden to output
        "b": np.zeros((n_a, 1)),                  # hidden bias
        "by": np.zeros((n_y, 1)),                 # output bias
    }


def _get_initial_loss(vocab_size, seq_length):
    """Loss of a uniform guess over the vocabulary, scaled by seq_length."""
    return -np.log(1.0 / vocab_size) * seq_length


def _smooth(loss, cur_loss):
    """Exponential moving average used to keep the reported loss smooth."""
    return loss * 0.999 + cur_loss * 0.001


def _print_sample(sample_ix, ix_to_char):
    """Print the characters for sample_ix with the first one upper-cased."""
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    txt = txt[0].upper() + txt[1:]
    print('%s' % (txt, ), end='')


def model(data, ix_to_char, char_to_ix, num_iterations = 35000, n_a = 50, dino_names = 7, vocab_size = 27):
    """Train the character-level RNN and periodically print sampled dinosaur names.

    The previous body ignored `data` and read "dinos.txt", a file this notebook never
    creates, and called four helpers that were not defined anywhere in the notebook.
    Training examples are now taken from `data`, and the helpers are defined above.

    Arguments:
    data -- text corpus, one name per line
    ix_to_char -- dictionary that maps the index to a character
    char_to_ix -- dictionary that maps a character to an index (must contain "\\n")
    num_iterations -- number of iterations to train the model for
    n_a -- number of units of the RNN cell
    dino_names -- number of dinosaur names to sample at each reporting step
    vocab_size -- number of unique characters in the text, size of the vocabulary

    Returns:
    parameters -- the learned parameters (Wax, Waa, Wya, b, by)

    Raises:
    ValueError -- if data contains no non-empty line
    """
    n_x, n_y = vocab_size, vocab_size

    parameters = _initialize_parameters(n_a, n_x, n_y)

    # Initialize loss (this is required because we want to smooth our loss)
    loss = _get_initial_loss(vocab_size, dino_names)

    # Build the list of training examples from the corpus, dropping blank lines.
    examples = [name.strip() for name in data.lower().split("\n")]
    examples = [name for name in examples if name]
    if not examples:
        raise ValueError("data contains no names to train on")

    # Shuffle list of all dinosaur names
    np.random.seed(0)
    np.random.shuffle(examples)

    # Initialize the hidden state of the RNN
    a_prev = np.zeros((n_a, 1))

    # Optimization loop
    for j in range(num_iterations):

        # One training example: X starts with None (zero input vector), Y is X shifted
        # left by one with a newline appended as the final target.
        index = j % len(examples)
        X = [None] + [char_to_ix[ch] for ch in examples[index]]
        Y = X[1:] + [char_to_ix["\n"]]

        # Forward-prop -> Backward-prop -> Clip -> Update parameters (learning rate 0.01)
        curr_loss, gradients, a_prev = optimize(X, Y, a_prev, parameters)

        # Keep the reported loss smooth.
        loss = _smooth(loss, curr_loss)

        # Every 2000 iterations, sample names to check that the model is learning.
        if j % 2000 == 0:

            print('Iteration: %d, Loss: %f' % (j, loss) + '\n')

            seed = 0
            for name in range(dino_names):

                # Sample indices and print them
                sampled_indices = sample(parameters, char_to_ix, seed)
                _print_sample(sampled_indices, ix_to_char)

                seed += 1  # a different seed per name, reproducible across runs

            print('\n')

    return parameters

In [None]:
# Still working on the code --V
# Pass the real vocabulary size instead of relying on model()'s default of 27 (an earlier
# test cell also rebinds the global vocab_size to 27), so the weight shapes always match
# the characters present in char_to_ix.
parameters = model(data, ix_to_char, char_to_ix, vocab_size=len(char_to_ix))

In [None]:
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Model, load_model, Sequential
from keras.layers import Dense, Activation, Dropout, Input, Masking
from keras.layers import LSTM
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
from shakespeare_utils import *
import sys
import io

In [None]:

# Wrap on_epoch_end in a Keras LambdaCallback so it is invoked at the end of each epoch.
# NOTE(review): on_epoch_end is not defined in this notebook; presumably it is provided
# by `from shakespeare_utils import *` — confirm.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# NOTE(review): earlier in this notebook `model` is defined as a plain function (which has
# no .fit) and `x` / `y` are bound to random test arrays. This call only works if the star
# import rebinds model, x and y — verify against shakespeare_utils.
model.fit(x, y, batch_size=128, epochs=1, callbacks=[print_callback])

In [None]:
# NOTE(review): generate_output is not defined in this notebook; presumably it is
# provided by `from shakespeare_utils import *` — confirm.
generate_output()