### Recurrent Neural Network (RNN) from Scratch
Following the [DeepLearning.AI course on Sequence models](https://www.coursera.org/learn/nlp-sequence-models).

At each time-step, the RNN tries to predict the next character given the previous ones.

- $X = x_1, ... x_t$
- $Y = x_2, ... x_{t+1}$

where $x_i$ is an input character from the training set.

In [79]:
import numpy as np

data = open('dinos.txt', 'r').read()
data = data.lower()
chars = sorted(list(set(data)))
vocab_size = len(chars)

print("vocab_size", vocab_size)
print(chars)

vocab_size 27
['\n', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [80]:
char2idx = {ch:i for i, ch in enumerate(chars)}
idx2char = {i:ch for i, ch in enumerate(chars)}
print(char2idx)

{'\n': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}


We must clip the gradients to prevent the [Exploding Gradient problem ☠️](https://machinelearningmastery.com/exploding-gradients-in-neural-networks/).

In [81]:
def clip(gradients, maxValue):
    dWaa, dWax, dWya, db, dby = gradients['dWaa'], gradients['dWax'], gradients['dWya'], gradients['dba'], gradients['dby']

    for gradient in [dWax, dWaa, dWya, db, dby]:
        np.clip(gradient, -maxValue, maxValue, gradient)

    gradients = {"dWaa": dWaa, "dWax": dWax, "dWya": dWya, "dba": db, "dby": dby}

    return gradients

To generate the next character at time-step $t+1$ we could sample with randomness (to avoid always getting the same character) from the probability distribution obtained through softmax at $Y_t$. The domain of this distributios is the vocabulary, as each character has its own probability of being the next.

In [82]:
def softmax(x):
	e_x = np.exp(x - np.max(x))
	return e_x / e_x.sum(axis=0)

def sample(parameters, char_to_ix, seed):
	Waa, Wax, Wya, by, ba = parameters['Waa'], parameters['Wax'], parameters['Wya'], parameters['by'], parameters['ba']
	vocab_size = by.shape[0]
	n_a = Waa.shape[1]

	x = np.zeros((vocab_size, 1)) # one-hot vector for the first character
	a_prev = np.zeros((n_a, 1)) # previous hidden state

	indices = []
	idx = -1

	counter = 0 # stop at 50 characters
	newline_character = char_to_ix['\n']

	while (idx != newline_character and counter != 50):
		# Forward propagate x
		a = np.tanh(np.dot(Wax, x) + np.dot(Waa, a_prev) + ba)
		z = np.dot(Wya, a) + by
		y = softmax(z)

		# Sample the index of a character within the vocabulary from the probability distribution y
		idx = np.random.choice(vocab_size, p = y.ravel())
		indices.append(idx)

		# Overwrite the input character as the one corresponding to the sampled index.
		x = np.zeros((vocab_size, 1))
		x[idx] = 1

		# Update the hidden state
		a_prev = a

		counter += 1
	if (counter == 50):
		indices.append(char_to_ix['\n'])

	return indices

In [83]:
def rnn_step_forward(xt, a_prev, parameters):
	Wax = parameters["Wax"]
	Waa = parameters["Waa"] 
	Wya = parameters["Wya"] 
	ba = parameters["ba"]  
	by = parameters["by"]   

	a_next = np.tanh(np.dot(Waa, a_prev) + np.dot(Wax, xt) + ba) #(n_a, 1)
	yt_pred = softmax(np.dot(Wya, a_next) + by) #(n_y,1)

	return a_next, yt_pred

def rnn_forward(X, Y, a0, parameters, vocab_size=27):
    x, a, y_hat = {}, {}, {}
    a[-1] = np.copy(a0)
    loss = 0
    
    for t in range(len(X)):
        # if X[t] == None, we just have x[t]=0. This is used to set the input for the first timestep to the zero vector.
        x[t] = np.zeros((vocab_size, 1))
        if (X[t] != None):
            x[t][X[t]] = 1
        # Run one step forward of the RNN
        a[t], y_hat[t] = rnn_step_forward(x[t], a[t - 1], parameters) #a[t]: (n_a,1), y_hat[t]:(n_y,1)
        # Update the loss by substracting the cross-entropy term of this time-step from it.
        loss -= np.log(y_hat[t][Y[t], 0])

    cache = (y_hat, a, x)

    return loss, cache

def rnn_step_backward(dy, gradients, parameters, x, a, a_prev):
    gradients['dWya'] += np.dot(dy, a.T)
    gradients['dby'] += dy
    Wya = parameters['Wya']
    Waa = parameters['Waa']
    da = np.dot(Wya.T, dy) + gradients['da_next'] 
    dtanh = (1 - a * a) * da  # backprop through tanh nonlinearity
    gradients['dba'] += dtanh
    gradients['dWax'] += np.dot(dtanh, x.T)
    gradients['dWaa'] += np.dot(dtanh, a_prev.T)
    gradients['da_next'] = np.dot(Waa.T, dtanh)

    return gradients
    
def rnn_backward(X, Y, parameters, cache):
	# Initialize gradients as an empty dictionary
	gradients = {}

	# Retrieve from cache and parameters
	(y_hat, a, x) = cache
	Waa, Wax, Wya, by, ba = parameters['Waa'], parameters['Wax'], parameters['Wya'], parameters['by'], parameters['ba']

	# each one should be initialized to zeros of the same dimension as its corresponding parameter
	gradients['dWax'], gradients['dWaa'], gradients['dWya'] = np.zeros_like(Wax), np.zeros_like(Waa), np.zeros_like(Wya)
	gradients['dba'], gradients['dby'] = np.zeros_like(ba), np.zeros_like(by)
	gradients['da_next'] = np.zeros_like(a[0])

	# Backpropagate through time
	for t in reversed(range(len(X))):
		dy = np.copy(y_hat[t])
		dy[Y[t]] -= 1 
		gradients = rnn_step_backward(dy, gradients, parameters, x[t], a[t], a[t - 1])

	return gradients, a

def update_parameters(parameters, gradients, lr):
	parameters['Wax'] += -lr * gradients['dWax']
	parameters['Waa'] += -lr * gradients['dWaa']
	parameters['Wya'] += -lr * gradients['dWya']
	parameters['ba']  += -lr * gradients['dba']
	parameters['by']  += -lr * gradients['dby']
	return parameters

Let be optimized, with [Gradient Descent](https://en.wikipedia.org/wiki/Gradient_descent) as our optimization algorithm 😔🙏.

In [84]:
def optimize(X, Y, a_prev, parameters, learning_rate = 0.01):
    loss, cache = rnn_forward(X, Y, a_prev, parameters)
    
    gradients, a = rnn_backward(X, Y, parameters, cache)
    
    gradients = clip(gradients, 5)
    
    parameters = update_parameters(parameters, gradients, learning_rate)
        
    return loss, gradients, a[len(X)-1]

In [85]:
def initialize_parameters(n_a, n_x, n_y):
	np.random.seed(1)
	Wax = np.random.randn(n_a, n_x) * 0.01  # input to hidden
	Waa = np.random.randn(n_a, n_a) * 0.01  # hidden to hidden
	Wya = np.random.randn(n_y, n_a) * 0.01  # hidden to output
	ba = np.zeros((n_a, 1))  # hidden bias
	by = np.zeros((n_y, 1))  # output bias
	parameters = {"Wax": Wax, "Waa": Waa, "Wya": Wya, "ba": ba, "by": by}

	return parameters

def get_initial_loss(vocab_size, seq_length):
	return -np.log(1.0/vocab_size)*seq_length

def smooth(loss, cur_loss):
	return loss * 0.999 + cur_loss * 0.001

def get_sample(sample_ix, ix_to_char):
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    txt = txt[0].upper() + txt[1:]  # capitalize first character 
    return txt

def model(data_x, ix_to_char, char_to_ix, num_iterations = 35000, n_a = 50, dino_names = 7, vocab_size = 27, verbose = False):
    n_x, n_y = vocab_size, vocab_size
    
    parameters = initialize_parameters(n_a, n_x, n_y)
    
    loss = get_initial_loss(vocab_size, dino_names)
    
    examples = [x.strip() for x in data_x]
    
    np.random.seed(0)
    np.random.shuffle(examples)
    
    a_prev = np.zeros((n_a, 1))
    
    last_dino_name = "abc"
    
    for j in range(num_iterations):
        
        idx = j % len(examples)
        
        single_example = examples[idx]
        single_example_chars = [char for char in single_example]
        single_example_ix = [char_to_ix[char] for char in single_example]
        X = [None] + single_example_ix
        
        ix_newline = char_to_ix['\n']
        Y = X[1:] + [ix_newline]

        curr_loss, gradients, a_prev = optimize(X, Y, a_prev, parameters, learning_rate=0.01)
        
        if verbose and j in [0, len(examples) -1, len(examples)]:
            print("j = " , j, "idx = ", idx,) 
        if verbose and j in [0]:
            print("single_example =", single_example)
            print("single_example_chars", single_example_chars)
            print("single_example_ix", single_example_ix)
            print(" X = ", X, "\n", "Y =       ", Y, "\n")
        
        loss = smooth(loss, curr_loss)

        if j % 2000 == 0:
            
            print('Iteration: %d, Loss: %f' % (j, loss) + '\n')
            
            #  The number of dinosaur names to print
            seed = 0
            for name in range(dino_names):
                
                # Sample indices and print them
                sampled_indices = sample(parameters, char_to_ix, seed)
                last_dino_name = get_sample(sampled_indices, ix_to_char)
                print(last_dino_name.replace('\n', ''))
                
                seed += 1  # To get the same result (for grading purposes), increment the seed by one. 
      
            print('\n')
        
    return parameters, last_dino_name

In [86]:
parameters, last_name = model(data.split("\n"), idx2char, char2idx, 22001, verbose = True)

j =  0 idx =  0
single_example = turiasaurus
single_example_chars ['t', 'u', 'r', 'i', 'a', 's', 'a', 'u', 'r', 'u', 's']
single_example_ix [20, 21, 18, 9, 1, 19, 1, 21, 18, 21, 19]
 X =  [None, 20, 21, 18, 9, 1, 19, 1, 21, 18, 21, 19] 
 Y =        [20, 21, 18, 9, 1, 19, 1, 21, 18, 21, 19, 0] 

Iteration: 0, Loss: 23.087336

Nkzxwtdmfqoeyhsqwasjkjvu
Kneb
Kzxwtdmfqoeyhsqwasjkjvu
Neb
Zxwtdmfqoeyhsqwasjkjvu
Eb
Xwtdmfqoeyhsqwasjkjvu




j =  1535 idx =  1535
j =  1536 idx =  0
Iteration: 2000, Loss: 27.884160

Liusskeomnolxeros
Hmdaairus
Hytroligoraurus
Lecalosapaus
Xusicikoraurus
Abalpsamantisaurus
Tpraneronxeros


Iteration: 4000, Loss: 25.901815

Mivrosaurus
Inee
Ivtroplisaurus
Mbaaisaurus
Wusichisaurus
Cabaselachus
Toraperlethosdarenitochusthiamamumamaon


Iteration: 6000, Loss: 24.608779

Onwusceomosaurus
Lieeaerosaurus
Lxussaurus
Oma
Xusteonosaurus
Eeahosaurus
Toreonosaurus


Iteration: 8000, Loss: 24.070350

Onxusichepriuon
Kilabersaurus
Lutrodon
Omaaerosaurus
Xutrcheps
Edaksoje
Trodiktonus


Iteration: 10000, Loss: 23.844446

Onyusaurus
Klecalosaurus
Lustodon
Ola
Xusodonia
Eeaeosaurus
Troceosaurus


Iteration: 12000, Loss: 23.291971

Onyxosaurus
Kica
Lustrepiosaurus
Olaagrraiansaurus
Yuspangosaurus
Eealosaurus
Trognesaurus


Iteration: 14000, Loss: 23.382338

Meutromodromurus
Inda
Iutroinatorsaurus
Maca
Yusteratoptititan
Ca
Troclosaurus


Iteration: 16000, Loss: 23.265759

Meustoloplohus
Imeda
Iutosaurus
Maca
