# 1. 関数の定義

In [1]:
import numpy as np

def softmax(x):
    """
    Compute a numerically stable softmax along axis 0.

    Every column of `x` (all entries sharing the same trailing indices) is
    normalised independently, so the result sums to 1 along axis 0.

    Arguments:
    x -- numpy array of scores, e.g. of shape (n_y,), (n_y, 1) or (n_y, m)

    Returns:
    numpy array with the same shape as x containing the softmax probabilities
    """
    # Shift by the maximum of each column, not the global maximum: with a
    # global shift, a column whose scores are all far below the global maximum
    # underflows to exp(...) == 0 everywhere and the division yields 0/0 = NaN.
    # For single-column input both shifts are the same value.
    shifted = x - np.max(x, axis=0, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=0)

def smooth(loss, cur_loss):
    """
    Fold the newest loss into the running loss as an exponential moving average.

    Arguments:
    loss -- running (smoothed) loss so far
    cur_loss -- loss measured on the latest step

    Returns:
    the updated running loss: 99.9% of the old value plus 0.1% of the new one
    """
    keep_weight, new_weight = 0.999, 0.001
    carried_over = loss * keep_weight
    contribution = cur_loss * new_weight
    return carried_over + contribution

def print_sample(sample_ix, ix_to_char):
    """
    Print the text encoded by a sequence of character indices.

    The first character is upper-cased and no trailing newline is added by
    print itself (the sampled text is expected to carry its own).

    Arguments:
    sample_ix -- iterable of character indices
    ix_to_char -- mapping from index to character
    """
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    # Slice instead of indexing txt[0] so an empty sample prints nothing
    # rather than raising IndexError.
    txt = txt[:1].upper() + txt[1:]
    print(txt, end='')

def get_initial_loss(vocab_size, seq_length):
    """
    Starting value for the smoothed loss.

    This is the cross-entropy of predicting every character uniformly at
    random, i.e. -log(1 / vocab_size), multiplied by seq_length.

    Arguments:
    vocab_size -- number of distinct characters
    seq_length -- multiplier applied to the per-character loss

    Returns:
    the initial loss value
    """
    uniform_probability = 1.0 / vocab_size
    per_character_loss = -np.log(uniform_probability)
    return per_character_loss * seq_length



# 2. RNNの定義

In [2]:
class RNN:
    """
    Vanilla tanh recurrent network with a softmax output layer, trained with
    back-propagation through time and plain gradient descent.

    Shape conventions used by the methods below:
        x      -- inputs, (n_x, m, T_x)
        a      -- hidden states, (n_a, m) per time step
        y / y_hat -- labels / predictions, (n_y, m, T_x)
    where m is the batch size and T_x the number of time steps.
    """

    # 1st method
    def __init__(self, epochs=20, n_a=16, alpha=0.01, batch_size=32):
        """
        Store the hyper-parameters; weights are created by initialize_parameters().

        Arguments:
        epochs -- stored on the instance (not read by any method of this class)
        n_a -- number of hidden units
        alpha -- learning rate used by update_parameters()
        batch_size -- nominal batch size, stored as self.m
        """
        self.epochs = epochs
        self.n_a = n_a
        self.alpha = alpha
        self.parameters = {}
        self.loss = 0.0
        # Placeholder sizes; overwritten by initialize_parameters().
        self.n_x = 2
        self.n_y = 2
        self.m = batch_size

    # 2nd method
    def initialize_parameters(self, n_a, n_x, n_y):
        """
        Initialize the weights with small random values and the biases with zeros.

        The global NumPy random seed is set to 1 so the initialization is reproducible.

        Arguments:
        n_a -- number of hidden units
        n_x -- input size
        n_y -- output size

        Returns:
        parameters -- python dictionary (also stored as self.parameters) containing:
            Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
            Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
            Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
            ba -- Bias, numpy array of shape (n_a, 1)
            by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
        """
        np.random.seed(1)
        Wax = np.random.randn(n_a, n_x) * 0.01  # input to hidden
        Waa = np.random.randn(n_a, n_a) * 0.01  # hidden to hidden
        Wya = np.random.randn(n_y, n_a) * 0.01  # hidden to output
        ba = np.zeros((n_a, 1))  # hidden bias
        by = np.zeros((n_y, 1))  # output bias
        self.parameters = {"Wax": Wax, "Waa": Waa, "Wya": Wya, "ba": ba, "by": by}
        # Keep the recorded sizes in sync with the weights that were just
        # created (n_a may differ from the value given to the constructor).
        self.n_a = n_a
        self.n_x = n_x
        self.n_y = n_y
        # Returned so callers can write `parameters = rnn.initialize_parameters(...)`.
        return self.parameters

    # 3rd method
    def rnn_cell_forward(self, xt, a_prev):
        """
        Run a single forward step of the RNN cell.

        Arguments:
        xt -- input data at timestep "t", numpy array of shape (n_x, m)
        a_prev -- hidden state at timestep "t-1", numpy array of shape (n_a, m)

        Returns:
        a_next -- next hidden state, of shape (n_a, m)
        yt_pred -- prediction at timestep "t", numpy array of shape (n_y, m)
        cache -- tuple of values needed for the backward pass: (a_next, a_prev, xt)
        """
        Wax = self.parameters["Wax"]
        Waa = self.parameters["Waa"]
        Wya = self.parameters["Wya"]
        ba = self.parameters["ba"]
        by = self.parameters["by"]

        # a<t> = tanh(Waa a<t-1> + Wax x<t> + ba)
        a_next = np.tanh(np.dot(Waa, a_prev) + np.dot(Wax, xt) + ba)
        # y<t> = softmax(Wya a<t> + by)
        yt_pred = softmax(np.dot(Wya, a_next) + by)

        cache = (a_next, a_prev, xt)
        return a_next, yt_pred, cache

    # 4th method
    def rnn_forward(self, x, a_prev):
        """
        Run the forward pass over every time step of a sequence.

        Arguments:
        x -- input data for every time-step, of shape (n_x, m, T_x)
        a_prev -- initial hidden state, of shape (n_a, m)

        Returns:
        a -- hidden states for every time-step, numpy array of shape (n_a, m, T_x)
        y_pred -- predictions for every time-step, numpy array of shape (n_y, m, T_x)
        caches -- tuple needed for the backward pass: (list of per-step caches, x)
        """
        caches = []

        n_x, m, T_x = x.shape
        n_y, n_a = self.parameters["Wya"].shape

        a = np.zeros((n_a, m, T_x))
        y_pred = np.zeros((n_y, m, T_x))

        a_next = a_prev
        for t in range(T_x):
            a_next, yt_pred, cache = self.rnn_cell_forward(xt=x[:, :, t], a_prev=a_next)
            a[:, :, t] = a_next
            y_pred[:, :, t] = yt_pred
            caches.append(cache)

        caches = (caches, x)
        return a, y_pred, caches

    # 5th method
    def compute_loss(self, y_hat, y):
        """
        Add the cross-entropy of a sequence to self.loss and return the total.

        The loss of each time step is averaged over the batch (factor 1/m) and
        summed over time. The value is accumulated onto the current self.loss,
        so reset self.loss to 0 first when only this sequence's loss is wanted.

        Arguments:
        y_hat -- predicted probabilities, of shape (n_y, m, T_x)
        y -- one-hot labels, of shape (n_y, m, T_x)

        Returns:
        self.loss after adding this sequence's cross-entropy
        """
        n_y, m, T_x = y.shape
        for t in range(T_x):
            self.loss -= 1/m * np.sum(np.multiply(y[:, :, t], np.log(y_hat[:, :, t])))
        return self.loss

    # 6th method
    def rnn_cell_backward(self, dz, gradients, cache):
        """
        Run the backward pass for a single time step and accumulate the gradients.

        Arguments:
        dz -- gradient of the loss w.r.t. the pre-softmax output at this step, of shape (n_y, m)
        gradients -- python dictionary of running gradients; must already hold
                     'dWax', 'dWaa', 'dWya', 'dba', 'dby' and 'da_next'
                     (gradient flowing back from the following time step)
        cache -- tuple (a_next, a_prev, xt) produced by rnn_cell_forward()

        Returns:
        gradients -- the same dictionary, updated in place:
            dWya, dby, dWax, dWaa, dba -- accumulated (+=) with this step's contribution
            dxt -- gradient w.r.t. this step's input, of shape (n_x, m) (overwritten)
            da_next -- gradient w.r.t. the previous hidden state, of shape (n_a, m) (overwritten)
        """
        (a_next, a_prev, xt) = cache

        Wax = self.parameters["Wax"]
        Waa = self.parameters["Waa"]
        Wya = self.parameters["Wya"]

        # Output layer.
        gradients['dWya'] += np.dot(dz, a_next.T)
        gradients['dby'] += np.sum(dz, axis=1, keepdims=True)
        # The hidden state receives gradient from this step's output and from the future.
        da = np.dot(Wya.T, dz) + gradients['da_next']

        # Back through tanh: d tanh(u)/du = 1 - tanh(u)^2 = 1 - a_next^2.
        dtanh = np.multiply(da, 1 - np.square(a_next))

        gradients['dxt'] = np.dot(Wax.T, dtanh)
        gradients['dWax'] += np.dot(dtanh, xt.T)
        gradients['dWaa'] += np.dot(dtanh, a_prev.T)
        gradients['dba'] += np.sum(dtanh, axis=1, keepdims=True)

        # Gradient handed to the previous time step.
        gradients['da_next'] = np.dot(Waa.T, dtanh)

        return gradients

    # 7th method
    def rnn_backward(self, y, y_hat, caches):
        """
        Run the backward pass over an entire sequence (back-propagation through time).

        :param y: one-hot labels, shape (n_y, m, T_x)
        :param y_hat: softmax output of rnn_forward, shape (n_y, m, T_x)
        :param caches: tuple (list of per-step caches, x) returned by rnn_forward
        Returns:
        gradients -- python dictionary containing:
            dx -- Gradient w.r.t. the input data, numpy array of shape (n_x, m, T_x)
            da0 -- Gradient w.r.t. the initial hidden state, numpy array of shape (n_a, m)
            dWax -- Gradient w.r.t. the input's weight matrix, numpy array of shape (n_a, n_x)
            dWaa -- Gradient w.r.t. the hidden state's weight matrix, numpy array of shape (n_a, n_a)
            dba -- Gradient w.r.t. the bias, of shape (n_a, 1)
            dWya -- Gradient w.r.t. the output weight matrix, numpy array of shape (n_y, n_a)
            dby -- Gradient w.r.t. the output bias, of shape (n_y, 1)
            dxt, da_next -- values left over from the last processed step (t = 0);
                            da_next is the same array as da0
        """
        (step_caches, x) = caches
        n_x, m, T_x = x.shape
        # Take every size from the data and the weights actually in use rather
        # than from constructor attributes, so a batch size or hidden size that
        # differs from the constructor arguments cannot cause shape mismatches.
        n_a = self.parameters["Waa"].shape[0]
        n_y = self.parameters["Wya"].shape[0]

        gradients = {}
        dx = np.zeros((n_x, m, T_x))
        gradients['dWax'] = np.zeros((n_a, n_x))
        gradients['dWaa'] = np.zeros((n_a, n_a))
        gradients['dba'] = np.zeros((n_a, 1))
        gradients['da_next'] = np.zeros((n_a, m))
        gradients['dWya'] = np.zeros((n_y, n_a))
        gradients['dby'] = np.zeros((n_y, 1))

        # Softmax + cross-entropy gives dL/dz = y_hat - y per example; the 1/m
        # factor matches the batch averaging done in compute_loss().
        dz = (y_hat - y) / m

        for t in reversed(range(T_x)):
            gradients = self.rnn_cell_backward(dz=dz[:, :, t], gradients=gradients, cache=step_caches[t])
            dx[:, :, t] = gradients["dxt"]

        gradients['dx'] = dx
        gradients['da0'] = gradients['da_next']
        return gradients

    # 8th method
    def clip(self, gradients, maxValue=5):
        """
        Clip the parameter gradients to the range [-maxValue, maxValue].

        The arrays are clipped in place, which mitigates exploding gradients.

        Arguments:
        gradients -- a dictionary containing the gradients "dWaa", "dWax", "dWya", "dba", "dby"
        maxValue -- everything above this number is set to this number, and
                    everything less than -maxValue is set to -maxValue

        Returns:
        gradients -- a new dictionary holding only the five clipped gradients
        """
        names = ("dWaa", "dWax", "dWya", "dba", "dby")
        clipped = {name: gradients[name] for name in names}
        for gradient in clipped.values():
            np.clip(gradient, -maxValue, maxValue, out=gradient)
        return clipped

    # 9th method
    def update_parameters(self, gradients):
        """
        Apply one gradient-descent step to every parameter, in place.

        Arguments:
        gradients -- dictionary containing "dWax", "dWaa", "dWya", "dba", "dby"
        """
        for name in ("Wax", "Waa", "Wya", "ba", "by"):
            self.parameters[name] -= self.alpha * gradients["d" + name]

    # 10th method
    def optimize(self, X, Y, a_prev):
        """
        Execute one step of the optimization to train the model.

        Arguments:
        X -- one-hot inputs, of shape (n_x, m, T_x)
        Y -- one-hot labels, of shape (n_y, m, T_x)
        a_prev -- previous hidden state, of shape (n_a, m)

        Returns:
        loss -- cross-entropy of this step (also stored in self.loss)
        gradients -- python dictionary containing the clipped gradients:
            dWax -- Gradients of input-to-hidden weights, of shape (n_a, n_x)
            dWaa -- Gradients of hidden-to-hidden weights, of shape (n_a, n_a)
            dWya -- Gradients of hidden-to-output weights, of shape (n_y, n_a)
            dba -- Gradients of bias vector, of shape (n_a, 1)
            dby -- Gradients of output bias vector, of shape (n_y, 1)
        a[:, :, -1] -- the last hidden state, of shape (n_a, m)
        """
        # Forward pass.
        a, y_pred, caches = self.rnn_forward(X, a_prev)

        # compute_loss() accumulates onto self.loss; start from zero so the
        # value returned here is the loss of this step only.
        self.loss = 0.0
        loss = self.compute_loss(y_hat=y_pred, y=Y)

        # Backward pass and gradient clipping.
        gradients = self.rnn_backward(Y, y_pred, caches)
        gradients = self.clip(gradients=gradients, maxValue=5)

        # Gradient-descent update.
        self.update_parameters(gradients)

        return loss, gradients, a[:, :, -1]

In [3]:
def sample(parameters, char_to_ix, seed):
    """
    Sample a sequence of characters from the probability distributions output by the RNN.

    Generation starts from an all-zero input vector and an all-zero hidden
    state, and stops when the newline character is sampled or after 50
    characters, whichever comes first. The returned sequence always ends with
    exactly one newline index.

    Note: the global NumPy random seed is reset on every step.

    Arguments:
    parameters -- python dictionary containing the parameters Waa, Wax, Wya, by, and ba.
    char_to_ix -- python dictionary mapping each character to an index; must contain '\\n'.
    seed -- random seed

    Returns:
    indices -- a list containing the indices of the sampled characters.
    """
    Waa, Wax, Wya = parameters['Waa'], parameters['Wax'], parameters['Wya']
    by, ba = parameters['by'], parameters['ba']
    vocab_size = by.shape[0]
    n_a = Waa.shape[1]

    # Hard cap on the generated length; prevents an endless loop when the
    # model never emits a newline.
    max_chars = 50

    # Initial input (no character yet) and initial hidden state.
    x = np.zeros((vocab_size, 1))
    a_prev = np.zeros((n_a, 1))

    indices = []
    idx = -1  # last sampled index; -1 means nothing has been sampled yet
    counter = 0
    newline_character = char_to_ix['\n']

    while idx != newline_character and counter != max_chars:
        # Forward-propagate one step.
        a = np.tanh(np.dot(Wax, x) + np.dot(Waa, a_prev) + ba)
        z = np.dot(Wya, a) + by
        y = softmax(z)

        # Reseed on every step (kept from the original for reproducible output).
        np.random.seed(counter + seed)

        # Draw the next character index from the predicted distribution.
        idx = np.random.choice(range(vocab_size), p=y.ravel())
        indices.append(idx)

        # Feed the sampled character back in as the next one-hot input.
        x = np.zeros((vocab_size, 1))
        x[idx] = 1
        a_prev = a

        # Both values advance, so the effective seed grows by 2 per step.
        seed += 1
        counter += 1

    # Terminate with a newline only if the sequence does not already end with
    # one. Checking `counter == 50` instead would append a second newline
    # whenever the 50th sampled character happened to be the newline itself.
    if idx != newline_character:
        indices.append(newline_character)

    return indices

# 3.学習

In [10]:
# Training cell: fit the character-level RNN on the names in dinos.txt and,
# every 2000 iterations, print the smoothed loss followed by a few sampled names.
#
# Settings and derived values:
#   n_a        -- number of units of the RNN cell
#   iter_num   -- number of iterations to train the model for (one name each)
#   dino_names -- number of dinosaur names sampled at each report
#   vocab_size -- number of unique characters found in the text
#   ix_to_char -- dictionary that maps an index to a character
#   char_to_ix -- dictionary that maps a character to an index

# Read the corpus once; the context manager guarantees the file is closed.
with open('dinos.txt', 'r') as f:
    raw_lines = f.readlines()

data = ''.join(raw_lines).lower()
chars = list(set(data))
# vocab_size is taken from the data rather than hard-coded, so it always
# agrees with the index range of char_to_ix.
data_size, vocab_size = len(data), len(chars)

sorted_chars = sorted(chars)
char_to_ix = {ch: i for i, ch in enumerate(sorted_chars)}
ix_to_char = dict(enumerate(sorted_chars))

n_a = 50
iter_num = 60000
dino_names = 7
rnn = RNN(n_a=n_a, batch_size=1)
# Input and output are both one-hot vectors over the vocabulary.
n_x, n_y = vocab_size, vocab_size

# Initialize parameters; read them from the instance so `parameters` is the
# weight dictionary regardless of what initialize_parameters() returns.
rnn.initialize_parameters(n_a, n_x, n_y)
parameters = rnn.parameters

# Initialize the smoothed loss.
loss = get_initial_loss(vocab_size, dino_names)

# Build list of all dinosaur names (training examples).
examples = [line.lower().strip() for line in raw_lines]

# Shuffle list of all dinosaur names (fixed seed for reproducibility).
np.random.seed(0)
np.random.shuffle(examples)

# Initialize the hidden state of the rnn; it is carried over between examples.
a_prev = np.zeros((n_a, 1))

newline_ix = char_to_ix["\n"]

# Optimization loop
for j in range(iter_num):

    # One training example: the input is the name shifted right by one step
    # (None marks the empty first input), the target is the name followed by a newline.
    index = j % len(examples)
    x = [None] + [char_to_ix[ch] for ch in examples[index]]
    y = x[1:] + [newline_ix]
    X_batch = np.zeros((n_x, 1, len(x)))
    Y_batch = np.zeros((n_y, 1, len(x)))

    # Convert the index sequences into one-hot arrays; the first input column stays all zeros.
    for t, (ix_in, ix_out) in enumerate(zip(x, y)):
        if ix_in is not None:
            X_batch[ix_in, 0, t] = 1
        Y_batch[ix_out, 0, t] = 1

    # The loss attribute accumulates, so clear it before every step.
    rnn.loss = 0
    curr_loss, gradients, a_prev = rnn.optimize(X=X_batch, Y=Y_batch, a_prev=a_prev)

    # Keep an exponentially smoothed loss for reporting.
    loss = smooth(loss, curr_loss)

    # Every 2000 iterations, sample a few names to check that the model is learning.
    if j % 2000 == 0:
        print('Iteration: %d, Loss: %f' % (j, loss) + '\n')

        # A different seed per name, restarted at every report so reports are comparable.
        for seed in range(dino_names):
            sampled_indices = sample(rnn.parameters, char_to_ix, seed)
            print_sample(sampled_indices, ix_to_char)

        print('\n')


Iteration: 0, Loss: 23.087336

Nkzxwtdmfqoeyhsqwasjkjvu
Kneb
Kzxwtdmfqoeyhsqwasjkjvu
Neb
Zxwtdmfqoeyhsqwasjkjvu
Eb
Xwtdmfqoeyhsqwasjkjvu


Iteration: 2000, Loss: 27.884160

Liusskeomnolxeros
Hmdaairus
Hytroligoraurus
Lecalosapaus
Xusicikoraurus
Abalpsamantisaurus
Tpraneronxeros


Iteration: 4000, Loss: 25.901815

Mivrosaurus
Inee
Ivtroplisaurus
Mbaaisaurus
Wusichisaurus
Cabaselachus
Toraperlethosdarenitochusthiamamumamaon


Iteration: 6000, Loss: 24.608779

Onwusceomosaurus
Lieeaerosaurus
Lxussaurus
Oma
Xusteonosaurus
Eeahosaurus
Toreonosaurus


Iteration: 8000, Loss: 24.070350

Onxusichepriuon
Kilabersaurus
Lutrodon
Omaaerosaurus
Xutrcheps
Edaksoje
Trodiktonus


Iteration: 10000, Loss: 23.844446

Onyusaurus
Klecalosaurus
Lustodon
Ola
Xusodonia
Eeaeosaurus
Troceosaurus


Iteration: 12000, Loss: 23.291971

Onyxosaurus
Kica
Lustrepiosaurus
Olaagrraiansaurus
Yuspangosaurus
Eealosaurus
Trognesaurus


Iteration: 14000, Loss: 23.382339

Meutromodromurus
Inda
Iutroinatorsaurus
Maca
Yusteratop