In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import time
sns.set()

In [2]:
def get_vocab(file, lower = False):
    with open(file, 'r') as fopen:
        data = fopen.read()
    if lower:
        data = data.lower()
    vocab = list(set(data))
    return data, vocab

def embed_to_onehot(data, vocab):
    onehot = np.zeros((len(data), len(vocab)), dtype = np.float32)
    for i in range(len(data)):
        onehot[i, vocab.index(data[i])] = 1.0
    return onehot

In [3]:
text, text_vocab = get_vocab('consumer.h', lower = False)
onehot = embed_to_onehot(text, text_vocab)

In [4]:
learning_rate = 0.001
batch_size = 64
sequence_length = 12
epoch = 1000
num_layers = 2
size_layer = 128
possible_batch_id = range(len(text) - sequence_length - 1)
dimension = onehot.shape[1]

U = np.random.randn(size_layer, dimension) / np.sqrt(size_layer)
Wz = np.random.randn(size_layer, size_layer) / np.sqrt(size_layer)
Wr = np.random.randn(size_layer, size_layer) / np.sqrt(size_layer)
Wh = np.random.randn(size_layer, size_layer) / np.sqrt(size_layer)
V = np.random.randn(dimension, size_layer) / np.sqrt(dimension)

In [5]:
def tanh(x, grad=False):
    if grad:
        output = np.tanh(x)
        return (1.0 - np.square(output))
    else:
        return np.tanh(x)
    
def sigmoid(x, grad=False):
    if grad:
        return sigmoid(x) * (1 - sigmoid(x))
    else:
        return 1 / (1 + np.exp(-x))
    
def softmax(x):
    exp_scores = np.exp(x - np.max(x))
    return exp_scores / (np.sum(exp_scores, axis=1, keepdims=True) + 1e-8)

def derivative_softmax_cross_entropy(x, y):
    delta = softmax(x)
    delta[range(X.shape[0]), y] -= 1
    return delta

def forward_multiply_gate(w, x):
    return np.dot(w, x)

def backward_multiply_gate(w, x, dz):
    dW = np.dot(dz.T, x)
    dx = np.dot(w.T, dz.T)
    return dW, dx

def forward_add_gate(x1, x2):
    return x1 + x2

def backward_add_gate(x1, x2, dz):
    dx1 = dz * np.ones_like(x1)
    dx2 = dz * np.ones_like(x2)
    return dx1, dx2

def cross_entropy(Y_hat, Y, epsilon=1e-12):
    Y_hat = np.clip(Y_hat, epsilon, 1. - epsilon)
    N = Y_hat.shape[0]
    return -np.sum(np.sum(Y * np.log(Y_hat+1e-9))) / N

def forward_recurrent(x, h_state, U, Wz, Wr, Wh, V):
    mul_u = forward_multiply_gate(x, U.T)
    mul_Wz = forward_multiply_gate(h_state, Wz.T)
    add_Wz = forward_add_gate(mul_u, mul_Wz)
    z = sigmoid(add_Wz)
    mul_Wr = forward_multiply_gate(h_state, Wr.T)
    add_Wr = forward_add_gate(mul_u, mul_Wr)
    r = sigmoid(add_Wr)
    mul_Wh = forward_multiply_gate(h_state * r, Wh.T)
    add_Wh = forward_add_gate(mul_u, mul_Wh)
    h_hat = tanh(add_Wh)
    h = (1 - z) * h_state + z * h_hat
    mul_v = forward_multiply_gate(h, V.T)
    return (mul_u, mul_Wz, add_Wz, z, mul_Wr, add_Wr, r, mul_Wh, add_Wh, h_hat, h, mul_v)

def backward_recurrent(x, h_state, U, Wz, Wr, Wh, V, d_mul_v, saved_graph):
    mul_u, mul_Wz, add_Wz, z, mul_Wr, add_Wr, r, mul_Wh, add_Wh, h_hat, h, mul_v = saved_graph
    dV, dh = backward_multiply_gate(V, h, d_mul_v)
    dh_hat = z * dh.T
    dadd_Wh = tanh(add_Wh, True) * dh_hat
    dmul_u1, dmul_Wh = backward_add_gate(mul_u, mul_Wh, dadd_Wh)
    dWh, dprev_state = backward_multiply_gate(Wh, h_state * r, dmul_Wh)
    dr = dprev_state * h_state.T
    dadd_Wr = sigmoid(add_Wr, True) * dr.T
    dmul_u2, dmul_Wr = backward_add_gate(mul_u, mul_Wr, dadd_Wr)
    dWr, dprev_state = backward_multiply_gate(Wr, h_state, dmul_Wr)
    dz = -h_state + h_hat
    dadd_Wz = sigmoid(add_Wz, True) * dz
    dmul_u3, dmul_Wz = backward_add_gate(mul_u, mul_Wz, dadd_Wz)
    dWz, dprev_state = backward_multiply_gate(Wz, h_state, dmul_Wz)
    dU, dx = backward_multiply_gate(U, x, dmul_u3)
    return (dU, dWz, dWr, dWh, dV)

In [6]:
for i in range(epoch):
    batch_x = np.zeros((batch_size, sequence_length, dimension))
    batch_y = np.zeros((batch_size, sequence_length, dimension))
    batch_id = random.sample(possible_batch_id, batch_size)
    prev_h = np.zeros((batch_size, size_layer))
    for n in range(sequence_length):
        id1 = [k + n for k in batch_id]
        id2 = [k + n + 1 for k in batch_id]
        batch_x[:,n,:] = onehot[id1, :]
        batch_y[:,n,:] = onehot[id2, :]
    layers = []
    out_logits = np.zeros((batch_size, sequence_length, dimension))
    for n in range(sequence_length):
        layers.append(forward_recurrent(batch_x[:,n,:], prev_h, U, Wz, Wr, Wh, V))
        prev_h = layers[-1][-2]
        out_logits[:, n, :] = layers[-1][-1]
    probs = softmax(out_logits.reshape((-1, dimension)))
    y = np.argmax(batch_y.reshape((-1, dimension)),axis=1)
    accuracy = np.mean(np.argmax(probs,axis=1) == y)
    loss = cross_entropy(probs, batch_y.reshape((-1, dimension)))
    delta = probs
    delta[range(y.shape[0]), y] -= 1
    delta = delta.reshape((batch_size, sequence_length, dimension))
    dU = np.zeros(U.shape)
    dV = np.zeros(V.shape)
    dWz = np.zeros(Wz.shape)
    dWr = np.zeros(Wr.shape)
    dWh = np.zeros(Wh.shape)
    prev_h = np.zeros((batch_size, size_layer))
    for n in range(sequence_length):
        d_mul_v = delta[:, n, :]
        dU_t, dWz_t, dWr_t, dWh_t, dV_t = backward_recurrent(batch_x[:,n,:], prev_h, 
                                                                    U, Wz, Wr, Wh, V, d_mul_v, layers[n])
        prev_h = layers[n][-2]
        dU += dU_t
        dV += dV_t
        dWz += dWz_t
        dWr += dWr_t
        dWh += dWh_t
    U -= learning_rate * dU
    V -= learning_rate * dV
    Wz -= learning_rate * dWz
    Wr -= learning_rate * dWr
    Wh -= learning_rate * dWh
    if (i+1) % 50 == 0:
        print('epoch %d, loss %f, accuracy %f'%(i+1, loss, accuracy))

epoch 50, loss 4.220839, accuracy 0.223958
epoch 100, loss 4.151175, accuracy 0.196615
epoch 150, loss 3.968643, accuracy 0.158854
epoch 200, loss 4.017142, accuracy 0.191406
epoch 250, loss 4.093218, accuracy 0.182292
epoch 300, loss 4.048298, accuracy 0.134115
epoch 350, loss 3.878078, accuracy 0.130208
epoch 400, loss 3.772771, accuracy 0.085938
epoch 450, loss 3.726045, accuracy 0.106771
epoch 500, loss 3.694129, accuracy 0.109375
epoch 550, loss 3.727348, accuracy 0.075521
epoch 600, loss 3.521524, accuracy 0.095052
epoch 650, loss 3.571376, accuracy 0.106771
epoch 700, loss 3.573480, accuracy 0.085938
epoch 750, loss 3.590403, accuracy 0.123698
epoch 800, loss 3.614294, accuracy 0.111979
epoch 850, loss 3.564591, accuracy 0.123698
epoch 900, loss 3.591091, accuracy 0.111979
epoch 950, loss 3.543047, accuracy 0.105469
epoch 1000, loss 3.542467, accuracy 0.102865
