# A theano GRU implementation

In [1]:
from theano.sandbox import cuda
# Request the device named 'gpu1' from Theano's sandbox CUDA backend.
# NOTE(review): the recorded output below reports "gpu device 0" — presumably the
# visible devices were renumbered on the machine that produced it; confirm.
cuda.use('gpu1')

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


In [2]:
%matplotlib inline
# NOTE(review): the star import below is presumably where every unqualified name used
# later in this notebook comes from (np, T, theano, nnet, shared, normal, math, chain,
# OrderedDict, get_file, to_categorical) — confirm against utils.py.
import utils; reload(utils)
from utils import *
from __future__ import division, print_function

Using Theano backend.


## Get data

In [3]:
# Fetch the book text through `get_file` and get back the path to it.
path = get_file('kafka.txt', origin="http://www.gutenberg.org/cache/epub/22367/pg22367.txt")
# Read the whole file in lower case; the context manager closes the handle
# instead of leaving it open until garbage collection.
with open(path) as corpus_file:
    text = corpus_file.read().lower()
# Drop a fixed number of characters from both ends.
# NOTE(review): presumably this strips the Project Gutenberg header and licence — confirm.
text = text[1200:-19500]
print('corpus length:', len(text))

corpus length: 125146


In [4]:
# Vocabulary: the NUL character at index 0, followed by every distinct
# character of the corpus in sorted order.
chars = ["\0"] + sorted(set(text))
vocab_size = len(chars)
print('total chars:', vocab_size)
# Lookup tables in both directions: character -> id and id -> character.
char_indices = {ch: idx for idx, ch in enumerate(chars)}
indices_char = dict(enumerate(chars))

total chars: 48


Translate every character of the corpus to its integer id.

In [5]:
# The corpus as a list of integer ids, one per character, in reading order.
id_set = [char_indices[ch] for ch in text]

Create the input and output arrays: each input is a run of consecutive characters, and its labels are the characters that follow each of them (the same run shifted by one).

In [6]:
# Size of the recurrent hidden state.
n_hidden = 256
# Inputs and outputs are both one-hot over the vocabulary, so they share its size.
n_input = n_output = vocab_size

In [7]:
# Number of characters (time steps) in each training sequence.
n_pred = 8

In [8]:
# Cut the corpus into non-overlapping windows of n_pred characters.
# c_in_dat[offset] holds, for every window, the character at position `offset`
# inside that window; c_label_dat[offset] holds the character right after it.
c_in_dat = [
    [id_set[start + offset] for start in xrange(0, len(id_set) - 1 - n_pred, n_pred)]
    for offset in range(n_pred)
]
c_label_dat = [
    [id_set[start + offset] for start in xrange(1, len(id_set) - n_pred, n_pred)]
    for offset in range(n_pred)
]

# Turn each per-position list into an array, dropping the last two windows.
xs = [np.stack(column[:-2]) for column in c_in_dat]
ys = [np.stack(column[:-2]) for column in c_label_dat]

# One-hot encode every position, then stack the positions along axis 1.
# NOTE(review): assuming to_categorical returns (n_windows, vocab_size), the results
# are (n_windows, n_pred, vocab_size) — confirm.
oh_inputs = [to_categorical(ids, vocab_size) for ids in xs]
oh_inputs_rnn = np.stack(oh_inputs, axis=1)

oh_labels = [to_categorical(ids, vocab_size) for ids in ys]
oh_labels_rnn = np.stack(oh_labels, axis=1)

## Weights

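`shared` wraps a NumPy array in a Theano shared variable: its value persists between function calls, can be changed through `updates`, and Theano can keep it on the GPU.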

In [9]:
def init_weight_gates(rows, cols):
    """Create a shared float32 weight matrix of shape (rows, cols).

    Entries are drawn with `normal` using scale sqrt(2 / rows).
    NOTE(review): `normal` is presumably numpy.random.normal, which would make
    the scale a standard deviation — confirm.
    """
    std = math.sqrt(2.0 / rows)
    weights = normal(scale=std, size=(rows, cols))
    return shared(weights.astype(np.float32))
def init_bias(rows):
    """Create a shared float32 bias vector of length `rows`, filled with zeros."""
    zeros = np.zeros(rows, dtype=np.float32)
    return shared(zeros)
def init_ids(n):
    """Create a shared float32 identity matrix of shape (n, n)."""
    identity = np.identity(n, dtype=np.float32)
    return shared(identity)
def wgts_and_bias(n_in, n_out):
    """Return a (weights, bias) pair: an (n_in, n_out) random matrix and a zero bias of length n_out."""
    weights = init_weight_gates(n_in, n_out)
    bias = init_bias(n_out)
    return weights, bias
def id_and_bias(n):
    """Return a (weights, bias) pair: an (n, n) identity matrix and a zero bias of length n."""
    weights = init_ids(n)
    bias = init_bias(n)
    return weights, bias

Inputs, weights and biases

In [10]:
# Symbolic placeholders; values are supplied when the compiled functions are called.
t_h0 = T.vector('h0')      # initial hidden state
t_inp = T.matrix('inp')    # one input sequence, scanned row by row
t_outp = T.matrix('outp')  # labels for that sequence
lr = T.scalar('lr')        # learning rate

# Argument order of the compiled training function.
all_args = [t_h0, t_inp, t_outp, lr]

In [11]:
# Update gate: input weights + bias, then hidden-to-hidden weights.
W_bias_update = wgts_and_bias(n_input, n_hidden)
U_update = init_weight_gates(n_hidden, n_hidden)

# Reset gate: same layout as the update gate.
W_bias_reset = wgts_and_bias(n_input, n_hidden)
U_reset = init_weight_gates(n_hidden, n_hidden)

# Readout layer (hidden -> vocabulary), then the weights of the new hidden state:
# random input weights and identity-initialised hidden-to-hidden weights.
W_y_bias_output = wgts_and_bias(n_hidden, n_output)
W_output = init_weight_gates(n_input, n_hidden)
U_bias_output = id_and_bias(n_hidden)

# Flat parameter list; its order must match the parameter order of `step`.
w_all = list(U_bias_output + W_y_bias_output + W_bias_update + W_bias_reset)
w_all += [W_output, U_update, U_reset]

## Gate

In [12]:
def gate(x, h, W_h, W_x, b_x):
    """Sigmoid gate: sigmoid(x . W_x + b_x + h . W_h).

    x is the current input and h the hidden state; W_x and W_h are their
    respective weight matrices and b_x is the bias.
    """
    pre_activation = T.dot(x, W_x) + b_x + T.dot(h, W_h)
    return nnet.sigmoid(pre_activation)

## Step

This is applied at every time step of the sequence; here we describe the full GRU cell.

In [13]:
def step(x, h, U_output, bias_output, W_y_output, bias_y_output, W_update, bias_update, W_reset, bias_reset, W_output, U_update, U_reset):
    """Advance the recurrent cell by one time step.

    x is the input at this step and h the previous hidden state; the remaining
    arguments are the weights, in the same order as `w_all`.
    Returns the new hidden state and the flattened softmax output for this step.
    """
    z = gate(x, h, U_update, W_update, bias_update)
    r = gate(x, h, U_reset, W_reset, bias_reset)
    # Candidate state, computed from the input and the reset-scaled previous state.
    # NOTE(review): this goes through the sigmoid `gate`; a textbook GRU uses tanh
    # for the candidate — confirm this is intended.
    candidate = gate(x, r * h, U_output, W_output, bias_output)
    # Blend: z keeps the old state, (1 - z) lets the candidate in.
    h_next = z * h + (1 - z) * candidate
    probs = nnet.softmax(T.dot(h_next, W_y_output) + bias_y_output)
    return h_next, T.flatten(probs, 1)

In [14]:
# Run `step` over the rows of t_inp. The hidden state is fed back from step to
# step starting at t_h0; the second output is not fed back (None). The weights
# are passed unchanged to every step.
[v_h, v_y], _ = theano.scan(
    fn=step,
    sequences=t_inp,
    non_sequences=w_all,
    outputs_info=[t_h0, None],
)

Calculating loss using cross entropy

In [15]:
# Total cross-entropy between the per-step predictions and the labels.
error = T.sum(nnet.categorical_crossentropy(v_y, t_outp))
# Symbolic gradient of the loss with respect to each weight, in w_all order.
g_all = T.grad(cost=error, wrt=w_all)

We need an update rule for Theano to apply on every call: plain SGD, which maps each weight `w` to its new value `w - lr * grad`.

In [16]:
def SGD_update(wgts, grads, lr):
    """Build the plain-SGD update mapping: each weight w -> w - g * lr.

    wgts and grads are paired up positionally; lr is the learning rate.
    Returns an OrderedDict whose entries follow the order of `wgts`.
    """
    # Feed ordered pairs to OrderedDict. Going through a dict literal first
    # loses the order on interpreters whose dicts are unordered.
    return OrderedDict((w, w - g * lr) for (w, g) in zip(wgts, grads))

# SGD updates for every weight, applied each time the training function runs.
upd = SGD_update(wgts=w_all, grads=g_all, lr=lr)

Compile time. `all_args` holds the initial hidden state, the input, the labels and the learning rate; `error` is the categorical cross-entropy that the function returns; `updates` is our SGD step; `allow_input_downcast` lets Theano cast the inputs down to the dtype it expects.

In [21]:
# Training function: returns the loss and applies the SGD updates on every call.
fn = theano.function(inputs=all_args, outputs=error, updates=upd,
                     allow_input_downcast=True)
# Prediction function: returns the per-step output distribution, no updates.
f_y = theano.function(inputs=[t_h0, t_inp], outputs=v_y,
                      allow_input_downcast=True)

In [18]:
def train(fn, input_x, labels, epochs, l_rate = 0.01):
    """Run `epochs` passes of per-sequence training.

    fn is called as fn(h0, x, y, l_rate) for every (input_x[i], labels[i]) pair
    and must return the loss for that sequence. Every sequence starts from a
    zero hidden state of length n_hidden. The mean loss of each block of 1000
    sequences is printed; a trailing partial block is not reported.
    """
    report_every = 1000
    # `epochs` used to be ignored, so exactly one pass ran whatever was requested.
    for _ in range(epochs):
        err = 0.0
        for i in range(len(input_x)):
            err += fn(np.zeros(n_hidden), input_x[i], labels[i], l_rate)
            if i % report_every == report_every - 1:
                print("Error:{:.3f}".format(err/report_every))
                err = 0.0

In [30]:
def predict(f_y, input_vec):
    """Decode a one-hot input sequence and the model's prediction for it.

    f_y is called as f_y(h0, input_vec) with a zero hidden state of length
    n_hidden. Returns a pair of strings: the characters encoded by the rows of
    `input_vec`, and the highest-scoring character of each row f_y returns.
    """
    def decode(rows):
        # Highest-scoring column per row, mapped back to its character.
        return ''.join(indices_char[idx] for idx in np.argmax(rows, axis=1))

    predicted = decode(f_y(np.zeros(n_hidden), input_vec))
    return decode(input_vec), predicted

In [19]:
# One pass over all training sequences with the default learning rate.
train(fn, oh_inputs_rnn, oh_labels_rnn, epochs=1)

Error:25.736
Error:24.169
Error:23.066
Error:21.898
Error:21.230
Error:20.480
Error:19.739
Error:19.270
Error:19.045
Error:19.190
Error:18.788
Error:18.557
Error:18.302
Error:18.160
Error:18.257


In [31]:
# Spot check on one training sequence: (input characters, predicted characters).
predict(f_y, input_vec=oh_inputs_rnn[12])

('rtig har', ' ecedau ')

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         