Copyright (C) Egon Kidmose 2015-2017

This file is part of lstm-rnn-correlation.

lstm-rnn-correlation is free software: you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.

lstm-rnn-correlation is distributed in the hope that it will be
useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with lstm-rnn-correlation. If not, see
<http://www.gnu.org/licenses/>.


# Learning how to tie weights together
The purpose of this notebook is to learn/demonstrate how the weights of two layers can be tied together, such that the weights are always the same.

         IN
       /    \ 
     L1      L2
     |        |
    OUT1  == OUT2

In [None]:
from __future__ import print_function

import sys
import os
import time

import numpy as np
import theano
import theano.tensor as T

import lasagne
from lasagne.layers import *
from lasagne.nonlinearities import *

In [None]:
def load_dataset(
    ntrain=100,
    nval=100,
    ntest=100,
):
    """Generate random XOR data sets for training, validation and testing.

    Every sample is a pair of random booleans; its target is the XOR of
    the two.

    Args:
        ntrain: Number of training samples.
        nval: Number of validation samples.
        ntest: Number of test samples.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)``. Each
        ``X_*`` is a boolean array of shape ``(n, 2)`` and each ``y_*`` is a
        boolean array of shape ``(n,)``.
    """
    parts = []
    # The splits are drawn in the order train, validation, test, so results
    # under a fixed random seed stay the same.
    for size in (ntrain, nval, ntest):
        bits = np.random.randint(0, 2, (size, 2)).astype(bool)
        parts.append(bits)
        parts.append(np.logical_xor(bits[:, 0], bits[:, 1]))
    return tuple(parts)

def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    """Yield successive ``(inputs, targets)`` minibatches of size ``batchsize``.

    Only full batches are produced: trailing samples that do not fill a
    whole batch are skipped.

    Args:
        inputs: Indexable collection of samples.
        targets: Indexable collection of targets, same length as ``inputs``.
        batchsize: Number of samples per yielded batch.
        shuffle: If true, samples are visited in a random order drawn with
            ``np.random.shuffle``; otherwise in their stored order.

    Yields:
        Pairs ``(inputs[sel], targets[sel])`` where ``sel`` selects
        ``batchsize`` samples.
    """
    assert len(inputs) == len(targets)
    n_samples = len(inputs)

    order = None
    if shuffle:
        order = np.arange(n_samples)
        np.random.shuffle(order)

    for begin in range(0, n_samples - batchsize + 1, batchsize):
        end = begin + batchsize
        # Without shuffling a plain slice is enough; with shuffling the
        # batch is picked through the permuted index array.
        selection = slice(begin, end) if order is None else order[begin:end]
        yield inputs[selection], targets[selection]

In [None]:
X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()

# Hand-made batch used by the unit tests of the individual layers below:
# three equally long words (the first two identical), stored as their
# character codes.
unit_words = ('abcdef', 'abcdef', 'qwerty')
X_unit = np.array(
    [[ord(ch) for ch in word] for word in unit_words],
    dtype='int8',
)
print(X_unit)
n_alerts_unit, l_alerts_unit = X_unit.shape

# First line (branch) of the network

In [None]:
# Symbolic integer matrices for the network input and the training targets.
input_var = T.imatrix('inputs')
target_var = T.imatrix('targets')

# Input layer
# Neither the number of alerts nor their length is fixed in the layer shape.
n_alerts, l_alerts = None, None
l_in = InputLayer(
    shape=(n_alerts, l_alerts),
    input_var=input_var,
    name='INPUT-LAYER',
)

# Test: the input layer must hand the data through unchanged.
input_expr = get_output(l_in, inputs={l_in: input_var})
pred_unit = input_expr.eval({input_var: X_unit})
assert (pred_unit == X_unit).all(), "Unexpected output"

print(pred_unit)

In [None]:
# Embedding layer
n_alphabet = 2**7 # All ASCII chars

# An identity matrix as embedding table maps every character code to the
# matching one-hot row.
one_hot_table = np.eye(n_alphabet, dtype='int8')
l_emb = EmbeddingLayer(
    l_in,
    n_alphabet,
    n_alphabet,
    W=one_hot_table,
    name='EMBEDDING-LAYER',
)
# Fix weight: without the 'trainable' tag the table is left out when the
# trainable parameters are collected.
l_emb.params[l_emb.W].remove('trainable')

# Test: the position of the largest entry must give back the character code,
# and the output must have one n_alphabet-sized vector per character.
expected_shape = (n_alerts_unit, l_alerts_unit, n_alphabet)
emb_expr = get_output(l_emb, inputs={l_in: input_var})
pred_unit = emb_expr.eval({input_var: X_unit})
assert (np.argmax(pred_unit, axis=2) == X_unit).all()
assert pred_unit.shape == expected_shape
print(pred_unit.shape)
print(pred_unit)


In [None]:
# Recurrent LSTM layer
num_units = 10
l_lstm = LSTMLayer(l_emb, num_units=num_units, name='LSTM-LAYER')

# The symbolic output is built once and evaluated for every test input below.
lstm_expr = get_output(l_lstm, inputs={l_in: input_var})

pred_unit = lstm_expr.eval({input_var: X_unit})
expected_shape = (n_alerts_unit, l_alerts_unit, num_units)
assert pred_unit.shape == expected_shape, "Unexpected dimensions"

# Test
# Identical alerts of length one.
pred_unit = lstm_expr.eval({input_var: [[1],[1]]})
assert np.all(pred_unit[0] == pred_unit[1]), "Repeated alerts must produce the same"

# Identical alerts of length two.
pred_unit = lstm_expr.eval({input_var: [[1,1],[1,1]]})
assert np.all(pred_unit[0] == pred_unit[1]), "Repeated alerts must produce the same"

# Alerts that differ in the first position only.
pred_unit = lstm_expr.eval({input_var: [[1,1],[0,1]]})
assert np.all(pred_unit[0] != pred_unit[1]), "Earlier must affect laters"

# Alerts that differ in the last position only.
pred_unit = lstm_expr.eval({input_var: [[1,0],[1,1]]})
first_alert, second_alert = pred_unit[0], pred_unit[1]
assert np.all(first_alert[0] == second_alert[0]), "Later must not affect earlier"
assert np.all(first_alert[1] != second_alert[1]), "Current must make a difference"

# The finished first line of the network.
net1 = l_lstm

# Clone the line and tie the weights

In [None]:
# Create an identical test network, with tied weights.
# The layers of net1 are walked from input to output and each one is rebuilt
# on top of the previously cloned layer. Instead of fresh initialisers, the
# clone receives the parameter variables of the original layer, so both lines
# use the very same weights.
net2 = None
for l in get_all_layers(net1):
    print("{} ({}):".format(l.name, l))
    if isinstance(l, InputLayer):
        net2 = InputLayer(
            shape=l.shape,
            input_var=l.input_var,
            name=l.name+'2',
        )
    elif isinstance(l, DenseLayer):
        net2 = DenseLayer(
            net2,
            num_units=l.num_units,
            W=l.W,
            b=l.b,
            nonlinearity=l.nonlinearity,
            name=l.name+'2',
        )
    elif isinstance(l, EmbeddingLayer):
        net2 = EmbeddingLayer(
            net2,
            l.input_size,
            l.output_size,
            W=l.W,
            name=l.name+'2',
        )
    elif isinstance(l, LSTMLayer):
        net2 = LSTMLayer(
            net2,
            l.num_units,
            ingate=Gate(W_in=l.W_in_to_ingate, W_hid=l.W_hid_to_ingate, W_cell=l.W_cell_to_ingate, b=l.b_ingate, nonlinearity=l.nonlinearity_ingate),
            forgetgate=Gate(W_in=l.W_in_to_forgetgate, W_hid=l.W_hid_to_forgetgate, W_cell=l.W_cell_to_forgetgate, b=l.b_forgetgate, nonlinearity=l.nonlinearity_forgetgate),
            cell=Gate(W_in=l.W_in_to_cell, W_hid=l.W_hid_to_cell, W_cell=None, b=l.b_cell, nonlinearity=l.nonlinearity_cell),
            outgate=Gate(W_in=l.W_in_to_outgate, W_hid=l.W_hid_to_outgate, W_cell=l.W_cell_to_outgate, b=l.b_outgate, nonlinearity=l.nonlinearity_outgate),
            nonlinearity=l.nonlinearity,
            cell_init=l.cell_init,
            hid_init=l.hid_init,
            backwards=l.backwards,
            learn_init=l.learn_init,
            peepholes=l.peepholes,
            gradient_steps=l.gradient_steps,
            grad_clipping=l.grad_clipping,
            unroll_scan=l.unroll_scan,
            precompute_input=l.precompute_input,
            # mask_input=l.mask_input, # AttributeError: 'LSTMLayer' object has no attribute 'mask_input'
            name=l.name+'2',
        )
    else:
        raise ValueError("Unhandled layer: {}".format(l))

    # A parameter handed to a new layer is registered with that layer's
    # default tags. Copy the tags of the original so that e.g. the embedding
    # weights, whose 'trainable' tag was removed above, stay fixed in the
    # clone as well.
    for param, tags in l.params.items():
        if param in net2.params:
            net2.params[param] = set(tags)

    # net2 always refers to the layer that was just added.
    print(' - added layer: {} ({})'.format(net2, net2.name))

# Test
# Both lines must be built from exactly the same parameter variables ...
assert set(get_all_params(net1)) == set(get_all_params(net2)), \
    "The two lines must share all parameters"
assert (set(get_all_params(net1, trainable=True))
        == set(get_all_params(net2, trainable=True))), \
    "The two lines must train the same parameters"

# ... and must therefore produce the same output for the same input. The
# input layers of both lines are bound to input_var, so no explicit `inputs`
# mapping is needed. Mapping the output layer itself to input_var would make
# get_output return the input unchanged and bypass the network.
pred_unit1 = get_output(net1).eval({input_var: X_unit})
pred_unit2 = get_output(net2).eval({input_var: X_unit})
assert pred_unit1.shape == pred_unit2.shape, "The two lines must output the same shape"
assert np.all(pred_unit1 == pred_unit2), "The two lines must output the same"

In [None]:
# The graphs below are written in terms of `net` and `test_net`, which no
# earlier cell defines. Bind them to the two tied lines built above: the
# first line is trained, its weight-tied clone (the "test network") is used
# for evaluation.
net = net1
test_net = net2

# NOTE(review): the loss and accuracy expressions below look as if they were
# written for a prediction with one row of class scores per sample, while the
# lines built above end in an LSTM layer that outputs one vector per
# character. Confirm that the shapes of the predictions and of target_var
# match before relying on this cell.

# Training
# Mean cross-entropy of the training line, minimised with plain SGD over the
# parameters tagged as trainable.
prediction = lasagne.layers.get_output(net)
loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
loss = loss.mean()
params = lasagne.layers.get_all_params(net, trainable=True)
updates = lasagne.updates.sgd(loss, params, learning_rate=0.1)

# Testing
# Same loss on the test line, evaluated deterministically, plus the fraction
# of predictions whose arg-max equals the target.
test_prediction = lasagne.layers.get_output(test_net, deterministic=True)
test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
                                                        target_var)
test_loss = test_loss.mean()
test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                  dtype=theano.config.floatX)

# train_fn performs one SGD step and returns the loss; val_fn only evaluates
# and returns [loss, accuracy].
train_fn = theano.function([input_var, target_var], loss, updates=updates)
val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

In [None]:
print("Starting training...")
X_train, y_train, X_val, y_val, X_test, y_test = load_dataset(100,100,100)
num_epochs = 5000
for epoch in range(num_epochs):
    start_time = time.time()

    # Training pass: every minibatch updates the weights; the loss is summed
    # over the batches.
    train_err, train_batches = 0, 0
    for inputs, targets in iterate_minibatches(X_train, y_train, 100, shuffle=True):
        train_err += train_fn(inputs, targets)
        train_batches += 1

    # Validation pass: loss and accuracy are summed, no weights are updated.
    val_err, val_acc, val_batches = 0, 0, 0
    for inputs, targets in iterate_minibatches(X_val, y_val, 100, shuffle=False):
        err, acc = val_fn(inputs, targets)
        val_err += err
        val_acc += acc
        val_batches += 1

    # Per-epoch progress report, switched off by wrapping it in an unused
    # string literal.
    """print("Epoch {} of {} took {:.3f}s".format(
        epoch + 1, num_epochs, time.time() - start_time))
    print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
    print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
    print("  validation accuracy:\t\t{:.2f} %".format(
        val_acc / val_batches * 100))"""

# Final evaluation on the held-out test split, averaged over the batches.
test_err, test_acc, test_batches = 0, 0, 0
for inputs, targets in iterate_minibatches(X_test, y_test, 100, shuffle=False):
    err, acc = val_fn(inputs, targets)
    test_err += err
    test_acc += acc
    test_batches += 1

mean_test_err = test_err / test_batches
mean_test_acc = test_acc / test_batches
print("Final results:")
print("  test loss:\t\t\t{:.6f}".format(mean_test_err))
print("  test accuracy:\t\t{:.2f} %".format(
    mean_test_acc * 100))

In [None]:
np.set_printoptions(precision=1)


def _print_network_params(network):
    """Print each layer of `network`, its name, and its parameters with their current values."""
    for layer in get_all_layers(network):
        print(layer)
        print(' {}'.format(layer.name))
        for param in layer.get_params():
            print(' {}'.format(param))
            print('  {}'.format(param.get_value()))


# The two lines of this notebook are called net1 (the line built first) and
# net2 (its weight-tied clone, the "test network"). With tied weights both
# dumps are expected to show the same values.
print('trained network paramters:')
_print_network_params(net1)


print()

print('test network paramters:')
_print_network_params(net2)

