# Train the chatbot

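This notebook replicates the stack-augmented recurrent network experiments from [Inferring Algorithmic Patterns with Stack-Augmented Recurrent Nets](http://arxiv.org/abs/1503.01007).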

In [1]:
import os
import json

import numpy as np

import pandas as pd
import tqdm

We'll definitely need to plot something

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

Uninterruptible section

In [3]:
import signal

class DelayedKeyboardInterrupt(object):
    """Context manager that postpones SIGINT until the guarded block has finished.

    While the block runs, an incoming SIGINT is only recorded. On exit the
    previous handler is restored and, if a signal was recorded, it is replayed:

    * a callable previous handler is invoked with the recorded ``(sig, frame)``;
    * ``signal.SIG_IGN`` means the interrupt is dropped, as it would have been;
    * anything else (``signal.SIG_DFL`` or ``None``) cannot be called, so a
      ``KeyboardInterrupt`` is raised instead.
    """

    def __enter__(self):
        self.signal_received = False
        self.old_handler = signal.getsignal(signal.SIGINT)
        signal.signal(signal.SIGINT, self.handler)
        # Returning self makes `with DelayedKeyboardInterrupt() as guard:` usable.
        return self

    def handler(self, sig, frame):
        """Record the signal instead of acting on it."""
        self.signal_received = (sig, frame)

    def __exit__(self, type, value, traceback):
        signal.signal(signal.SIGINT, self.old_handler)
        if not self.signal_received:
            return

        if callable(self.old_handler):
            self.old_handler(*self.signal_received)
        elif self.old_handler == signal.SIG_IGN:
            # The interrupt was being ignored before we took over: keep ignoring it.
            return
        else:
            # SIG_DFL / None are not callable; surface the interrupt explicitly.
            raise KeyboardInterrupt

Fix the random seed

In [4]:
# Seeded NumPy generator; `generate_batch` draws the synthetic training sequences from it.
random_state = np.random.RandomState(0x0BADC0DE)

Import Theano and Lasagne

In [5]:
# %env THEANO_FLAGS='device=cuda0,force_device=True,mode=FAST_RUN,floatX=float32'

import theano
theano.config.exception_verbosity = 'high'

import theano.tensor as tt

import lasagne
from lasagne.utils import floatX

Using cuDNN version 5103 on context None
Mapped name None to device cuda: GeForce GTX 980 Ti (0000:06:00.0)


Fix Lasagne's random seed.

In [6]:
# Give Lasagne its own seeded generator (independent of `random_state` above).
lasagne.random.set_rng(np.random.RandomState(0xDEADC0DE))

Build the vocabulary

In [7]:
# The service symbols ">" (start) and "<" (stop) must occupy the first two
# slots: `gen_step` keeps generating only while the sampled index is greater
# than the index of "<".
vocab = list("><abc")
token_to_index = dict(zip(vocab, range(len(vocab))))

A function to convert lines into character id vectors.

In [8]:
def as_matrix(lines, max_len=None):
    """Encode strings as a right-padded ``int32`` matrix of token indices.

    Parameters
    ----------
    lines : str or iterable of str
        A single string is treated as a batch of one line.
    max_len : int, optional
        Number of columns. Longer lines are truncated; when omitted (or 0)
        the length of the longest line is used.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(lines), max_len)``. Positions past the end of a
        line, and characters missing from ``token_to_index``, hold ``-1``.
        An empty batch yields an array with zero rows instead of raising.
    """
    if isinstance(lines, str):
        lines = [lines]
    else:
        # Materialise generators and other one-shot iterables: we need both
        # `len()` and a second pass over the data.
        lines = list(lines)

    # `default=0` keeps `max` from raising on an empty batch.
    max_len = max_len or max(map(len, lines), default=0)
    matrix = np.full((len(lines), max_len), -1, dtype='int32')

    for i, line in enumerate(lines):
        row_ix = [token_to_index.get(c, -1)
                  for c in line[:max_len]]

        matrix[i, :len(row_ix)] = row_ix

    return matrix

A function to sample a batch of synthetic training sequences of the form `>` $a^n b^m c^{n+m}$ `<`.

In [9]:
def generate_batch(batch_size=32, max_seq_len=None):
    """Sample `batch_size` strings ``>`` a*n b*m c*(n+m) ``<`` and encode them.

    For every sample, `n` and `m` are drawn together from
    ``random_state.randint(1, 10, size=(2,))``. The strings are passed to
    `as_matrix` with `max_seq_len` as the column count, and the resulting
    matrix is returned.
    """
    def sample_one():
        n_a, n_b = random_state.randint(1, 10, size=(2,))
        return "".join([">", "a" * n_a, "b" * n_b, "c" * (n_a + n_b), "<"])

    return as_matrix([sample_one() for _ in range(batch_size)], max_seq_len)

Define a simple seq2seq network (preferably in Lasagne)

In [10]:
from lasagne.layers import InputLayer, EmbeddingLayer
from lasagne.layers import DenseLayer
from lasagne.layers import NonlinearityLayer
from lasagne.layers import SliceLayer

from GRUwithStack import GRUStackLayer, GRUStackReadoutLayer
from broadcast import BroadcastLayer, UnbroadcastLayer

The architecture hyper parameters

In [11]:
# Character embeddings are as wide as the vocabulary itself.
n_embed_char = len(vocab)
# Number of hidden units in every recurrent decoder layer.
n_hidden_decoder = 32

# Each recurrent layer carries a stack state of shape
# (batch, n_stack_depth, n_stack_width).
n_stack_depth = 50
n_stack_width = 3

n_recurrent_layers = 1

### Embedding subgraph (pinkish)

The common embedding layer

In [12]:
# Two-dimensional input of character ids feeding the shared embedding table.
l_input_char = InputLayer(shape=(None, None), name="char/input")
l_embed_char = EmbeddingLayer(
    l_input_char,
    input_size=len(vocab),
    output_size=n_embed_char,
    name="char/embed",
)

### Encoder-decoder

Tap into the common embedding layer but with decoder's own input.

In [13]:
# Symbolic int32 matrix of character ids: the only input of the training graph.
v_input_decoder = tt.imatrix(name="decoder/input")
# Run the ids through the shared embedding layer.
v_embed_decoder = lasagne.layers.get_output(l_embed_char, v_input_decoder)

# The mask is true wherever the id is non-negative (`as_matrix` pads with -1).
l_decoder_mask = InputLayer((None, None), name="decoder/mask", input_var=tt.ge(v_input_decoder, 0))
# Wrap the embedded tensor in an InputLayer so that the decoder starts at a layer
# whose value can be substituted (the generator step feeds its own embedding here).
l_decoder_embed = InputLayer((None, None, n_embed_char), name="decoder/input", input_var=v_embed_decoder)

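Construct the layers of stack-augmented GRUs. Each layer receives its initial hidden state and its initial stack through dedicated input layers (both are all zeros during training).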

In [14]:
# `dec_layers[0]` is the embedded input; every further entry is one recurrent layer.
# `dec_layers_stack[k]` is the stack read-out that belongs to `dec_layers[k]`
# (index 0 is a placeholder so that both lists stay aligned).
dec_layers = [l_decoder_embed]
dec_layers_stack = [None]
for layer_num in range(n_recurrent_layers):
    # Layer names are labels, not file-system paths: build them with an explicit
    # "/" so they match "decoder/mask", "decoder/input", ... on every platform.
    name = "decoder/GRU_%02d" % layer_num

    # Initial stack: zeros, one (depth x width) stack per row of the batch.
    v_stack_input = tt.zeros((v_input_decoder.shape[0], n_stack_depth, n_stack_width),
                             dtype=theano.config.floatX)

    l_stack_input = InputLayer((None, n_stack_depth, n_stack_width),
                               input_var=v_stack_input,
                               name=name + "/stack")

    # Initial hidden state: zeros as well.
    v_hid_input = tt.zeros((v_input_decoder.shape[0], n_hidden_decoder),
                           dtype=theano.config.floatX)

    l_hid_input = InputLayer((None, n_hidden_decoder),
                             input_var=v_hid_input,
                             name=name + "/hidden")

    gru_layer = GRUStackLayer(dec_layers[-1], l_stack_input, n_hidden_decoder,
                              hid_init=l_hid_input, mask_input=l_decoder_mask,
                              backwards=False, learn_init=False, name=name)

    dec_layers.append(gru_layer)
    # Keep only the last entry along axis 1 of the stack read-out.
    dec_layers_stack.append(
        SliceLayer(GRUStackReadoutLayer(gru_layer,
                                        name=name + "/stack_readout"),
                   -1, axis=1))

<img src="http://s32.postimg.org/vefrk7vqt/stack_gru.png"/>

Read the output of the top layer of the RNN and re-embed into the character space

In [15]:
# Linear projection (no nonlinearity) of the top recurrent layer onto one logit
# per vocabulary entry; `num_leading_axes=2` leaves the first two axes untouched.
l_decoder_reembedder = DenseLayer(dec_layers[-1], num_units=len(vocab),
                                  nonlinearity=None, num_leading_axes=2,
                                  name="decoder/project")

# NOTE(review): BroadcastLayer/UnbroadcastLayer come from the local `broadcast`
# module; presumably they fold axes 0 and 1 together so that the softmax is taken
# per position and then restore the original layout -- confirm against that module.
l_bc = BroadcastLayer(l_decoder_reembedder, broadcasted_axes=(0, 1), name="decoder/bc")
l_softmax = NonlinearityLayer(l_bc, nonlinearity=lasagne.nonlinearities.softmax, name="decoder/softmax")
l_decoder_output = UnbroadcastLayer(l_softmax, l_bc, name="decoder/ub")

### Loss

Get the output of the decoder

In [16]:
# Symbolic expressions for the decoder's softmax output and for the input mask,
# requested in a single call.
v_decoder_output, v_decoder_mask = lasagne.layers.get_output(
    [l_decoder_output, l_decoder_mask])

Slice the output to match the forward character-level language model

In [17]:
# Next-character prediction: the output at position t is scored against the
# input at position t + 1, so drop the last output and the first input/mask
# column, then flatten everything to one row per (sequence, position) pair.
predicted = v_decoder_output[:, :-1].reshape((-1, v_decoder_output.shape[-1]))
targets = v_input_decoder[:, 1:].reshape((-1,))
mask = v_decoder_mask[:, 1:].reshape((-1,))

Construct the cross-entropy loss

In [18]:
# Mean cross-entropy over the unmasked positions only: padded positions are
# multiplied by a zero mask and do not count towards the denominator.
loss = (lasagne.objectives.categorical_crossentropy(predicted, targets) * mask).sum() / mask.sum()

Add $l^2$ regularization.

In [19]:
# from lasagne.regularization import regularize_network_params, l2

# reg_l2 = regularize_network_params(l_decoder_output, l2) * 10e-5

# loss += reg_l2

Collect all trainable parameters 

In [20]:
# The embedding layer is not part of the decoder's layer graph (the decoder starts
# at its own InputLayer), so its parameters have to be collected separately.
trainable = (
    lasagne.layers.get_all_params(l_embed_char, trainable=True)
    + lasagne.layers.get_all_params(l_decoder_output, trainable=True)
)

Get the updates

In [21]:
# The step size lives in a shared variable named "eta"; Adam update rules for
# all collected parameters are derived from the loss.
learning_rate = theano.shared(floatX(1e-3), name="eta")
updates = lasagne.updates.adam(loss, trainable, learning_rate)

Create the ops

In [22]:
# One training step: takes a matrix of character ids, applies the Adam updates
# and returns the loss for that batch.
op_train = theano.function([v_input_decoder], loss,
                           updates=updates, givens={},
                           mode=theano.Mode(optimizer="fast_run"))

  "flatten outdim parameter is deprecated, use ndim instead.")


In [23]:
# Same loss as `op_train`, but compiled without updates: evaluation only.
op_test_loss = theano.function([v_input_decoder], loss,
                               mode=theano.Mode(optimizer="fast_run"))

In [24]:
# Returns the flattened per-position class probabilities (`predicted`) for a batch.
op_predict = theano.function([v_input_decoder], predicted,
                               mode=theano.Mode(optimizer="fast_run"))

### The generator

Define one step of the scan function

In [25]:
def gen_step(x_tm1, h_tm1, m_tm1, s_tm1, tau, eps):
    """One step of the generative decoder, used as the body of the scan loop.

    Parameters
    ----------
    x_tm1 : integer vector, one previously emitted character id per batch row.
    h_tm1 : matrix holding the hidden states of all recurrent layers,
        concatenated along the last axis (`layer.num_units` columns each).
    m_tm1 : vector mask, true for rows that are still generating.
    s_tm1 : matrix holding the flattened stacks of all recurrent layers,
        concatenated along the last axis (`depth * width` columns each).
    tau : scalar temperature; `tau <= 0` switches the random noise off, so the
        most probable character is picked.
    eps : scalar added inside both logarithms of the Gumbel noise.

    Returns
    -------
    x_t, h_t, m_t, s_t : the updated counterparts of the first four arguments.
    p_t : probability (under the noise-free softmax) of each emitted character.

    Relies on the module-level `theano_random_state`, which must be defined
    before this function is traced by `theano.scan`.
    """
    # x_tm1 is `batch` int8, h_tm1 is `batch x ...`
    # m_tm1 is `batch`, tau, eps are scalars

    # A handy slicer (copied and modified)
    # Takes columns [i, i + n) of the last axis; a single column is additionally
    # marked as broadcastable.
    def slice_(x, i, n):
        s = x[..., slice(i, i + n)]
        return s if n > 1 else tt.addbroadcast(s, -1)

    # embed the previous character. x_t is `batch x embed`
    x_t = l_embed_char.get_output_for(x_tm1, deterministic=True)

    # collect the inputs and hidden init feeds
    # A length-1 axis is inserted at position 1 so that the decoder sees
    # sequences of a single step.
    j = 0
    inputs = {l_decoder_embed: x_t.dimshuffle(0, "x", 1),
              l_decoder_mask: m_tm1.dimshuffle(0, "x")}
    for layer in dec_layers[1:]:
        inputs[layer.hid_init] = slice_(h_tm1, j, layer.num_units)
        j += layer.num_units

    # Same for the stacks: cut each layer's slice out of `s_tm1` and restore
    # its (batch, depth, width) shape.
    j = 0
    for layer in dec_layers[1:]:  # enc_layers_stack[1:]
        # NOTE(review): assumes the second incoming layer of a GRUStackLayer is
        # its stack input -- confirm against GRUwithStack.
        layer = layer.input_layers[1]
        dep, wid = layer.output_shape[-2:]
        stack_slice_ = slice_(s_tm1, j, dep * wid)
        inputs[layer] = stack_slice_.reshape((-1, dep, wid))
        j += dep * wid

    # Request the logits first, then (hidden, stack) pairs layer by layer.
    outputs = [l_decoder_reembedder]
    for h, s in zip(dec_layers[1:], dec_layers_stack[1:]):
        outputs.append(h)
        outputs.append(s)

    # propagate through the decoder column
    logit_t, *rest = lasagne.layers.get_output(outputs, inputs, deterministic=True)
    # `rest` alternates hidden, stack, hidden, stack, ...
    h_t_list, s_t_list = rest[::2], rest[1::2]

    # Drop the length-1 axis inserted above.
    logit_t = logit_t[:, 0]
    prob_t = tt.nnet.softmax(logit_t)

    # Gumbel-softmax sampling
    do_not_sample = tt.le(tau, 0)
    gumbel = theano.ifelse.ifelse(
        do_not_sample, tt.zeros_like(logit_t),
        -tt.log(-tt.log(theano_random_state.uniform(size=logit_t.shape) + eps) + eps))

    # Add Gumbel (e^{-e^{-x}}) distributed random noise
    # (the divisor falls back to 1.0 when sampling is disabled)
    logit_t = (gumbel + logit_t) / theano.ifelse.ifelse(do_not_sample, 1.0, tau)

    # Pick one element
    x_t = tt.argmax(
        theano.ifelse.ifelse(do_not_sample, prob_t,
                             tt.nnet.softmax(logit_t)),
        axis=-1)
    
#     # Gumbel-softmax sampling: Gumbel (e^{-e^{-x}}) distributed random noise
#     gumbel = -tt.log(-tt.log(theano_random_state.uniform(size=logit_t.shape) + eps) + eps)
#     logit_t = tt.switch(tt.gt(tau, 0), (gumbel + logit_t) / tau, logit_t)

#     # Pick one element
#     x_t = tt.argmax(tt.switch(tt.gt(tau, 0),
#                               tt.nnet.softmax(logit_t),
#                               prob_t), axis=-1)

    # stop generation if a stop symbol was picked.
    # A row stays active only while the picked index is greater than that of "<".
    m_t = m_tm1 & tt.gt(x_t, vocab.index("<"))

    # Concatenate the hidden states, freezing them if necessary
    h_t = tt.concatenate([v[:, 0] for v in h_t_list], axis=-1)
    h_t = tt.switch(m_t.dimshuffle(0, 'x'), h_t, h_tm1)

    # flatten and concatenate the stack state
    s_t = tt.concatenate([v.flatten(ndim=2) for v in s_t_list], axis=-1)
    s_t = tt.switch(m_t.dimshuffle(0, 'x'), s_t, s_tm1)

    # Propagate the stop token
    # Rows that have stopped emit "<" from now on.
    x_t = tt.cast(tt.switch(m_t, x_t, vocab.index("<")), "int8")
    p_t = prob_t[tt.arange(x_t.shape[0]), x_t]

    return x_t, h_t, m_t, s_t, p_t

In [26]:
# x_t, h_t, m_t, p_t = op_generate(1, 120, 1e-4)

# x_t[0]

# m_t[0]

# vocab

Create scalar inputs to the scan loop. Also initialize the random stream.

In [27]:
# Seeded symbolic random stream; `gen_step` draws its uniform noise from it.
theano_random_state = tt.shared_randomstreams.RandomStreams(seed=42)

# Scalar inputs of the generator graph.
eps = tt.fscalar("generator/epsilon")      # added inside the logs of the Gumbel noise
n_steps = tt.iscalar("generator/n_steps")  # number of scan iterations
tau = tt.fscalar("generator/gumbel/tau")   # temperature; <= 0 disables sampling

n_batch = tt.iscalar("generator/n_batch")  # number of sequences generated in parallel

Create the initial stack and hidden state for every recurrent layer (one per sequence in the batch). There is no encoder in this notebook, so both start at zero.

In [28]:
# One all-zero initial stack and one all-zero initial hidden state per
# recurrent decoder layer.
enc_stack_inits = [tt.zeros((n_batch, n_stack_depth, n_stack_width))
                   for _ in dec_layers[1:]]
enc_hid_inits = [tt.zeros((n_batch, n_hidden_decoder))
                 for _ in dec_layers[1:]]

Prepare the initial values.

In [29]:
# Every sequence starts from the start symbol ">".
x_0 = tt.fill(tt.zeros((n_batch, ), dtype="int32"), vocab.index(">"))
# Per-layer hidden states, concatenated along the last axis.
h_0 = tt.concatenate(enc_hid_inits, axis=-1)
# Per-layer stacks, flattened to two dimensions and concatenated the same way.
s_0 = tt.concatenate([v.flatten(ndim=2) for v in enc_stack_inits], axis=-1)
# All rows are active at the start.
m_0 = tt.ones_like(x_0, 'bool')

Add a scan op and compile

In [30]:
# Iterate `gen_step` for `n_steps` steps. The last entry of `outputs_info` is
# None because `p_t` is an output only and is not fed back into the next step.
#
# The update dictionary returned by scan gets its own name: calling it `updates`
# would overwrite the Adam updates defined for training, and re-running the cell
# that compiles `op_train` would then silently train nothing.
result, generator_updates = theano.scan(gen_step, sequences=None, n_steps=n_steps,
                                        outputs_info=[x_0, h_0, m_0, s_0, None], return_list=True,
                                        non_sequences=[tau, eps], strict=False,
                                        go_backwards=False, name="generator/scan")
# scan stacks its outputs along a leading step axis; swap it with the batch axis.
x_t, h_t, m_t, s_t, p_t = [r.swapaxes(0, 1) for r in result]

compile_mode = theano.Mode(optimizer="fast_run", linker="cvm")
# `eps` is not exposed as an input; it is fixed to 1e-20 through `givens`.
op_generate = theano.function([n_batch, n_steps, tau], [x_t, h_t, m_t, p_t],
                              updates=generator_updates, givens={eps: floatX(1e-20)},
                              mode=compile_mode)

A generator procedure which samples a batch of replies and orders them by their average per-character negative log-likelihood (lowest first).

In [31]:
def generate(n_batches, n_steps, tau=0):
    """Generate `n_batches` sequences of at most `n_steps` characters each.

    Runs `op_generate`, scores every sequence by its masked mean of
    ``-log2(p)`` over the emitted characters, and returns the decoded strings
    in ascending order of that score. Every "<" is removed from the strings.
    """
    tokens, _, alive, probs = op_generate(n_batches, n_steps, tau)

    # Masked sum of per-character costs, then normalise by the number of
    # active steps (in place, exactly as the sum was produced).
    score = (-np.log2(probs) * alive).sum(axis=-1)
    score /= alive.sum(axis=-1)

    ranked = tokens[score.argsort()]
    return ["".join([vocab[i] for i in row]).replace("<", "")
            for row in ranked]

### Train the shit

In [32]:
def sample_qa():
    """Generate 10 sequences (up to 140 steps, tau=1e-20) and print them on one line.

    The line is written through `tqdm.tqdm.write`.
    """
    tqdm.tqdm.write(", ".join(generate(10, 140, tau=1e-20)))

Reset the history

In [33]:
# `epoch` counts completed training iterations; `loss_val_hist` collects the
# loss returned by every training step.
epoch, loss_val_hist = 0, []

Now let's train the shit!

In [34]:
# Each "epoch" here is a single training step on one freshly sampled batch.
batch_size, n_epochs = 128, 5000
progress_fmt_ = "%(loss).3f"
# `epoch` is not reset in this cell, so re-running it after an interruption
# resumes where training stopped (the bar only covers the remaining steps).
with tqdm.tqdm(total=n_epochs-epoch) as progress_:
    while epoch < n_epochs:
        # Every batch is padded to 512 columns regardless of the sampled lengths.
        batch = generate_batch(batch_size, max_seq_len=512)
        # Defer Ctrl-C until the parameter update and the bookkeeping are done.
        with DelayedKeyboardInterrupt():
            loss_val_hist.append(op_train(batch))

        # Show the mean loss over the last (up to) 100 steps next to the bar.
        progress_.postfix = progress_fmt_ % {
            "loss": np.mean(loss_val_hist[-100:]),
        }
        progress_.update(1)
        # Print a few generated sequences every 50 steps.
        if (epoch % 50) == 0:
            sample_qa()
        epoch += 1

  0%|          | 1/5000 [00:00<35:21,  2.36it/s, 1.612]

acbccc, ccbc, cc, cac, c, bcab, a, , , 


  1%|          | 51/5000 [00:21<34:45,  2.37it/s, 1.378]

bbc, c, c, c, b, , , , , 


  2%|▏         | 101/5000 [00:42<34:16,  2.38it/s, 1.197]

bbbcbbcc, a, ab, abaaa, cb, , , , , 


  3%|▎         | 151/5000 [01:04<33:56,  2.38it/s, 0.884]

acaaaaabbbcbcccccccc, aa, aaaacbbbcbc, bacabbc, bbabc, b, cba, cccc, , 


  4%|▍         | 201/5000 [01:25<33:30,  2.39it/s, 0.631]

aaa, aaaaaaabbbcbbcccccccccccccccccc, aaabaabbbbcccccccccccccccccccc, aabaabbbbbbccccccccccccccccccc, aaaaaabbbccbbcccccccccccccccc, ababbbbcccccccccccccccccccc, bbbbbcbccccc, abaabaabccbccccc, b, b


  5%|▌         | 251/5000 [01:46<33:19,  2.37it/s, 0.470]

aaaaaaabbbbcccccccccccc, aaaaaaabbbbbbccccccc, aaaaaaaabbbbccccc, aaaaabbbbbbbbbbcccccccccc, aabbbbbbcccccccccccc, aaaaaabbbbbbbbbbccccc, abbbbbbbbccccccc, aaaaababbbcccccccccc, bbbbbbbbccccc, baaaaaaacbbbbbbcccccccccccccccc


  6%|▌         | 301/5000 [02:07<32:53,  2.38it/s, 0.404]

aaaaaabbbbbbccccccccccccccc, aaaaaaaabbbbbcccccccccccccccc, aaaaabbbbbcccccccc, aaaaabbbbcccccccccc, aaaaaabbbcccccccccc, aaaaaaabbcccccccccccccc, aaaabbbbcccccccccccccc, aaabbbbccccccccc, baaabbccccccccccccccc, acaabbbcccccccccccccc


  7%|▋         | 351/5000 [02:28<32:39,  2.37it/s, 0.374]

aaaaabbbbbbbbbccccccccccccc, aaaaaaabbbcccccccccccc, aaabbbbbcccccccccccc, aaaabbbbbbbcccc, aabbbbbbbccccccc, aaaaaaaabccccccccccc, aaaaaaaaaaaabccccccccccc, aaaaaaacbbcccccccccccc, aaaaabbccccc, ccbac


  8%|▊         | 401/5000 [02:49<32:11,  2.38it/s, 0.353]

aa, aaaaaabbbbbbbbcccccccccccc, aaaaaaaaabbbbcccccccccc, aaaabbbbbbccccccccc, aaaaaabbbbbcccccc, aabbbbbbbbcccccccccc, aaaaaaaabbcccccccccccc, aaabbbbcccccccc, aaaaaaacbbbbbbbbccccccccccccc, aaaaaaaaaaaabccccccccc


  9%|▉         | 451/5000 [03:10<31:56,  2.37it/s, 0.324]

aaaaabbbbbccccccccccc, aaaaaaabbccccccccc, aaaaaaabbccccccccc, aaaabbbbbbbbcccccccccccccc, aaabbbbbbcccccccccc, aaaaaaaabbbcbccccccccccccc, aaababbbbcbbbbbccccccccccccc, aaaabaccccccc, bbbccccc, aabcbccccc


 10%|█         | 501/5000 [03:31<31:24,  2.39it/s, 0.291]

aaaaaabbbbbbbbbcccccccccccccc, aaaabbbbbbbbcccccccccccc, aaaaaaabbbcccccccccc, aaaabbbbbccccccccc, aaaaaaabbbccccc, aaaaaaabccccccc, aaaaabbcccccc, abbbbbbbbccccccccc, aabbbbbcccccccc, abbbbbcccccc


 11%|█         | 551/5000 [03:53<31:11,  2.38it/s, 0.268]

aaaaaaaabbbbbbbccccccccccccccc, aaaaaaabbbbbccccccccccccc, aaaaaaaabbbccccccccccc, aaaaaaabbbbccccccccc, aaaaabbbbccccccccc, aaaaaaabbccccccc, aabbbbbbcccccccc, aabbbbbcccccccc, aabbbbcccccc, aaaaaaaaabbbbbccccccbcccccccccc


 12%|█▏        | 601/5000 [04:14<30:48,  2.38it/s, 0.255]

aaaaaabbbbbbbbcccccccccccccc, aaaaabbbbbbbbcccccccccccccc, aaaaaabbbbcccccccccc, aaaaabbbcccccccc, abbbbbbbbbccccccccc, aaaaaaabcccccccc, aaaaaaaaaaabbbbbccccccccccccccccc, aaaaabbccccccc, aabbcccc, baaaacbbbcccccccccc


 13%|█▎        | 651/5000 [04:35<30:30,  2.38it/s, 0.244]

aaaaaaabbbbbbbbbcccccccccccccccc, aaaaaaaabbbbbbbbbccccccccccccccc, aaaaaaabbbbbcccccccccccc, aaaaabbbbbbbcccccccccccc, aaaaaabbbbbccccccccccc, aaabbbbbbbcccccccccc, aaaaaaaabccccccccc, abbbbbbbcccccccc, aaaaaaaaccbccccccccc, aabbccccc


 14%|█▍        | 701/5000 [04:56<29:58,  2.39it/s, 0.237]

aaaaaaabbbbbcccccccccccc, aaaaabbbbbbbcccccccccccc, aaaaaaaaaabbbbbccccccccccccccc, aaaabbbbbbcccccccccc, aabbbbbbcccccccc, abbbbbbbcccccccc, abbbbbbbcccccccc, aabbbbbccccccc, aaabbccccc, aaaabccaacccccc


 15%|█▌        | 751/5000 [05:17<29:46,  2.38it/s, 0.233]

aaaaaabbbbbbbbbccccccccccccccc, aaaabbbbbbbccccccccccc, aaaaabbbbbcccccccccc, aabbbbbbbbcccccccccc, aaaaabbbbccccccccc, abbbbbbbbccccccccc, aaaaaabbbbbcccccccccccc, aaabbbcccccc, aabbbccccc, abbbbccccc


 16%|█▌        | 801/5000 [05:38<29:27,  2.38it/s, 0.229]

aaaaaaaaabbbbbbccccccccccccccc, aaaaabbbbbbbcccccccccccc, aaaaaaaabbbbcccccccccccc, aaaaaaaabbbbccccccccccc, aaaaaaabbbbccccccccccc, aaaaaabbbbcccccccccc, aaaaaabbbbcccccccccc, aaabbbbbbbbbbccccccccccccc, aaaaaaaaccccccccc, 


 17%|█▋        | 851/5000 [05:59<29:05,  2.38it/s, 0.226]

aaaaaaabbbbbbbbccccccccccccccc, aaaaaaabbbbbbccccccccccccc, aaaaaaabbbbbbccccccccccccc, aaaaabbbbbbbcccccccccccc, aaabbbbbbbbccccccccccc, aaaaaabbbbcccccccccc, aaaaaaaaacccccccccc, aaaaaaaaaaacccccccccccc, aaaaaabbbbcbccccccccccc, aabbcc


 18%|█▊        | 901/5000 [06:21<28:44,  2.38it/s, 0.225]

aaaaabbbbbbbbbcccccccccccccc, aaaaaaaabbcccccccccc, aaabbbbbbbbbbccccccccccccc, aaaaaaaaabcccccccccc, aabbbbbbbbbbcccccccccccc, abbbbbbbbccccccccc, aaaaaaaabbbbbcbccccccccccccc, aaaaaabcccccc, aaabbcccc, abbcccc


 19%|█▉        | 951/5000 [06:42<28:22,  2.38it/s, 0.224]

aaaaaabbbbbbbccccccccccccc, aaabbbbbbbbccccccccccc, aaabbbbbbccccccccc, aaaaaaaaabbbbbbbccccccccccccccccc, abbbbbbbbbcccccccccc, aaaaabbccccccc, aabbbccccc, aabbbccccc, aabbcccc, abbcccc


 20%|██        | 1001/5000 [07:03<27:59,  2.38it/s, 0.221]

aaaaaaabbbbbbbcccccccccccccc, aaaaaaabbbbbcccccccccccc, aaaabbbbbbbbbccccccccccccc, aabbbbbbbbcccccccccc, aaaaaaabbccccccccc, aaaaaaabbccccccccc, aaaaaaaaaabbcccccccccccc, aaabbbbbcccccccc, aaaaaabccccccc, aaabbccccc


 21%|██        | 1051/5000 [07:24<27:40,  2.38it/s, 0.221]

aabbbbbbbbcccccccccc, aaaaaabbbbcccccccccc, aaabbbbbbccccccccc, aaaaaaaaaabbcccccccccccc, aaaaaaabcccccccc, abbbbbbbbbccccccccc, aaaabbcccccc, aaabbbcccccc, abbbcccc, abbbcccc


 22%|██▏       | 1101/5000 [07:45<27:22,  2.37it/s, 0.227]

aaaaaaabbbbbbbcccccccccccccc, aaaaabbbbbbbbccccccccccccc, aaaaaaaabbbbcccccccccccc, aaaaaabbbbbccccccccccc, aaabbbbbbbbbcccccccccccc, aaaabbbbbbbbbbcccccccccccccc, abbbbbbbccccccc, aaaabbbccccccc, abbbbbcccccc, abbbcccc


 23%|██▎       | 1151/5000 [08:07<27:05,  2.37it/s, 0.231]

aaaaaabbbbbbbbcccccccccccccc, aaaaaaabbbbbbccccccccccccc, aaaaaaabbbbbbccccccccccccc, aaaaaaaabbbbcccccccccccc, aaaaaaaaaabbbbbbcccccccccccccccc, aaaaaabbbbbccccccccccc, aaaaabbbbbcccccccccc, aaaaaaabbccccccccc, aaaaaaaaabcccccccccc, aabbbccccc


 24%|██▍       | 1201/5000 [08:28<26:33,  2.38it/s, 0.224]

aaaaaaaaabbbbbbccccccccccccccc, aaaaaaaabbbbccccccccccc, aaaaaabbbbbccccccccccc, aaaaaaaaabcccccccccc, abbbbbbbbccccccccc, aaaaaabbbccccccccc, aaaaaaabcccccccc, aaaaaaabcccccccc, aaabbbcccccc, abbbcccc





KeyboardInterrupt: 

In [36]:
# Generate 10 sequences of at most 150 steps at temperature tau=1e-3.
generate(10, 150, tau=1e-3)

['aaaaaaaabbbbbbbccccccccccccccc',
 'aaaaaaabbbbbbbbccccccccccccccc',
 'aaaaaaaaabbbbbcccccccccccccc',
 'aaaaabbbbbbbcccccccccccc',
 'aaaaaaabbbbbcccccccccccc',
 'aaabbbbbbbbbcccccccccccc',
 'aaaaaaaaabbccccccccccc',
 'aabbbbbbbccccccccc',
 'aabbbbbccccccc',
 'abbbbccccc']

In [None]:
# generate(as_matrix(["\x02" + "First, have theano installed." + "\x03"]), 75, tau=1e-5, n_samples=200)

Save the trained model

In [None]:
import time
import pickle

# Parameter values are collected separately for the embedding layer and for
# everything below the decoder's projection layer.
weights = dict(
    l_embed_char=lasagne.layers.get_all_param_values(l_embed_char),
    l_decoder_reembedder=lasagne.layers.get_all_param_values(l_decoder_reembedder),
)

# The file name carries a timestamp; the payload is (version, vocabulary, weights).
filename = "trained_MyFirstBotMK2_with-stack_model_%s.pkl" % time.strftime("%Y%m%d-%H%M%S")
with open(filename, "wb") as fout:
    pickle.dump(("1.0", vocab, weights), fout)

In [None]:
# list(zip(trainable[1:], weights["l_decoder_reembedder"]))