In [398]:
#nn-utilities

import numpy as np
import theano.tensor as T
import theano
from collections import OrderedDict

def numpy_floatX(data):
    return numpy.asarray(data, dtype=config.floatX)

#tparams is dictionary to theano variables
#pars is strings of parameters to include
#R -> Parameters -> [String] -> R
def weight_decay(decay_c, tparams, pars):
    tdecay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
    total = 0.
    #do this in a for loop because the parameters may be different dimensions - so it's awkward to concatenate.
    for name in pars:
        total += (tparams[name] ** 2).sum()
    return decay_c * total

# Int -> Int -> Bool -> [(Int, [Int])], enumerated list of minibatch indices.
def get_minibatches_idx(n, minibatch_size, shuffle=False):
    """
    Used to shuffle the dataset at each iteration.
    """
    #idx_list = [0..(n-1)]
    idx_list = np.arange(n, dtype="int32")
    
    if shuffle:
        np.random.shuffle(idx_list)

    minibatches = []
    minibatch_start = 0
    for i in range(n // minibatch_size):
        minibatches.append(idx_list[minibatch_start:
                                    minibatch_start + minibatch_size])
        minibatch_start += minibatch_size

    if (minibatch_start != n):
        # Make a minibatch out of what is left
        minibatches.append(idx_list[minibatch_start:])

    return zip(range(len(minibatches)), minibatches)

#ERROR: add this
def init_tparams(params):
    tparams = OrderedDict()
    for kk, pp in params.iteritems():
        tparams[kk] = theano.shared(params[kk], name=kk)
    return tparams

# Dict String a -> Dict String (Theano a)
def wrap_theano_dict(params, tparams=None):
    """
    When we reload the model. Needed for the GPU stuff.
    """
    if tparams==None: 
        #if no pointer to a dictionary given, create one
        #TODO: initialize theano variables!
        tparams = OrderedDict()
    for kk, vv in params.iteritems():
        tparams[kk].set_value(vv)
    return tparams

# Dict String (Theano a) -> Dict String a
def unwrap_theano_dict(zipped):
    """
    When we pickle the model. Needed for the GPU stuff.
    """
    new_params = OrderedDict()
    for kk, vv in zipped.iteritems():
        new_params[kk] = vv.get_value()
    return new_params

def hot(choices, n):
    return [n==x for x in range(choices)]

#Int -> c:Int -> R^c
def oneHot(choices, n):
    #return [T.eq(n,x) for x in range(choices)]
    return T.as_tensor_variable([T.eq(n,x) for x in range(choices)])

#?
def zipp(params, tparams):
    """
    When we reload the model. Needed for the GPU stuff.
    """
    for kk, vv in params.iteritems():
        tparams[kk].set_value(vv)


def mapped_oneHot(choices, ns):
    return tmap(lambda x: oneHot(choices,x), ns)

def mapped_mapped_oneHot(choices, nss):
    return tmap2(lambda x: oneHot(choices,x), nss)

def tmap(f, n, fixed=[]):
    x, _ = theano.map(f, n, non_sequences=fixed)
    return x

def tmap2(f,n, fixed=[]):
    return tmap(lambda x: tmap(f, x, non_sequences=fixed), n)


In [399]:
#utilites

import itertools

def case(var, li, else_expr=None):
    for (val, expr) in li:
        if val:
            return expr
    return else_expr

#alternatively use the ternary control operator
#    a if test else b
#http://stackoverflow.com/questions/394809/does-python-have-a-ternary-conditional-operator
#WARNING: THIS IS NOT LAZY
def if_f(expr, t, f):
    if expr:
        return t
    else:
        return f

def ifs(li, else_expr=None):
    for (stmt, val) in li:
        if stmt:
            return val
    return else_expr

def concat(lis):
    return itertools.chain(*lis)

def union(*dicts):
    return dict(sum(map(lambda dct: list(dct.items()), dicts), []))


In [400]:
#optimizers

from collections import OrderedDict
import cPickle as pkl
import sys
import time

import numpy as np
import theano
from theano import config
import theano.tensor as tensor
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

#import utilities
#import nn_utilities

"""
Single steps for various optimizers
Each optimizer returns two (compiled) theano functions (with updates)
# f_grad_shared: calculates the cost, and updates its own parameters
# f_updates: update the neural net weights
Note cost is a Theano variable, not a compiled function.
"""

def sgd(lr, tparams, grads, cost, args):
    """ Stochastic Gradient Descent

    :note: A more complicated version of sgd then needed.  This is
        done like that for adadelta and rmsprop.

    """
    # New set of shared variable that will contain the gradient
    # for a mini-batch.
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    #zip(gshared,grads)

    # Function that computes gradients for a mini-batch, but do not
    # updates the weights.
    f_grad_shared = theano.function(args, cost, updates=gsup,
                                    name='sgd_f_grad_shared')

    pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)]

    # Function that updates the weights from the previously computed
    # gradient.
    f_update = theano.function([lr], [], updates=pup,
                               name='sgd_f_update')

    return f_grad_shared, f_update

def adadelta(lr, tparams, grads, cost, args):
    """
    An adaptive learning rate optimizer

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tpramas: Theano SharedVariable
        Model parameters
    grads: Theano variable
        Gradients of cost w.r.t to parameres
    x: Theano variable
        Model inputs
    mask: Theano variable
        Sequence mask
    y: Theano variable
        Targets
    cost: Theano variable
        Objective fucntion to minimize

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.
    """

    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(args, cost, updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared')

    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads,
                                     running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update


def rmsprop(lr, tparams, grads, cost, args):
    """
    A variant of  SGD that scales the step size by running average of the
    recent step norms.

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tpramas: Theano SharedVariable
        Model parameters
    grads: Theano variable
        Gradients of cost w.r.t to parameres
    x: Theano variable
        Model inputs
    mask: Theano variable
        Sequence mask
    y: Theano variable
        Targets
    cost: Theano variable
        Objective fucntion to minimize

    Notes
    -----
    For more information, see [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
    """

    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                   name='%s_rgrad' % k)
                     for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(args, cost,
                                    updates=zgup + rgup + rg2up,
                                    name='rmsprop_f_grad_shared')

    updir = [theano.shared(p.get_value() * numpy_floatX(0.),
                           name='%s_updir' % k)
             for k, p in tparams.iteritems()]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
                                            running_grads2)]
    param_up = [(p, p + udn[1])
                for p, udn in zip(tparams.values(), updir_new)]
    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore',
                               name='rmsprop_f_update')

    return f_grad_shared, f_update

def train(
    init_params, # initial parameters (not in Theano)
    data_train, # : a (should be list of some sort)
    data_valid, # : a
    # data_test, # : a
    batch_maker, # : Int -> a -> [[b]] 
        #function that given the batch size and data, returns a list of list of batch identifiers (ex. Int)
    get_data_f, # : [[b]] -> (a -> train)
        #function that given a list of list of batch identifiers, gives a function that takes the data and gives training
    cost, # : (train -> Theano Float)
        #cost function
    pred_error,
    args,
    tparamss,
    patience=10,  # Number of epoch to wait before early stop if no progress
    max_epochs=5000,  # The maximum number of epoch to run,
    dispFreq=10,  # Display to stdout the training progress every N updates
    optimizer=rmsprop,
    saveto='model.npz',
    validFreq=370,  # Compute the validation error after this number of update.
    saveFreq=1110,  # Save the parameters after every saveFreq updates
    batch_size=16,  # The batch size during training.
    valid_batch_size=64,  # The batch size used for validation/test set.
):
    print 'Building model'

    ## initialize a theano variable dictionary with parameter values from init_params
    ## tparamss = [wrap_theano_dict(init_param) for init_param in init_params]
    ## careful of overlapping...

    # tparamss is a list of dictionaries (CHECK THIS)
    # ? Are these theano dicts?
    tparams = union(*tparamss)

    # ! use_noise is for dropout
    """(use_noise, x, mask,
     y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)"""
    
    # Get values of tparams (dictionary of all parameters)
    all_params=tparams.values()
    ## concat([tparams.values() for tparams in tparamss])

    # Compile the theano functions. 
    ## I don't think this commend is true - Note "args" contains information about the length of the vectors, so this effectively locks in the sequence length. IS THIS TRUE?
    # Compile the cost function.
    # cost : train -> Theano Float
    f_cost = theano.function(args, cost, name='f_cost')

    # Take the gradient, and compile that too.
    print("grad")
    print(cost)
    print(all_params)
    grads = theano.gradient.grad(cost, wrt=all_params) #ERROR
    f_grad = theano.function(args, grads, name='f_grad')

    #learning rate
    lr = tensor.scalar(name='lr')
    # The optimizer takes as arguments the learning rate, parameter dictionary (of theano vars), gradient and cost function, and arguments to those functions. All inputs are theano variables.
    # optimizer returns the f_grad_shared and the update function
    # ? what does f_grad_shared do?
    f_grad_shared, f_update = optimizer(lr, tparams, grads, cost, args)

    # Now starting optimization
    print 'Optimization'

    # The data can be in two forms ([a], [b]) or [a].
    # The length of ([a],[b]) or [a]
    def _len(li_or_pair):
        if type(li_or_pair)=="tuple":
            return len(li_or_pair[0])
        else:
            return len(li_or_pair)

    # If input is ([a],[b]), gives [a]; if input is [a] just gives [a]
    def get_first_if_tuple(maybe_tuple):
        if type(maybe_tuple)=="tuple":
            return maybe_tuple[0]
        else:
            return maybe_tuple

    # length of training data
    l_train = _len(data_train)
    l_valid = _len(data_valid)
    ## Ignore test right now.
    ## length of validation data
    ## l_test = _len(data_test)

    # batch_maker : a -> [[b]] 
    # function that given the data, returns a list of list of batch identifiers (ex. Int)
    # this is for the validation data. We run batch_maker on the training data inside the epoch loop.
    valid_batch_ids = batch_maker(valid_batch_size, data_valid)
    #test_batch_ids = batch_maker(data_test)

    # print the length of training data
    print "%d train examples" % l_train
    print "%d valid examples" % l_valid
    #print "%d test examples" % l_test
    
    # initialize history_errs, which will contain the validation errors from each time it checks the validation error.
    # best_p ?
    # bad count to 0.
    history_errs = []
    best_p = None
    bad_count = 0

    # if no validation frequency is give, validate once an epoch
    # (the length of an epoch is l_train / batch_size because each iteration ? takes batch_size samples.)
    if validFreq == -1:
        validFreq = l_train / batch_size
    #if no save frequency is give, validate once an epoch
    if saveFreq == -1:
        saveFreq = l_train / batch_size

    uidx = 0  # the number of updates done (increment by 1 every time we look at a batch and make an update)
    estop = False  # early stop
    start_time = time.time()
    try:
        #EPOCH LOOP
        #epoch index. (An epoch means going through the data once.)
        for eidx in range(max_epochs):
            # ?
            n_samples = 0
            
            # initialize epoch:
            # Call batch_maker to partition the training data into batches.
            # (ex. get the list of shuffled indices for the training set)
            ## kf = get_minibatches_idx(l_train, batch_size, shuffle=True)
            batch_ids = batch_maker(batch_size, data_train)
            print("batch_ids", batch_ids)
            #BATCH LOOP
            for batch_id in batch_ids: #ERROR
                # batch_id is a batch (note batch_maker has the indices, not the actual data). Call get_data_f to get the actual data.
                # increase number of updates done by 1
                uidx += 1
                ## use_noise.set_value(1.)
                
                # Select the random examples for this minibatch
                """
                if type(train)=="tuple":
                    inputs = map(lambda li: [li[t] for t in train_index], list(train))
                else:
                    #only 1 argument. also wrap up in single-element list for consistency.
                    inputs = [[train[t] for t in train_index]]
                n_samples += args[0].shape[0]
                """
                print("batch_id",batch_id)
                # get the batch 
                # [[b]] -> (a -> train)
                batch = get_data_f(data_train, batch_id)
                ## n_samples += batch[0].shape[0]
                # Expect batch to be a list or a tuple representing multiple arguments. If it's a single argument, wrap it in a list so we can use *batch to unpack the arguments.
                if not isinstance(batch, (list, tuple)):
                    batch = [batch]
                # Compute the cost 
                cost = f_grad_shared(*batch)
                # f_update updates the ? given the learning rate.
                f_update(lrate)

                # if the cost is infinite or undefined, stop.
                if np.isnan(cost) or np.isinf(cost):
                    print 'bad cost detected: ', cost
                    return 1., 1., 1.

                # Display if it's time to do so.
                if np.mod(uidx, dispFreq) == 0:
                    print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost

                # Save if it's time to do so.
                if saveto and np.mod(uidx, saveFreq) == 0:
                    print 'Saving...',
                    
                    # save the best parameters---not the current ones.
                    if best_p is not None:
                        params = best_p
                    else:
                        params = unwrap_theano_dict(tparams)
                    
                    # Save the arrays into "saveto" (which should be a .npz file).
                    # Save the history of errors.
                    # ? params are saved with the labels given by the dictionary.
                    np.savez(saveto, history_errs=history_errs, **params)
                    
                    ## Warning: python2 notation
                    # Dump the model options into [saveto].pkl.
                    pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
                    print 'Done'

                # If it's time to validate
                if np.mod(uidx, validFreq) == 0:
                    ## use_noise.set_value(0.)
                    # For each batch identifier in batch,
                    # get that batch from data_train
                    # and calculate prediction error.
                    # Sum all these errors
                    train_err = sum([pred_error(get_data_f(data_train, batch_id)) for batch_id in batch])/data_train.size[0]
                    # Do the same for the validation error.
                    valid_err = sum([pred_error(get_data_f(data_valid, batch_id)) for batch_id in batch_valid])/data_valid.size[0]
                    ## test_err = sum([pred_error(get_data_f(data_test, batch_id)) for batch_id in batch])/data_test.size[0]
                    
                    # record the validtion error in the history.
                    history_errs.append(valid_err) #[valid_err, test_err])

                    # if the validation error is smaller than any seen so far
                    if (best_p is None or
                        valid_err <= np.array(history_errs)[:, 0].min()):
                        # then save the parameters to best_p
                        best_p = unzip(tparams)
                        # set bad_counter to 0.
                        bad_counter = 0
                    
                    # Show the training and validation error.
                    print ('Train ', train_err, 'Valid ', valid_err) #,
                           #'Test ', test_err)
                    
                    # If the current validation error is greater than the minimum validation error up to <patience> trials ago (why aren't we looking at the past few?), add 1 to bad_counter 
                    if (len(history_errs) > patience and
                        valid_err >= np.array(history_errs)[:-patience, 0].min()):
                        bad_counter += 1
                        #If this happens more times than patience allows, then signal that we stopped early and stop.
                        if bad_counter > patience:
                            print 'Early Stop!'
                            estop = True
                            break
            
            # Show number of samples seen. 
            # ! This is currently 0.
            print 'Seen %d samples' % n_samples

            # If early stop activated, then stop.
            if estop:
                break

    except KeyboardInterrupt:
        print "Training interupted"

    end_time = time.time()
    if best_p is not None:
        # not sure what this does
        # Puts the best_p (best parameters) into a dictionary?
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

##    use_noise.set_value(0.)
    

    ## kf_train_sorted = get_minibatches_idx(_len(train), batch_size)
    # At the very end, calculate the training and validation error again.
    kf_train_sorted = batch_maker(batch_size, data_train)
    # Calculate training and validation error (note we also did this with [validFreq] frequency)
    train_err = sum([pred_error(get_data_f(data_train, batch_id)) for batch_id in batch])/data_train.size[0]
    valid_err = sum([pred_error(get_data_f(data_valid, batch_id)) for batch_id in batch_valid])/data_valid.size[0]

    print 'Train ', train_err, 'Valid ', valid_err #, 'Test ', test_err

    # Final save. Save the training error, validation error, history of errors, and the best parameters (unpacked)
    if saveto:
        np.savez(saveto, train_err=train_err,
                    valid_err=valid_err, #test_err=test_err,
                    history_errs=history_errs, **best_p)
    # How long the code took to run
    print 'The code run for %d epochs, with %f sec/epochs' % (
        (eidx + 1), (end_time - start_time) / (1. * (eidx + 1)))
    print >> sys.stderr, ('Training took %.1fs' %
                          (end_time - start_time))
    return train_err, valid_err #, test_err


In [401]:
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.nnet import *
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from collections import OrderedDict

"""Parameters"""
def unpack_params(tparams, li):
    print("unpack", tparams, li, type(tparams),type(li))
    return [tparams[name] for name in li]

"""Basic NN's"""
def nn_layer1(x, W, b):
    return T.dot(x,W) + b #ERROR x*W+b

#R^m -> Parameters -> R^p, where W::R^{m x p} and b::R^p.
def nn_layer(x, tparams):
    W, b = unpack_params(tparams, ["W", "b"])
    return nn_layer1(x, W, b)

#R^k -> R^k -> R
def logloss(pred, actual):
    #sum on innermost axis.
    return -(actual * corrected_log(pred)).sum(axis=-1)
#CHECK THAT THIS MAPS

#Warning: this doesn't map.
#R^k -> Nat -> R
def logloss_i(pred, actual_i):
    return -corrected_log(pred[actual_i])

#R -> R
def corrected_log(x):
    return T.log(T.maximum(1e-6,x)) #ERROR: maximum, not max

#return dictionary of parameters (with default random initialization)
def init_params_nn(n, m, init='zeros'):
    rand_f = lambda l1, l2: np.asarray(np.random.normal(0, 1/np.sqrt(n), (l1,l2)))
    (f, g) = case(init,
                [('zeros', (np.zeros, lambda c, r: np.zeros((c, r)))),
                 ('rand', (np.zeros, rand_f))])
    return init_params_with_f_nn(n,m,f,g)

#returns a dictionary of parameters, initialized using the functions f and g.
def init_params_with_f_nn(n,m,f,g):
    pairs = [("W",np.asarray(g(n, m))),
             ("b",np.asarray(f(m)))]
    """
    pairs = [("W",T.as_tensor_variable(np.asarray(g(n, m)))),
             ("b",T.as_tensor_variable(np.asarray(f(m))))]"""
    return OrderedDict(pairs)


"""LSTM functions"""
#x, C, h are the inputs, and C1, h1 as the outputs.
#the rest are weight vectors.
def step_lstm1(x, C, h, Wf, bf, Wi, bi, WC, bC, Wo, bo):
    print(h)
    print(x)
    hx = T.concatenate([h,x],axis=-1) #dimension m+n
    f = sigmoid(T.dot(hx, Wf) + bf) #dimension m
    i = sigmoid(T.dot(hx, Wi) + bi) #dimension m
    C_add = T.tanh(T.dot(hx, WC) + bC) #dimension m
    C1 = f * C + i * C_add #dimension m
    o = sigmoid(T.dot(hx, Wo) + bo) #dimension m
    h1 = o * T.tanh(C1) #dimension m
    return [C1, h1] #dimension 2m (as 2 separate lists)

#the same function, but with the parameters grouped together.
#R^n -> R^m -> R^m -> Parameters -> (R^m, R^m)
def step_lstm(x, C, h, tparams): 
    Wf, bf, Wi, bi, WC, bC, Wo, bo = unpack_params(tparams, ["Wf", "bf", "Wi", "bi", "WC", "bC", "Wo", "bo"])
    return step_lstm1(x, C, h, Wf, bf, Wi, bi, WC, bC, Wo, bo)

#Now for scanning and mapping
#1. unfold step_lstm into something that accepts and gives a sequence
#2. make it something that will operate on a whole batch of sequences
#R^m -> R^m -> R^{s x n} -> Parameters -> (R^{s x m}, R^{s x m})
def sequence_lstm(C0, h0, xs, tparams):
    #we need tparams because we need a link to the shared variables.
    #CHECK: please check that this gives the weights in the right order.
    print("in sequence_lstm")
    print('tparams', tparams)
    Wf, bf, Wi, bi, WC, bC, Wo, bo = unpack_params(tparams, ["Wf", "bf", "Wi", "bi", "WC", "bC", "Wo", "bo"])
    #the function fn should have arguments in the following order:
    #sequences, outputs_info (accumulators), non_sequences
    #(x, C, h, Wf, bf, Wi, bi, WC, bC, Wo, bo)
    ([C_vals, h_vals], updates) = theano.scan(fn=step_lstm1,
                                          sequences = xs, 
                                          outputs_info=[C0, h0], #initial values of the memory/accumulator
                                          non_sequences=[Wf, bf, Wi, bi, WC, bC, Wo, bo]) #fixed parameters
                                          #strict=True)
    return [C_vals, h_vals]

#play around with numpy to see how things map, to define step_multiple_lstm.

def sequence_multiple_lstm1(Cs0, hs0, xss, tparams):
    return tmap(f, [Cs0, hs0, xss], tparams)

def step_multiple_lstm(xs, C, h, tparams):
    #Everything maps automatically. (We've only used matrix multiplication and scalar functions like sigmoid.)
    return step_lstm(xs, C, h, tparams)

def sequence_multiple_lstm(Cs0, hs0, xss, tparams):
    #Everything maps. Note xss, Cs0, hs0 must be Theano matrices, not a list of Theano lists.
    #However, the input will be $((R^n)^k)^s$ so we need to switch axes.
    #Dimensions count inwards        2  1  0
    xss2 = xss.dimshuffle([1,0,2]) #ERROR. Check above logic.
    return sequence_lstm(Cs0, hs0, xss2, tparams)

"""Functions to evaluate the NN's and calculate loss"""
#unmapped version. taking indices
def fns_lstm(C0, h0, xis, yi, tparams1, tparams2):
    #, last_only = True):
    #evaluate the LSTM on this sequence
    print('tparams1', tparams1)
    [C_vals, h_vals] = sequence_lstm(C0, h0, xis, tparams1)
    #it's simpler to get both the function for the last and the function for all
    """ 
    if last_only:
        h_vals = h_vals[-1]
        C_vals = C_vals[-1]
    """
    #feed into the neural net and get vector of activations
    acts = nn_layer(h_vals, tparams2) #
    #prediction is the argmax value. Take argmax along innermost (-1) axis
    pred = T.argmax(acts, axis=-1)
    #loss function
    loss = logloss(acts, yi)
    acts_last = acts[-1]
    pred_last = pred[-1]
    loss_last = loss[-1]
    #1 if predicted next one correctly, 0 otherwise
    acc_last = yi[pred_last]
    return acts_last, pred_last, loss_last, acc_last, acts, pred, loss

#is ALMOST THE SAME as above...
def fns_multiple_lstm(b,m, xis, yi, (tparams1, tparams2)):
    #C0 = T.matrix("C0")
    #h0 = T.matrix("h0")
    #ERROR: https://groups.google.com/forum/#!topic/theano-users/fSgdabbhmDg
    #C0.tag.test_value = np.zeros((b,m))
    #h0.tag.test_value = np.zeros((b,m))
    C0 = T.as_tensor_variable(np.zeros((b,m))) #as tensor variable
    h0 = T.as_tensor_variable(np.zeros((b,m)))
    #evaluate the LSTM on this sequence
    #? xis is b*s*n matrix, where n is dim of space of inputs, s is sequence length, and b is number in batch. 
    # Check ordering. 
    [C_vals, h_vals] = sequence_lstm(C0, h0, xis, tparams1)
    # feed into the neural net and get vector of activations
    # acts is b*s*n matrix (? hope it maps) Does this mean n is the same?
    acts = nn_layer(h_vals, tparams2) #ERROR
    # prediction is the argmax value. Take argmax along innermost (-1) axis
    # pred is b*s vector
    pred = T.argmax(acts, axis=-1)
    #loss function
    # loss is b*s matrix
    loss = logloss(acts, yi)
    # acts_last is b*s matrix 
    acts_last = acts[:,-1] #ERROR: replace ... with :
    pred_last = pred[:,-1]
    loss_last = loss[:,-1]
    #1 if predicted next one correctly, 0 otherwise
    #http://stackoverflow.com/questions/33929368/how-to-perform-a-range-on-a-theanos-tensorvariable
    print("fns_multiple_lstm")
    print(xis.shape)
    print(yi)
    print(pred_last)
    print(xis.shape[0])
    # (0,pred_last[0]), (1, pred_last[1]),...
    acc_last = yi[T.arange(xis.shape[0]),pred_last]
    #http://stackoverflow.com/questions/23435782/numpy-selecting-specific-column-index-per-row-by-using-a-list-of-indexes
    return acts_last, pred_last, loss_last, acc_last, acts, pred, loss

#return dictionary of parameters (with default random initialization)
def init_params_lstm(n, m, init='zeros'):
    #normalize this!
    rand_f = lambda l1, l2: np.asarray(np.random.normal(0, 1/np.sqrt(n), (l1,l2)))
    (f, g) = case(init,
                [('zeros', (np.zeros, lambda c, r: np.zeros((c, r)))),
                 ('rand', (np.zeros, rand_f))])
    return init_params_with_f_lstm(n,m,f,g)

#returns a dictionary of parameters, initialized using the functions f and g.
def init_params_with_f_lstm(n,m,f,g):
    pairs = [("Wf",np.asarray(g(m+n, m))),
             ("bf",np.asarray(f(m))),
             ("Wi",np.asarray(g(m+n, m))),
             ("bi",np.asarray(f(m))),
             ("WC",np.asarray(g(m+n, m))),
             ("bC",np.asarray(f(m))),
             ("Wo",np.asarray(g(m+n, m))), #ERROR
             ("bo",np.asarray(f(m)))]
    """
    pairs = [("Wf",T.as_tensor_variable(np.asarray(g(m+n, m)))),
             ("bf",T.as_tensor_variable(np.asarray(f(m)))),
             ("Wi",T.as_tensor_variable(np.asarray(g(m+n, m)))),
             ("bi",T.as_tensor_variable(np.asarray(f(m)))),
             ("WC",T.as_tensor_variable(np.asarray(g(m+n, m)))),
             ("bC",T.as_tensor_variable(np.asarray(f(m)))),
             ("Wo",T.as_tensor_variable(np.asarray(g(m+n, n)))),
             ("bo",T.as_tensor_variable(np.asarray(f(m))))]
             """
    return OrderedDict(pairs)

"""
#Int^b -> R^{l * n} -> (R^{b * s * n}, R^{b * n})
def get_data_f(indices, data):
    #given indices, get the sequences in data starting at those indices.
    #(seqs, ys)\in R^{b*s*n} * R^{b*n}
    return ([data[i:i+s] for i in indices], [data[i+s-1] for i in indices])
    #does s include last? 
"""
def get_data_f(li, batch_ids):
    print(batch_ids)
    _, bids = batch_ids
    return ([li[i:i+s] for i in bids], [li[i+s] for i in bids])

#li's are sequences, ex. [0,3,2,1,1,3,1]
#the elements of the sequence are in [0..(n-1)], ex. n=4 above
#m is the memory size
#s is the sequence length, ex. 3 divides the above into [0,3,2],..,[1,3,1]. 
##li_test=[]
def train_lstm(li_train, li_valid, n, m, s, batch_size, valid_batch_size=-1):
    if valid_batch_size == -1:
        valid_batch_size = batch_size
    #turns li_train, etc. into one-hot vectors. (li_train is a list of characters.)
    hot_li_train, hot_li_valid = [map(lambda x: hot(n, x), li) for li in [li_train, li_valid]]
    ##hot_li_test
    #n_seqs_train, n_seqs_valid, n_seqs_test = [len(li) - s + 1 for li in [li_train, li_valid, li_test]]
    #note alternatively we can keep it as a single tensor...

    def batch_maker(batch_size, data):
        return get_minibatches_idx(len(data)-s, batch_size, shuffle=True)

    # note this gives a tuple right now.
    """
    def get_data_f(batch_ids, li):
        print(batch_ids)
        _, bids = batch_ids
        return ([[li[x] for x in range(i, i+s)] for i in bids], [li[i+s] for i in bids])
    """

    xis = T.dtensor3('xis')
    yi = T.dmatrix('yi')
    tparams1 = init_tparams(init_params_lstm(n,m,'rand'))
    tparams2 = init_tparams(init_params_nn(m,n,'rand'))
    #warning: locks in batch size number
    _,_,loss,acc,_,_,_ = fns_multiple_lstm(batch_size, m, xis, yi, (tparams1, tparams2))
    cost = loss.sum() #ERROR: add this
    print("cost",cost)
    err = 1 - acc
    #warning, these require m as argument.
    #loss_f = function([xis,yi],loss)
    #acc_f = function([xis,yi],acc)
    #ERROR: '' around keys
    arg_dict = {'init_params' : init_params_lstm(n, m),
                'data_train' : hot_li_train, 
                'data_valid' : hot_li_valid,
                'batch_maker' : batch_maker,
                'get_data_f' : get_data_f,
                'cost' : cost, 
                'pred_error' : err, 
                'args' : [xis,yi], 
                'tparamss' : [tparams1, tparams2], 
                'patience' : 10, 
                'max_epochs' : 5000, 
                'dispFreq' : 10, 
                'optimizer' : rmsprop,
                'saveto' : 'lstm.npz',
                'validFreq' : 500,
                'saveFreq' : 1000,
                'batch_size' : 16,
                'valid_batch_size' : 64}
                
    train(**arg_dict)


In [404]:
train_lstm([0,0,0,1,1,1,2,2,2,3,3,3,3], [0,0,1,1,2,2,3,3], 4, 3, 3, 2)

in sequence_lstm
('tparams', OrderedDict([('Wf', Wf), ('bf', bf), ('Wi', Wi), ('bi', bi), ('WC', WC), ('bC', bC), ('Wo', Wo), ('bo', bo)]))
('unpack', OrderedDict([('Wf', Wf), ('bf', bf), ('Wi', Wi), ('bi', bi), ('WC', WC), ('bC', bC), ('Wo', Wo), ('bo', bo)]), ['Wf', 'bf', 'Wi', 'bi', 'WC', 'bC', 'Wo', 'bo'], <class 'collections.OrderedDict'>, <type 'list'>)
<TensorType(float64, matrix)>
xis[t]
('unpack', OrderedDict([('W', W), ('b', b)]), ['W', 'b'], <class 'collections.OrderedDict'>, <type 'list'>)

INFO (theano.gof.compilelock): Refreshing lock /n/homeserver2/user2a/holdenl/.theano/compiledir_Linux-2.6-el6.x86_64-x86_64-with-redhat-6.7-Pisa-x86_64-2.7.10-64/lock_dir/lock
INFO:theano.gof.compilelock:Refreshing lock /n/homeserver2/user2a/holdenl/.theano/compiledir_Linux-2.6-el6.x86_64-x86_64-with-redhat-6.7-Pisa-x86_64-2.7.10-64/lock_dir/lock



fns_multiple_lstm
Shape.0
yi
Subtensor{::, int64}.0
Subtensor{int64}.0
('cost', Sum{acc_dtype=float64}.0)
Building model
grad
Sum{acc_dtype=float64}.0
[Wf, bf, b, WC, bC, Wo, bo, bi, Wi, W]
Optimization
13 train examples
8 valid examples
('batch_ids', [(0, array([6, 8, 3, 2, 7, 1, 5, 9, 0, 4], dtype=int32))])
('batch_id', (0, array([6, 8, 3, 2, 7, 1, 5, 9, 0, 4], dtype=int32)))
(0, array([6, 8, 3, 2, 7, 1, 5, 9, 0, 4], dtype=int32))


ValueError: all the input array dimensions except for the concatenation axis must match exactly
Apply node that caused the error: Join(TensorConstant{1}, <TensorType(float64, matrix)>, xis[t])
Toposort index: 0
Inputs types: [TensorType(int8, scalar), TensorType(float64, matrix), TensorType(float64, matrix)]

HINT: Use another linker then the c linker to have the inputs shapes and strides printed.
HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.
Apply node that caused the error: forall_inplace,cpu,scan_fn}(Shape_i{0}.0, Subtensor{int64:int64:int8}.0, IncSubtensor{InplaceSet;:int64:}.0, IncSubtensor{InplaceSet;:int64:}.0, Wf, Wi, WC, Wo, InplaceDimShuffle{x,0}.0, InplaceDimShuffle{x,0}.0, InplaceDimShuffle{x,0}.0, InplaceDimShuffle{x,0}.0)
Toposort index: 129
Inputs types: [TensorType(int64, scalar), TensorType(float64, 3D), TensorType(float64, 3D), TensorType(float64, 3D), TensorType(float64, matrix), TensorType(float64, matrix), TensorType(float64, matrix), TensorType(float64, matrix), TensorType(float64, row), TensorType(float64, row), TensorType(float64, row), TensorType(float64, row)]
Inputs shapes: [(), (10, 3, 4), (11, 2, 3), (11, 2, 3), (7, 3), (7, 3), (7, 3), (7, 3), (1, 3), (1, 3), (1, 3), (1, 3)]
Inputs strides: [(), (96, 32, 8), (48, 24, 8), (48, 24, 8), (24, 8), (24, 8), (24, 8), (24, 8), (24, 8), (24, 8), (24, 8), (24, 8)]
Inputs values: [array(10), 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', array([[ 0.,  0.,  0.]]), array([[ 0.,  0.,  0.]]), array([[ 0.,  0.,  0.]]), array([[ 0.,  0.,  0.]])]
Outputs clients: [[Subtensor{int64:int64:int64}(forall_inplace,cpu,scan_fn}.0, ScalarFromTensor.0, ScalarFromTensor.0, Constant{-1}), Subtensor{int64:int64:int64}(forall_inplace,cpu,scan_fn}.0, ScalarFromTensor.0, ScalarFromTensor.0, Constant{-1})], [Subtensor{int64:int64:int8}(forall_inplace,cpu,scan_fn}.1, ScalarFromTensor.0, ScalarFromTensor.0, Constant{1}), Subtensor{int64:int64:int64}(forall_inplace,cpu,scan_fn}.1, ScalarFromTensor.0, ScalarFromTensor.0, Constant{-1})]]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

In [16]:
np.zeros((2,3))

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

In [21]:
D0 = T.matrix("D0")
D0.tag.test_value = np.zeros((2,3))

In [25]:
k = T.iscalar("k")
A = T.vector("A")

# Symbolic description of the result
result, updates = theano.scan(fn=lambda prior_result, A: prior_result * A,
                              outputs_info=T.ones_like(A),
                              non_sequences=A,
                              n_steps=k)

final_result = result[-1]

# compiled function that returns A**k
power = theano.function(inputs=[A,k], outputs=final_result, updates=updates)

print(power(range(10),2))
print(power(range(10),4))

[  0.   1.   4.   9.  16.  25.  36.  49.  64.  81.]
[  0.00000000e+00   1.00000000e+00   1.60000000e+01   8.10000000e+01
   2.56000000e+02   6.25000000e+02   1.29600000e+03   2.40100000e+03
   4.09600000e+03   6.56100000e+03]


In [28]:
k = T.iscalar("k")
A = np.asarray(range(10))

# Symbolic description of the result
result, updates = theano.scan(fn=lambda prior_result, A: prior_result * A,
                              outputs_info=np.ones(10),
                              non_sequences=A,
                              n_steps=k)

final_result = result[-1]

# compiled function that returns A**k
power = theano.function(inputs=[k], outputs=final_result, updates=updates)

print(power(2))
print(power(4))

[  0.   1.   4.   9.  16.  25.  36.  49.  64.  81.]
[  0.00000000e+00   1.00000000e+00   1.60000000e+01   8.10000000e+01
   2.56000000e+02   6.25000000e+02   1.29600000e+03   2.40100000e+03
   4.09600000e+03   6.56100000e+03]


In [336]:
m=2
n=3

tparams1 = init_params_lstm(n,m,'rand')
tparams2 = init_params_nn(m,n,'rand')

C0 = np.zeros(m)
h0 = np.zeros(m)

xis = T.dmatrix('xis')
yi = T.dvector('yi')


In [337]:
acts_last, pred_last, loss_last, acc_last, acts, pred, loss = fns_lstm(C0, h0, xis, yi, tparams1, tparams2)

#https://groups.google.com/forum/#!topic/theano-users/74LA8It6ouI

('tparams1', OrderedDict([('Wf', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bf', array([ 0.,  0.])), ('Wi', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bi', array([ 0.,  0.])), ('WC', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bC', array([ 0.,  0.])), ('Wo', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bo', array([ 0.,  0.]))]))
in sequence_lstm
('tparams', OrderedDict([('Wf', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bf', array([ 0.,  0.])), ('Wi', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bi', array([ 0.,  0.])), ('WC', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bC', array([ 0.,  0.])), ('Wo',

In [72]:
def clog(x):
    return T.log(T.maximum(1e-6,x)) #ERROR: not max

In [73]:
x = T.dvector('x')
#x.tag.test_value = np.asarray([1,2])
f = theano.function([x],tmap(clog,x))

In [74]:
g = theano.function([x],clog(x))

In [75]:
g([2,3])

array([ 0.69314718,  1.09861229])

In [76]:
f([2,3])

array([ 0.69314718,  1.09861229])

In [77]:
g([0,1,2])

array([-13.81551056,   0.        ,   0.69314718])

In [338]:
tparams2 = init_params_nn(m,n,'rand')
tparams2

OrderedDict([('W', array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])), ('b', array([ 0.,  0.,  0.]))])

In [339]:
W, b = unpack_params(tparams2, ["W", "b"])

('unpack', OrderedDict([('W', array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])), ('b', array([ 0.,  0.,  0.]))]), ['W', 'b'], <class 'collections.OrderedDict'>, <type 'list'>)


In [340]:
ds = [{'c':1},{'a':2,'b':1}]
d = union(*ds)
d

{'a': 2, 'b': 1, 'c': 1}

In [142]:
from collections import OrderedDict
import cPickle as pkl
import sys
import time

import numpy
import theano
from theano import config
import theano.tensor as tensor
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams


In [177]:
s = 3


def batch_maker(batch_size, data):
        return get_minibatches_idx(len(data)-s, batch_size, shuffle=True)

# note this gives a tuple right now.
def get_data_f(batch_ids, li):
    _, bids = batch_ids
    return ([[li[x] for x in range(i, i+s)] for i in bids], [li[i+s] for i in bids])

In [178]:
bids = batch_maker(2, range(20))
print(bids)
bid = bids[0]
print(bid)
_, bs = bid
print(bs)
print(get_data_f(bid, range(20)))

[(0, array([ 3, 12], dtype=int32)), (1, array([ 8, 16], dtype=int32)), (2, array([ 7, 11], dtype=int32)), (3, array([ 4, 15], dtype=int32)), (4, array([ 1, 10], dtype=int32)), (5, array([14, 13], dtype=int32)), (6, array([2, 5], dtype=int32)), (7, array([6, 0], dtype=int32)), (8, array([9], dtype=int32))]
(0, array([ 3, 12], dtype=int32))
[ 3 12]
([[3, 4, 5], [12, 13, 14]], [6, 15])


In [163]:
get_minibatches_idx(6,3)

[(0, array([0, 1, 2], dtype=int32)), (1, array([3, 4, 5], dtype=int32))]

In [None]:
def get_minibatches_idx(n, minibatch_size, shuffle=False):
    """
    Used to shuffle the dataset at each iteration.
    """
    #idx_list = [0..(n-1)]
    idx_list = numpy.arange(n, dtype="int32")
    
    if shuffle:
        numpy.random.shuffle(idx_list)

    minibatches = []
    minibatch_start = 0
    for i in range(n // minibatch_size):
        minibatches.append(idx_list[minibatch_start:
                                    minibatch_start + minibatch_size])
        minibatch_start += minibatch_size

    if (minibatch_start != n):
        # Make a minibatch out of what is left
        minibatches.append(idx_list[minibatch_start:])

    return zip(range(len(minibatches)), minibatches)

In [329]:
m=2
n=3
tparams1 = init_tparams(init_params_lstm(n,m,'rand'))
tparams2 = init_tparams(init_params_nn(m,n,'rand'))
xis = T.dmatrix('xis')
yi = T.dvector('yi')
C0 = T.as_tensor_variable(np.zeros(m)) #as tensor variable
h0 = T.as_tensor_variable(np.zeros(m))
#print(type(tparams1['Wo']))
#print(type(tparams2['W']))
Wf, bf, Wi, bi, WC, bC, Wo, bo = unpack_params(tparams1, ["Wf", "bf", "Wi", "bi", "WC", "bC", "Wo", "bo"])
#print(type(Wf))
acts_last, pred_last, loss_last, acc_last, acts, pred, loss = fns_lstm(C0, h0, xis, yi, tparams1, tparams2)

('unpack', OrderedDict([('Wf', Wf), ('bf', bf), ('Wi', Wi), ('bi', bi), ('WC', WC), ('bC', bC), ('Wo', Wo), ('bo', bo)]), ['Wf', 'bf', 'Wi', 'bi', 'WC', 'bC', 'Wo', 'bo'], <class 'collections.OrderedDict'>, <type 'list'>)
('tparams1', OrderedDict([('Wf', Wf), ('bf', bf), ('Wi', Wi), ('bi', bi), ('WC', WC), ('bC', bC), ('Wo', Wo), ('bo', bo)]))
in sequence_lstm
('tparams', OrderedDict([('Wf', Wf), ('bf', bf), ('Wi', Wi), ('bi', bi), ('WC', WC), ('bC', bC), ('Wo', Wo), ('bo', bo)]))
('unpack', OrderedDict([('Wf', Wf), ('bf', bf), ('Wi', Wi), ('bi', bi), ('WC', WC), ('bC', bC), ('Wo', Wo), ('bo', bo)]), ['Wf', 'bf', 'Wi', 'bi', 'WC', 'bC', 'Wo', 'bo'], <class 'collections.OrderedDict'>, <type 'list'>)
<TensorType(float64, vector)>
xis[t]
('unpack', OrderedDict([('W', W), ('b', b)]), ['W', 'b'], <class 'collections.OrderedDict'>, <type 'list'>)


In [330]:
f=theano.function([xis],acts_last)

In [331]:
f([[0,0,1], [0,1,0]])

array([ 0.,  0.,  0.])

In [269]:
f= theano.function([x], nn_layer(x, tparams2))

('unpack', OrderedDict([('W', W), ('b', b)]), ['W', 'b'], <class 'collections.OrderedDict'>, <type 'list'>)


In [270]:
f([0,1])

array([ 0.,  0.,  0.])

In [271]:
tparams2['W'].get_value()

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

In [272]:
f = theano.function([x], step_lstm(x, C0, h0, tparams1))

('unpack', OrderedDict([('Wf', Wf), ('bf', bf), ('Wi', Wi), ('bi', bi), ('WC', WC), ('bC', bC), ('Wo', Wo), ('bo', bo)]), ['Wf', 'bf', 'Wi', 'bi', 'WC', 'bC', 'Wo', 'bo'], <class 'collections.OrderedDict'>, <type 'list'>)
TensorConstant{(2,) of 0.0}
x


In [273]:
f([0,0,1])

[array([ 0.,  0.,  0.]), array([ 0.,  0.,  0.])]

In [274]:
Wf, bf, Wi, bi, WC, bC, Wo, bo = unpack_params(tparams1, ["Wf", "bf", "Wi", "bi", "WC", "bC", "Wo", "bo"])
h=T.as_tensor_variable(np.zeros(m))
C=T.as_tensor_variable(np.zeros(m))
hx = T.concatenate([h,x],axis=-1) #dimension m+n
f = sigmoid(T.dot(hx, Wf) + bf) #dimension m
i = sigmoid(T.dot(hx, Wi) + bi) #dimension m
C_add = T.tanh(T.dot(hx, WC) + bC) #dimension m
C1 = f * C + i * C_add #dimension m
o = sigmoid(T.dot(hx, Wo) + bo) #dimension m
h1 = o * T.tanh(C1) #dimension m

f1 = theano.function([x], C1)
f2 = theano.function([x], h1)


('unpack', OrderedDict([('Wf', Wf), ('bf', bf), ('Wi', Wi), ('bi', bi), ('WC', WC), ('bC', bC), ('Wo', Wo), ('bo', bo)]), ['Wf', 'bf', 'Wi', 'bi', 'WC', 'bC', 'Wo', 'bo'], <class 'collections.OrderedDict'>, <type 'list'>)


In [275]:
print(hx.eval({x: [0,0,1]}))
o.eval({x: [0,0,1]})

[ 0.  0.  0.  0.  1.]


array([ 0.5,  0.5,  0.5])

In [276]:
f1([0,0,1])

array([ 0.,  0.,  0.])

In [277]:
f2([0,0,1])

array([ 0.,  0.,  0.])

In [333]:
xs = T.dmatrix('xs')
Cval, hval = sequence_lstm(C0, h0, xs, tparams1)

in sequence_lstm
('tparams', OrderedDict([('Wf', Wf), ('bf', bf), ('Wi', Wi), ('bi', bi), ('WC', WC), ('bC', bC), ('Wo', Wo), ('bo', bo)]))
('unpack', OrderedDict([('Wf', Wf), ('bf', bf), ('Wi', Wi), ('bi', bi), ('WC', WC), ('bC', bC), ('Wo', Wo), ('bo', bo)]), ['Wf', 'bf', 'Wi', 'bi', 'WC', 'bC', 'Wo', 'bo'], <class 'collections.OrderedDict'>, <type 'list'>)
<TensorType(float64, vector)>
xs[t]


In [334]:
Cval.eval({xs: [[0,0,1],[0,0,1]]})

array([[ 0.,  0.],
       [ 0.,  0.]])

In [282]:
([C_vals, h_vals], updates) = theano.scan(fn=step_lstm1,
                                          sequences = xs, 
                                          outputs_info=[C0, h0], #initial values of the memory/accumulator
                                          non_sequences=[Wf, bf, Wi, bi, WC, bC, Wo, bo], #fixed parameters
                                          strict=True)

<TensorType(float64, vector)>
xs[t]




In [335]:
Wf, bf, Wi, bi, WC, bC, Wo, bo = unpack_params(tparams1, ["Wf", "bf", "Wi", "bi", "WC", "bC", "Wo", "bo"])
h=T.as_tensor_variable(np.zeros(m))
C=T.as_tensor_variable(np.zeros(m))
hx = T.concatenate([h,x],axis=-1) #dimension m+n
f = sigmoid(T.dot(hx, Wf) + bf) #dimension m
i = sigmoid(T.dot(hx, Wi) + bi) #dimension m
C_add = T.tanh(T.dot(hx, WC) + bC) #dimension m
C1 = f * C + i * C_add #dimension m
o = sigmoid(T.dot(hx, Wo) + bo) #dimension m
h1 = o * T.tanh(C1) #dimension m

f1 = theano.function([x], C1)
f2 = theano.function([x], h1)
f1([0,0,1])
C1.eval({x:[0,0,1]})

('unpack', OrderedDict([('Wf', Wf), ('bf', bf), ('Wi', Wi), ('bi', bi), ('WC', WC), ('bC', bC), ('Wo', Wo), ('bo', bo)]), ['Wf', 'bf', 'Wi', 'bi', 'WC', 'bC', 'Wo', 'bo'], <class 'collections.OrderedDict'>, <type 'list'>)


array([ 0.,  0.])

In [321]:
#def step_lstm1(x, C, h, Wf, bf, Wi, bi, WC, bC, Wo, bo):
print(h)
print(x)
hx = T.concatenate([h,x],axis=-1) #dimension m+n
f = sigmoid(T.dot(hx, Wf) + bf) #dimension m
i = sigmoid(T.dot(hx, Wi) + bi) #dimension m
C_add = T.tanh(T.dot(hx, WC) + bC) #dimension m
C1 = f * C + i * C_add #dimension m
o = sigmoid(T.dot(hx, Wo) + bo) #dimension m
h1 = o * T.tanh(C1) #dimension m
    #return [C1, h1] #dimension 2m (as 2 separate lists)
print(hx.eval({x:[0,0,1]}))  
print(C_add.eval({x:[0,0,1]}))
print(C1.eval({x:[0,0,1]}))
print(h1.eval({x:[0,0,1]}))
WC.eval()

TensorConstant{(2,) of 0.0}
x
[ 0.  0.  0.  0.  1.]
[ 0.  0.  0.]
[ 0.  0.  0.]
[ 0.  0.  0.]


array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

In [287]:
hx = T.concatenate([h0,x])

In [288]:
hx.eval({x:[0,0,1]})

array([ 0.,  0.,  0.,  0.,  1.])

In [293]:
# define a named function, rather than using lambda
def accumulate_by_adding(arange_val, sum_to_date):
    return sum_to_date + arange_val
seq = T.dmatrix("seq")

# An unauthorized implicit downcast from the dtype of 'seq', to that of
# 'T.as_tensor_variable(0)' which is of dtype 'int8' by default would occur
# if this instruction were to be used instead of the next one:
# outputs_info = T.as_tensor_variable(0)

#outputs_info = T.as_tensor_variable(np.asarray(0, seq.dtype))
outputs_info = T.as_tensor_variable(np.zeros(2))
scan_result, scan_updates = theano.scan(fn=accumulate_by_adding,
                                        outputs_info=outputs_info,
                                        sequences=seq)
seqf = theano.function(inputs=[seq], outputs=scan_result)

In [294]:
seqf([[1,2],[4,5]])

array([[ 1.,  2.],
       [ 5.,  7.]])

In [309]:
def accumulate_by_adding(arange_val, sum1, sum2):
    return [sum1 + arange_val, sum2 + 2*arange_val]
seq = T.dvector("seq")

# An unauthorized implicit downcast from the dtype of 'seq', to that of
# 'T.as_tensor_variable(0)' which is of dtype 'int8' by default would occur
# if this instruction were to be used instead of the next one:
# outputs_info = T.as_tensor_variable(0)

#outputs_info = T.as_tensor_variable(np.asarray(0, seq.dtype))
outputs_info = T.as_tensor_variable(np.asarray(0, seq.dtype))
scan_result, scan_updates = theano.scan(fn=accumulate_by_adding,
                                        outputs_info=[outputs_info, outputs_info],
                                        sequences=seq)
seqf = theano.function(inputs=[seq], outputs=scan_result)
seqf([2,3,4])

[array([ 2.,  5.,  9.]), array([  4.,  10.,  18.])]

In [403]:
b=2
m=2
xis = T.dtensor3('xis')
yi = T.dmatrix('yi')
acts_last, pred_last, loss_last, acc_last, acts, pred, loss = fns_multiple_lstm(b,m, xis, yi, (tparams1, tparams2))
print(loss_last.eval({xis: [[[0,0,1],[0,0,1]], [[0,1,0],[0,0,1]]], yi: [[0,0,1],[0,1,0]]}))
acts_last.eval({xis: [[[0,0,1],[0,0,1]]]})
# , yi: [[0,0,1],[0,1,0]]}

in sequence_lstm
('tparams', OrderedDict([('Wf', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bf', array([ 0.,  0.])), ('Wi', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bi', array([ 0.,  0.])), ('WC', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bC', array([ 0.,  0.])), ('Wo', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bo', array([ 0.,  0.]))]))
('unpack', OrderedDict([('Wf', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bf', array([ 0.,  0.])), ('Wi', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bi', array([ 0.,  0.])), ('WC', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bC', array([ 0.,  0.])), ('Wo', a

array([[ 0.,  0.,  0.]])

In [None]:
acts_last.eval({xis: [[[0,0,1],[0,0,1]], [[0,1,0],[0,0,1]]]})

In [383]:
C0 = T.as_tensor_variable(np.zeros((b,m))) #as tensor variable
h0 = T.as_tensor_variable(np.zeros((b,m)))
#evaluate the LSTM on this sequence
#? xis is b*s*n matrix, where n is dim of space of inputs, s is sequence length, and b is number in batch. 
# Check ordering. 
[C_vals, h_vals] = sequence_multiple_lstm(C0, h0, xis, tparams1)

print(C_vals.eval({xis: [[[0,0,1],[0,0,1]],[[0,0,1],[0,0,1]]]}))
print(h_vals.eval({xis: [[[0,0,1],[0,0,1]],[[0,0,1],[0,0,1]]]}))

# feed into the neural net and get vector of activations
# acts is b*s*n matrix (? hope it maps) Does this mean n is the same?
acts = nn_layer(h_vals, tparams2) #ERROR

print(acts.eval({xis: [[[0,0,1],[0,0,1]],[[0,0,1],[0,0,1]]]}))

# prediction is the argmax value. Take argmax along innermost (-1) axis
# pred is b*s vector
pred = T.argmax(acts, axis=-1)

print(pred.eval({xis: [[[0,0,1],[0,0,1]], [[0,0,1],[0,0,1]]]}))

#loss function
# loss is b*s matrix
loss = logloss(acts, yi)

print(loss.eval({xis: [[[0,0,1],[0,0,1]],[[0,0,1],[0,0,1]]], yi: [[0,0,1],[1,0,0]]}))

# acts_last is b*s matrix 
acts_last = acts[:,-1] #ERROR: replace ... with :
print(acts_last.eval({xis: [[[0,0,1],[0,0,1]],[[0,0,1],[0,0,1]]]}))

pred_last = pred[:,-1]
print(pred_last.eval({xis: [[[0,0,1],[0,0,1]],[[0,0,1],[0,0,1]]], yi: [[0,0,1],[1,0,0]]}))

loss_last = loss[:,-1]
print(loss_last.eval({xis: [[[0,0,1],[0,0,1]],[[0,0,1],[0,0,1]]], yi: [[0,0,1],[1,0,0]]}))

#1 if predicted next one correctly, 0 otherwise
#http://stackoverflow.com/questions/33929368/how-to-perform-a-range-on-a-theanos-tensorvariable
print("fns_multiple_lstm")
print(xis.shape)
print(yi)
print(pred_last)
print(xis.shape[0])
# (0,pred_last[0]), (1, pred_last[1]),...
acc_last = yi[T.arange(xis.shape[0]),pred_last]
#http://stackoverflow.com/questions/23435782/numpy-selecting-specific-column-index-per-row-by-using-a-list-of-indexes
return acts_last, pred_last, loss_last, acc_last, acts, pred, loss


in sequence_lstm
('tparams', OrderedDict([('Wf', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bf', array([ 0.,  0.])), ('Wi', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bi', array([ 0.,  0.])), ('WC', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bC', array([ 0.,  0.])), ('Wo', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bo', array([ 0.,  0.]))]))
('unpack', OrderedDict([('Wf', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bf', array([ 0.,  0.])), ('Wi', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bi', array([ 0.,  0.])), ('WC', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bC', array([ 0.,  0.])), ('Wo', a

AssertionError: Theano Assert failed!
Apply node that caused the error: Assert{msg='Theano Assert failed!'}(TensorConstant{0.0}, Elemwise{eq,no_inplace}.0)
Toposort index: 8
Inputs types: [TensorType(float64, scalar), TensorType(int8, scalar)]

HINT: Use another linker then the c linker to have the inputs shapes and strides printed.
HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.
Apply node that caused the error: forall_inplace,cpu,scan_fn}(Shape_i{1}.0, Subtensor{int64:int64:int8}.0, IncSubtensor{InplaceSet;:int64:}.0, IncSubtensor{InplaceSet;:int64:}.0)
Toposort index: 18
Inputs types: [TensorType(int64, scalar), TensorType(float64, 3D), TensorType(float64, 3D), TensorType(float64, 3D)]
Inputs shapes: [(), (2, 2, 3), (2, 2, 2), (1, 2, 2)]
Inputs strides: [(), (24, 48, 8), (32, 16, 8), (32, 16, 8)]
Inputs values: [array(2), 'not shown', 'not shown', array([[[ 0.,  0.],
        [ 0.,  0.]]])]
Outputs clients: [[Subtensor{int64:int64:int8}(forall_inplace,cpu,scan_fn}.0, ScalarFromTensor.0, ScalarFromTensor.0, Constant{1})], []]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

In [372]:
xis2 = xis.dimshuffle([0,2,1])

In [373]:
xis2

DimShuffle{0,2,1}.0

In [376]:
xis.dimshuffle([0,2,1])
xis3 = xis

In [377]:
xis3.eval({xis: [[[0,0,1],[0,0,1]],[[0,1,0],[0,1,0]]]})

array([[[ 0.,  0.,  1.],
        [ 0.,  0.,  1.]],

       [[ 0.,  1.,  0.],
        [ 0.,  1.,  0.]]])

In [378]:
xis2.eval({xis: [[[0,0,1],[0,0,1]],[[0,1,0],[0,1,0]]]})

array([[[ 0.,  0.],
        [ 0.,  0.],
        [ 1.,  1.]],

       [[ 0.,  0.],
        [ 1.,  1.],
        [ 0.,  0.]]])

In [387]:
xis2 = xis.dimshuffle([1,0,2])
xis2.eval({xis: [[[0,0,1],[0,0,1]],[[0,1,0],[0,1,0]]]})

array([[[ 0.,  0.,  1.],
        [ 0.,  1.,  0.]],

       [[ 0.,  0.,  1.],
        [ 0.,  1.,  0.]]])

In [388]:
C0 = T.as_tensor_variable(np.zeros((b,m))) #as tensor variable
h0 = T.as_tensor_variable(np.zeros((b,m)))

Wf, bf, Wi, bi, WC, bC, Wo, bo = unpack_params(tparams1, ["Wf", "bf", "Wi", "bi", "WC", "bC", "Wo", "bo"])
    #the function fn should have arguments in the following order:
    #sequences, outputs_info (accumulators), non_sequences
    #(x, C, h, Wf, bf, Wi, bi, WC, bC, Wo, bo)
([C_vals, h_vals], updates) = theano.scan(fn=step_lstm1,
                                      sequences = xis2, 
                                      outputs_info=[C0, h0], #initial values of the memory/accumulator
                                      non_sequences=[Wf, bf, Wi, bi, WC, bC, Wo, bo]) #fixed parameters
                                      #strict=True)
C_vals.eval({xis: [[[0,0,1],[0,0,1]],[[0,1,0],[0,1,0]]]})

('unpack', OrderedDict([('Wf', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bf', array([ 0.,  0.])), ('Wi', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bi', array([ 0.,  0.])), ('WC', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bC', array([ 0.,  0.])), ('Wo', array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])), ('bo', array([ 0.,  0.]))]), ['Wf', 'bf', 'Wi', 'bi', 'WC', 'bC', 'Wo', 'bo'], <class 'collections.OrderedDict'>, <type 'list'>)
<TensorType(float64, matrix)>
<TensorType(float64, matrix)>


AssertionError: Theano Assert failed!
Apply node that caused the error: Assert{msg='Theano Assert failed!'}(TensorConstant{0.0}, Elemwise{eq,no_inplace}.0)
Toposort index: 8
Inputs types: [TensorType(float64, scalar), TensorType(int8, scalar)]

HINT: Use another linker then the c linker to have the inputs shapes and strides printed.
HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.
Apply node that caused the error: forall_inplace,cpu,scan_fn}(Shape_i{1}.0, Subtensor{int64:int64:int8}.0, IncSubtensor{InplaceSet;:int64:}.0, IncSubtensor{InplaceSet;:int64:}.0)
Toposort index: 18
Inputs types: [TensorType(int64, scalar), TensorType(float64, 3D), TensorType(float64, 3D), TensorType(float64, 3D)]
Inputs shapes: [(), (2, 2, 3), (2, 2, 2), (1, 2, 2)]
Inputs strides: [(), (24, 48, 8), (32, 16, 8), (32, 16, 8)]
Inputs values: [array(2), 'not shown', 'not shown', array([[[ 0.,  0.],
        [ 0.,  0.]]])]
Outputs clients: [[Subtensor{int64:int64:int8}(forall_inplace,cpu,scan_fn}.0, ScalarFromTensor.0, ScalarFromTensor.0, Constant{1})], []]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

In [397]:
#step for matrices
h=h0
C=C0
x=T.dmatrix('x')
print(h)
print(x)
hx = T.concatenate([h,x], axis=-1) #dimension m+n
f = sigmoid(T.dot(hx, Wf) + bf) #dimension m
i = sigmoid(T.dot(hx, Wi) + bi) #dimension m
C_add = T.tanh(T.dot(hx, WC) + bC) #dimension m
C1 = f * C + i * C_add #dimension m
o = sigmoid(T.dot(hx, Wo) + bo) #dimension m
h1 = o * T.tanh(C1) #dimension m
    #return [C1, h1] #dimension 2m (as 2 separate lists)
print(hx.eval({x: [[0,0,1],[0,1,0]]}))  
print(C_add.eval({x: [[0,0,1],[0,1,0]]}))
print(C1.eval({x: [[0,0,1],[0,1,0]]}))
print(h1.eval(({x: [[0,0,1],[0,1,0]]})))
WC.eval()

TensorConstant{(2, 2) of 0.0}
x
[[ 0.  0.  0.  0.  1.]
 [ 0.  0.  0.  1.  0.]]
[[ 0.  0.]
 [ 0.  0.]]
[[ 0.  0.]
 [ 0.  0.]]

INFO (theano.gof.compilelock): Refreshing lock /n/homeserver2/user2a/holdenl/.theano/compiledir_Linux-2.6-el6.x86_64-x86_64-with-redhat-6.7-Pisa-x86_64-2.7.10-64/lock_dir/lock
INFO:theano.gof.compilelock:Refreshing lock /n/homeserver2/user2a/holdenl/.theano/compiledir_Linux-2.6-el6.x86_64-x86_64-with-redhat-6.7-Pisa-x86_64-2.7.10-64/lock_dir/lock



[[ 0.  0.]
 [ 0.  0.]]


AttributeError: 'numpy.ndarray' object has no attribute 'eval'

In [396]:
a1 = T.dmatrix('a1')
b1 = T.dmatrix('b1')
c=T.concatenate([a1,b1], axis =1)
c.eval({a1:[[1],[2]],b1: [[1,2],[3,4]]})

array([[ 1.,  1.,  2.],
       [ 2.,  3.,  4.]])

In [None]:
#it counts outside in.