In [7]:
%autoreload 2

In [13]:
# to make life easier, import all the needed stuff from workspace
# With autoreload enabled this also has the added benefit of making imported functionality
# available to the notebook without the need to rerun the importing cell.
from workspace import *
# Below line is supposed to be `from fuel.transformers.sequences import NGrams`...
# But it doesn't work :(
from contrib.sequences import NGrams

## Data Prep

In [2]:
from itertools import chain
from collections import Counter
import numpy as np
from fuel.datasets import Dataset


def build_word_id_dict(word_seq, cutoff=5):
    counts = Counter(word_seq)
    frequent_enough = (word for word in counts if counts[word] >= cutoff)
    return dict((word, w_id + 1) for w_id, word in enumerate(frequent_enough))

def convert_to_ids(word_seq, word_id_dict):
    return np.array([word_id_dict[word] for word in word_seq])

def words_to_ids(vocab):
    def converter(word_seq):
        return convert_to_ids(word_seq, vocab)
    return converter

def nltk_corpus_ids(nltk_corpus):
    corpus_sents = [" ".join(sent) for sent in nltk_corpus]
    vocab = build_word_id_dict(chain(*corpus_sents), cutoff=1)
    w2id = words_to_ids(vocab)    
    return (np.array(list(map(w2id, corpus_sents))), vocab)

In [3]:
from nltk.corpus import brown
brown_sents = brown.sents()

brown_char_ids, brown_char_vocab = nltk_corpus_ids(brown_sents)

In [4]:
class NgramDataset(Dataset):  
    
    def __init__(self, indexable):
#         if len(sources) < 2:
#             raise ValueError("Can't handle more than 2 sources atm")
        
        self.provides_sources = ("inputs", "targets")
#         self.provides_sources = ("inputs",)
        # must be a numpy array
        self.indexable = indexable
#         self.vocab_size = vocab_size
        
        # needed for some technical reason
        self.axis_labels = None        
    
    def get_data(self, state=None, request=None):
        if state is not None or request is None:
            raise ValueError
        
        requested_items = []
        for indx in request:
            sent = self.indexable[indx]
            ngrams = ((sent[i], sent[i + 1]) for i in range(len(sent) - 1))
            item = tuple(map(np.array, tuple(zip(*ngrams))))
            requested_items.append(item)
        return np.array(requested_items, ndmin=2).swapaxes(1, 0)#.transpose((2, 0, 1))
#         if isinstance(request, int):
#             return np.array([self.indexable[request]], ndmin=2)
#         return np.array([self.indexable[i] for i in request], ndmin=2)
    
    def _ngram(index):
        return (self.indexable[index], self.indexable[index + 1])
    
    @property
    def num_examples(self):
        return len(self.indexable)
    
#     def _to_1_hot(self, char_id):
        # use this to turn an intex (3rd arg) into a one-hot vector
        # the length of which is defined by the second arg
#         return np.eye(1, self.vocab_size, char_id, dtype=np.int_)[0]

from fuel.datasets import IndexableDataset
def ngram_dataset(indexable):
    return NgramDataset(indexable)
#     ds.sources = ('inputs',)
#     return ds

In [5]:
train_len = 40000
brown_train_dataset = ngram_dataset(brown_char_ids[:train_len])
brown_test_dataset = ngram_dataset(brown_char_ids[train_len:])

In [6]:
def transposer(data):
#     data is a tuple, since it's expected to come from the padding transformer
    return tuple(np.swapaxes(item, 0, 1) for item in data)

In [7]:
def char_stream(dataset):
    scheme = SequentialExampleScheme(dataset.num_examples)
    batch_scheme = SequentialScheme(dataset.num_examples, batch_size=15)
    just_stream = DataStream.default_stream(dataset, iteration_scheme=batch_scheme)
#     ngrams = NGrams(1, just_stream)
#     return ngrams
#     return Padding(Batch(ngrams, batch_scheme))
    padded = Padding(just_stream)#, mask_sources=('inputs',))
    return padded
#     return Mapping(padded, transposer)
#     return just_stream

In [8]:
brown_train_stream = char_stream(brown_train_dataset)
brown_test_stream = char_stream(brown_test_dataset)

In [9]:
# n = brown_train_stream.get_epoch_iterator()
# x = brown_train_stream.get_data(request=[0,2,3])
# x = brown_train_stream.get_data()

## Model Definition

In [10]:
HIDDEN_SIZE = 100
VOCAB_SIZE = len(brown_char_vocab)

In [21]:
inputs = tensor.lmatrix('inputs')
states = tensor.dmatrix("h_0")
input_mask = tensor.dmatrix('inputs_mask')
true_targets = tensor.lmatrix("targets")

In [29]:
layers = [
    LookupTable(dim=HIDDEN_SIZE, length=VOCAB_SIZE, weights_init=IsotropicGaussian(0.01),name='2vec'),
    Bias(HIDDEN_SIZE,
         name="input_bias",
         biases_init=Constant(0)),
    SimpleRecurrent(dim=HIDDEN_SIZE,
                    activation=Rectifier(),
                    name="recurrent",
                    weights_init=IsotropicGaussian(0.01)),
    Linear(HIDDEN_SIZE, VOCAB_SIZE,
           weights_init=IsotropicGaussian(0.01),
           biases_init=Constant(0),
           name="h2o"),
#     NDimensionalSoftmax(name="softmax"),
]

seq_brick = Sequence([l.apply for l in layers])
seq_brick.initialize()
linear_output = seq_brick.apply(inputs)

In [23]:
# input_layer = input_transition.apply(one_hot.apply(inputs))
# # input_layer = one_hot.apply(inputs)
# hidden_layer = recurrent_transition.apply(input_layer)
# output_layer = output_non_linear.apply(output_transition.apply(hidden_layer), extra_ndim=1)

In [30]:
def softmax_3d(linear_output):
    softmax = NDimensionalSoftmax(name="softmax")
    raw_cce = softmax.categorical_cross_entropy(true_targets, linear_output, extra_ndim=1)
    drop_missing = raw_cce * input_mask
    return drop_missing.sum() / input_mask.sum()

In [31]:
cost = softmax_3d(linear_output)

In [32]:
graph = ComputationGraph(cost)

In [33]:
# Cost optimization
optimizer = GradientDescent(cost=cost, parameters=graph.parameters,
                            step_rule=Adam())

In [34]:
# Data Streams
# training_stream = DataStream.default_stream(brown_train_dataset,
#                                             iteration_scheme=SequentialScheme(range(len(brown_train_dataset.indexable)-1), batch_size=50000))
# test_stream = DataStream.default_stream(brown_test_dataset,
#                                         iteration_scheme=SequentialScheme(range(len(brown_test_dataset.indexable)-1), batch_size=100000))
# Monitoring
monitor = DataStreamMonitoring(variables=[cost], 
                               data_stream=brown_test_stream, prefix="test")

# Main Loop
main_loop = MainLoop(data_stream=brown_train_stream, algorithm=optimizer,
                     extensions=[monitor,
                                     FinishAfter(after_n_epochs=1),
                                     Printing(),
#                                      Plot("Example Plot", channels=[['test_cost_simple_xentropy', "test_error_rate"]])
                                ])
main_loop.run()

IndexError: index 84 is out of bounds for size 84
Apply node that caused the error: AdvancedSubtensor1(W, Reshape{1}.0)
Toposort index: 23
Inputs types: [TensorType(float64, matrix), TensorType(int64, vector)]
Inputs shapes: [(84, 100), (2190,)]
Inputs strides: [(800, 8), (8,)]
Inputs values: ['not shown', 'not shown']
Outputs clients: [[Reshape{3}(AdvancedSubtensor1.0, MakeVector{dtype='int64'}.0)]]

Backtrace when the node is created(use Theano flag traceback.limit=N to make it longer):
  File "/home/quickbeam/ilia_dev/python/miniconda/envs/py3-deep-learning/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2885, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-29-625dae8f21ac>", line 19, in <module>
    linear_output = seq_brick.apply(inputs)
  File "/home/quickbeam/ilia_dev/python/miniconda/envs/py3-deep-learning/lib/python3.5/site-packages/blocks/bricks/base.py", line 369, in __call__
    return self.application.apply(self, *args, **kwargs)
  File "/home/quickbeam/ilia_dev/python/miniconda/envs/py3-deep-learning/lib/python3.5/site-packages/blocks/bricks/base.py", line 304, in apply
    outputs = self.application_function(brick, *args, **kwargs)
  File "/home/quickbeam/ilia_dev/python/miniconda/envs/py3-deep-learning/lib/python3.5/site-packages/blocks/bricks/sequences.py", line 35, in apply
    output = application_method(*pack(child_input))
  File "/home/quickbeam/ilia_dev/python/miniconda/envs/py3-deep-learning/lib/python3.5/site-packages/blocks/bricks/base.py", line 369, in __call__
    return self.application.apply(self, *args, **kwargs)
  File "/home/quickbeam/ilia_dev/python/miniconda/envs/py3-deep-learning/lib/python3.5/site-packages/blocks/bricks/base.py", line 304, in apply
    outputs = self.application_function(brick, *args, **kwargs)
  File "/home/quickbeam/ilia_dev/python/miniconda/envs/py3-deep-learning/lib/python3.5/site-packages/blocks/bricks/lookup.py", line 68, in apply
    return self.W[indices.flatten()].reshape(output_shape)

HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

Original exception:
	IndexError: index 84 is out of bounds for size 84
Apply node that caused the error: AdvancedSubtensor1(W, Reshape{1}.0)
Toposort index: 23
Inputs types: [TensorType(float64, matrix), TensorType(int64, vector)]
Inputs shapes: [(84, 100), (2190,)]
Inputs strides: [(800, 8), (8,)]
Inputs values: ['not shown', 'not shown']
Outputs clients: [[Reshape{3}(AdvancedSubtensor1.0, MakeVector{dtype='int64'}.0)]]

Backtrace when the node is created(use Theano flag traceback.limit=N to make it longer):
  File "/home/quickbeam/ilia_dev/python/miniconda/envs/py3-deep-learning/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2885, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-29-625dae8f21ac>", line 19, in <module>
    linear_output = seq_brick.apply(inputs)
  File "/home/quickbeam/ilia_dev/python/miniconda/envs/py3-deep-learning/lib/python3.5/site-packages/blocks/bricks/base.py", line 369, in __call__
    return self.application.apply(self, *args, **kwargs)
  File "/home/quickbeam/ilia_dev/python/miniconda/envs/py3-deep-learning/lib/python3.5/site-packages/blocks/bricks/base.py", line 304, in apply
    outputs = self.application_function(brick, *args, **kwargs)
  File "/home/quickbeam/ilia_dev/python/miniconda/envs/py3-deep-learning/lib/python3.5/site-packages/blocks/bricks/sequences.py", line 35, in apply
    output = application_method(*pack(child_input))
  File "/home/quickbeam/ilia_dev/python/miniconda/envs/py3-deep-learning/lib/python3.5/site-packages/blocks/bricks/base.py", line 369, in __call__
    return self.application.apply(self, *args, **kwargs)
  File "/home/quickbeam/ilia_dev/python/miniconda/envs/py3-deep-learning/lib/python3.5/site-packages/blocks/bricks/base.py", line 304, in apply
    outputs = self.application_function(brick, *args, **kwargs)
  File "/home/quickbeam/ilia_dev/python/miniconda/envs/py3-deep-learning/lib/python3.5/site-packages/blocks/bricks/lookup.py", line 68, in apply
    return self.W[indices.flatten()].reshape(output_shape)

HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

## Playing with Main Loops

In [11]:
from word_embeddings import main_loop

In [3]:
main_loop.run()


-------------------------------------------------------------------------------
BEFORE FIRST EPOCH
-------------------------------------------------------------------------------
Training status:
	 batch_interrupt_received: False
	 epoch_interrupt_received: False
	 epoch_started: True
	 epochs_done: 0
	 iterations_done: 0
	 received_first_batch: False
	 resumed_from: None
	 training_started: True
Log records from the iteration 0:
	 test_simple_entropy_apply_cost: 5.717057105039682


-------------------------------------------------------------------------------
AFTER ANOTHER EPOCH
-------------------------------------------------------------------------------
Training status:
	 batch_interrupt_received: False
	 epoch_interrupt_received: False
	 epoch_started: False
	 epochs_done: 1
	 iterations_done: 23
	 received_first_batch: True
	 resumed_from: None
	 training_started: True
Log records from the iteration 23:
	 test_simple_entropy_apply_cost: 5.430238805431428
	 training_finish_requ