## Data Prep

In [1]:
from collections import Counter
import string

# Tokens that count as punctuation: every character of string.punctuation
# plus three two-character tokens (double backtick, double apostrophe,
# double double-quote).
punctuation_symbols = set(string.punctuation) | {"``", "''", '""'}

def preprocess_clean(text):
    """Return the tokens of `text` lower-cased, with punctuation tokens dropped.

    `text` is iterated token by token. A token is skipped when it is a member
    of the module-level `punctuation_symbols` set; the membership test is made
    on the token as given (before lower-casing). The result is a new list.
    """
    kept = []
    for token in text:
        if token in punctuation_symbols:
            continue
        kept.append(token.lower())
    return kept

def build_word_id_dict(word_seq, cutoff=5):
    """Assign consecutive integer ids to the words that occur often enough.

    Parameters
    ----------
    word_seq : iterable of hashable
        The words to count.
    cutoff : int
        Minimum number of occurrences a word needs to receive an id.

    Returns
    -------
    dict
        Maps each word with at least `cutoff` occurrences to an id. Ids start
        at 0 and follow the iteration order of the Counter built from
        `word_seq`; words below the cutoff are absent from the dict.
    """
    frequencies = Counter(word_seq)
    word_to_id = {}
    for word, occurrences in frequencies.items():
        if occurrences >= cutoff:
            # The next free id equals the number of words accepted so far.
            word_to_id[word] = len(word_to_id)
    return word_to_id

def convert_to_ids(word_seq, word_id_dict):
    """Translate a sequence of words into a numpy array of their ids.

    Every word is looked up in `word_id_dict` with plain subscripting, so a
    word that has no entry raises KeyError. The ids are returned in the same
    order as the words, wrapped in a numpy array.
    """
    ids = []
    for word in word_seq:
        ids.append(word_id_dict[word])
    return np.array(ids)

In [28]:
import numpy as np
from fuel.datasets import Dataset

class W2VecDataset(Dataset):
    """Fuel dataset that serves (context window, target) pairs from a sequence.

    For a position ``i`` the target is ``indexable[i]`` and the context is made
    of the ``context_len`` items before and the ``context_len`` items after
    ``i`` (``i`` itself is excluded), i.e. ``2 * context_len`` items.
    """

    def __init__(self, sources, indexable, context_len):
        """Store the sequence and window size.

        Parameters
        ----------
        sources : sequence of str
            Exactly two source names. `get_data` returns ``(contexts,
            targets)`` in that order, so the names should follow it.
        indexable : numpy array
            The sequence to draw windows from. It must support indexing with
            an integer array (see `_get_items`).
        context_len : int
            Number of items taken on each side of a target.

        Raises
        ------
        ValueError
            If `sources` does not contain exactly two names.
        """
        # get_data always produces two arrays, so any other number of sources
        # cannot be served. The original check (`< 2`) let longer tuples
        # through and reported the opposite condition in its message.
        if len(sources) != 2:
            raise ValueError(
                "W2VecDataset needs exactly 2 sources (contexts, targets), "
                "got {0}".format(len(sources)))

        self.provides_sources = sources
        self.indexable = indexable
        self.N = context_len
        self.axis_labels = None

    @property
    def example_indices(self):
        """Positions that have a full window on both sides, as a list."""
        return list(range(self.N, len(self.indexable) - self.N))

    @property
    def num_examples(self):
        """Number of positions returned by `example_indices`."""
        return len(self.example_indices)

    def get_data(self, state=None, request=None):
        """Return ``(contexts, targets)`` arrays for the requested positions.

        Parameters
        ----------
        state : None
            Must be None; any other value raises ValueError.
        request : iterable of int
            Positions to fetch; must not be None.

        Returns
        -------
        tuple of two numpy arrays
            The stacked context windows and the matching targets, one entry
            per requested position.
        """
        if state is not None or request is None:
            raise ValueError

        by_item = map(self._get_items, request)
        # Transpose [(context, target), ...] into (contexts, targets).
        contexts, targets = tuple(zip(*by_item))
        return (np.array(contexts), np.array(targets))

    def _get_items(self, index):
        """Return ``(context_window, target)`` for a single position."""
        window = np.arange(index - self.N, index + self.N + 1)
        # Drop the centre position so the window holds only the neighbours.
        window = window[window != index]
        # NOTE(review): assuming `indexable` is a numpy array, negative
        # positions wrap around instead of raising, so an index below N would
        # silently read from the end. `example_indices` never yields one.
        try:
            return (self.indexable[window], self.indexable[index])
        except IndexError:
            # Re-raise with the offending positions to ease debugging.
            raise IndexError("{0}, {1}".format(str(window), str(index)))

def make_w2vec_dataset(indexable_seq, context_len=2):
    """Wrap a sequence in a W2VecDataset with 'contexts' and 'targets' sources.

    Parameters
    ----------
    indexable_seq : numpy array
        The sequence handed to W2VecDataset as its `indexable`.
    context_len : int, optional
        Number of items taken on each side of a target. Defaults to 2, the
        value that was previously hard-coded here.

    Returns
    -------
    W2VecDataset
    """
    return W2VecDataset(('contexts', 'targets'), indexable_seq, context_len)

In [11]:
from nltk.corpus import brown

In [12]:
# Number of leading tokens taken from the Brown corpus for this experiment.
corp_len = 10000
# Split point used further down on the id sequence (not on the raw tokens):
# ids before it form the training set, the remaining ids the test set.
training_len = 4600
wrds = brown.words()[:corp_len]

In [13]:
# Lower-case the tokens and drop the ones listed in punctuation_symbols.
clean_words = preprocess_clean(wrds)

# Word -> id mapping, built with the default frequency cutoff.
vocab = build_word_id_dict(clean_words)

# Keep only in-vocabulary words so that every lookup below succeeds.
filtered_words = [w for w in clean_words if w in vocab]

# The corpus as a numpy array of vocabulary ids.
word_ids = convert_to_ids(filtered_words, vocab)

In [29]:
# Contiguous split of the id sequence: the first training_len ids are used for
# training, everything after them for testing.
training_dataset = make_w2vec_dataset(word_ids[:training_len])
test_dataset = make_w2vec_dataset(word_ids[training_len:])

In [24]:
# Sanity check: number of ids that ended up in the test slice.
len(test_dataset.indexable)

1081

In [26]:
# Sanity check: the last usable position should be len(indexable) - N - 1.
test_dataset.example_indices[-1]

1078

In [19]:
# Peek at the last three ids of the training slice.
training_dataset.indexable[4597:]

array([130,  80, 248])

In [16]:
# Sanity check: the training slice should hold training_len ids.
len(training_dataset.indexable)

4600

In [27]:
# Scratch check of how str() renders a numpy array (the format that ends up in
# the IndexError message raised by W2VecDataset._get_items).
str(np.array([1,2,3]))

'[1 2 3]'

## Network Definition

**TODO:** Revisit regularization — how does it work for this network?


In [7]:
# Theano stuff
from theano import tensor as T

# All sorts of bricks
from blocks.bricks.lookup import LookupTable
from blocks.bricks import Linear, Softmax
from blocks.graph import ComputationGraph
from blocks.initialization import IsotropicGaussian, Constant
from blocks.algorithms import GradientDescent, Scale
from blocks.bricks.cost import CategoricalCrossEntropy, MisclassificationRate
# Data Streams and monitoring
from fuel.streams import DataStream
from fuel.schemes import SequentialScheme, SequentialExampleScheme
from blocks.extensions.monitoring import DataStreamMonitoring
# Main Loop
from blocks.main_loop import MainLoop
from blocks.extensions import FinishAfter, Printing
from blocks_extras.extensions.plot import Plot

In [31]:
# Width of the projection (embedding) layer.
hidden_size = 100
# One embedding row and one output unit per vocabulary word.
vocab_size = len(vocab)

# Network layers
# Not sure I should keep this as input?
# Integer matrix of context word ids; the variable name matches the
# 'contexts' source of the datasets built above.
input_layer = T.imatrix('contexts')
# Embedding lookup: one hidden_size-dimensional vector per word id.
input_to_projection = LookupTable(vocab_size,
                                    hidden_size,
                                    weights_init=IsotropicGaussian(0.01),
                                    biases_init=Constant(0),
                                    name="projection")
# Average the looked-up embeddings over axis 1 (presumably the context
# positions of each example -- TODO confirm the lookup output layout).
projection_layer = T.mean(input_to_projection.apply(input_layer), axis=1)
projection_layer.name = 'projection'
# Linear map from the averaged projection to one score per vocabulary word.
projection_to_ouput = Linear(name='output',
                              weights_init=IsotropicGaussian(0.01),
                              biases_init=Constant(0),
                              input_dim=hidden_size,
                              output_dim=vocab_size)
# Softmax over the scores to get a distribution over the vocabulary.
probs = Softmax().apply(projection_to_ouput.apply(projection_layer))

# Cost Function, Graph
# Integer vector of target word ids; the name matches the 'targets' source.
true_targets = T.ivector('targets')
cost = CategoricalCrossEntropy(name='x_entropy_cost').apply(true_targets, probs)
graph = ComputationGraph(cost)

# Other metrics
# not sure this will work...
# NOTE(review): the argument order here is the reverse of the one used for the
# cost above -- check the expected order before enabling.
# error_rate = MisclassificationRate().apply(probs, true_targets)

# Parameter Initialization
# Idea: annotate layers that need initialization and select them
input_to_projection.initialize()
projection_to_ouput.initialize()

# Cost optimization
# Gradient descent on all graph parameters with a fixed 0.025 learning rate.
optimizer = GradientDescent(cost=cost, parameters=graph.parameters,
                            step_rule=Scale(learning_rate=0.025))

# Data Streams
# Both streams walk the valid example positions in order, 20 per batch.
training_stream = DataStream.default_stream(training_dataset,
                                            iteration_scheme=SequentialScheme(training_dataset.example_indices, batch_size=20))
test_stream = DataStream.default_stream(test_dataset,
                                        iteration_scheme=SequentialScheme(test_dataset.example_indices, batch_size=20))
# Monitoring
# Evaluates the cost on the test stream; channels are logged with a "test" prefix.
monitor = DataStreamMonitoring(variables=[cost], 
                               data_stream=test_stream, prefix="test")

# Main Loop
# Train for a single epoch, printing the log as it goes. Plotting is disabled.
main_loop = MainLoop(data_stream=training_stream, algorithm=optimizer,
                     extensions=[monitor,
                                     FinishAfter(after_n_epochs=1),
                                     Printing(),
#                                      Plot("Example Plot", channels=[['test_cost_simple_xentropy', "test_error_rate"]])
                                ])
main_loop.run()


-------------------------------------------------------------------------------
BEFORE FIRST EPOCH
-------------------------------------------------------------------------------
Training status:
	 batch_interrupt_received: False
	 epoch_interrupt_received: False
	 epoch_started: True
	 epochs_done: 0
	 iterations_done: 0
	 received_first_batch: False
	 resumed_from: None
	 training_started: True
Log records from the iteration 0:
	 test_x_entropy_cost_apply_cost: 5.717001761934335


-------------------------------------------------------------------------------
AFTER ANOTHER EPOCH
-------------------------------------------------------------------------------
Training status:
	 batch_interrupt_received: False
	 epoch_interrupt_received: False
	 epoch_started: False
	 epochs_done: 1
	 iterations_done: 230
	 received_first_batch: True
	 resumed_from: None
	 training_started: True
Log records from the iteration 230:
	 test_x_entropy_cost_apply_cost: 5.587235289088982
	 training_finish_re