Commit

Merge pull request #60 from i-machine-think/callback-with-tasks
Callbacks, Tasks, Language Model, and plenty more surprises...
eliabruni committed Feb 3, 2019
2 parents 2b5c30f + 7001a11 commit 4ed6458
Showing 62 changed files with 2,454 additions and 297 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,3 +1,5 @@
*.pt
*.pyc
*.swp
machine/tasks/*/data/
machine/tasks/LongLookupTables/*/data/
2 changes: 0 additions & 2 deletions .travis.yml
@@ -3,7 +3,6 @@ sudo: false
language: python
cache: pip
python:
- "2.7"
- "3.6"

notifications:
@@ -13,7 +12,6 @@ notifications:
install:
- pip install -U pip
- pip -q install -r requirements.txt
- pip install matplotlib

# dev dependencies
- pip install flake8
19 changes: 14 additions & 5 deletions evaluate.py
@@ -10,7 +10,8 @@
from machine.dataset import SourceField, TargetField
from machine.evaluator import Evaluator
from machine.trainer import SupervisedTrainer
from machine.util.checkpoint import Checkpoint
from machine.util import Checkpoint
from machine.util.callbacks import Callback


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -89,13 +90,22 @@ def len_filter(example):
return len(example.src) <= max_len and len(example.tgt) <= max_len


def get_standard_batch_iterator(data, batch_size):
return torchtext.data.BucketIterator(
dataset=data, batch_size=batch_size,
sort=False, sort_within_batch=True,
sort_key=lambda x: len(x.src),
device=device, repeat=False)


# generate test set
test = torchtext.data.TabularDataset(
path=opt.test_data, format='tsv',
fields=tabular_data_fields,
filter_pred=len_filter
)

test_iterator = get_standard_batch_iterator(test, opt.batch_size)
# Prepare loss and metrics
pad = output_vocab.stoi[tgt.pad_token]
losses = [NLLLoss(ignore_index=pad)]
@@ -123,10 +133,9 @@ def len_filter(example):
##########################################################################
# Evaluate model on test set

evaluator = Evaluator(batch_size=opt.batch_size, loss=losses, metrics=metrics)
losses, metrics = evaluator.evaluate(
model=seq2seq, data=test, get_batch_data=data_func)
evaluator = Evaluator(loss=losses, metrics=metrics)
losses, metrics = evaluator.evaluate(seq2seq, test_iterator, data_func)

total_loss, log_msg, _ = SupervisedTrainer.get_losses(losses, metrics, 0)
total_loss, log_msg, _ = Callback.get_losses(losses, metrics, 0)

logging.info(log_msg)
1 change: 1 addition & 0 deletions machine/dataset/__init__.py
@@ -1 +1,2 @@
from .fields import SourceField, TargetField
from .get_standard_iter import get_standard_iter
23 changes: 23 additions & 0 deletions machine/dataset/get_standard_iter.py
@@ -0,0 +1,23 @@
import torch
import torchtext

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def get_standard_iter(data, batch_size=64, device=None):
"""
Helper function to get the batch iter from a torchtext dataset
Args:
data (torchtext Dataset)
batch_size (int, optional)
device (torch.device, optional): if need to force data
to be run on specific device
"""
if device is None:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

return torchtext.data.BucketIterator(
dataset=data, batch_size=batch_size,
sort=False, sort_within_batch=True,
sort_key=lambda x: len(x.src),
device=device, repeat=False)
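
Note: a minimal usage sketch of the new helper (not part of the commit); the TSV path and field wiring are assumptions modelled on evaluate.py:

import torchtext
from machine.dataset import SourceField, TargetField, get_standard_iter

src, tgt = SourceField(), TargetField()
test = torchtext.data.TabularDataset(
    path='data/test.tsv', format='tsv',      # hypothetical path
    fields=[('src', src), ('tgt', tgt)])
src.build_vocab(test)
tgt.build_vocab(test)

# Buckets batches by source length; uses the GPU automatically when available.
test_iterator = get_standard_iter(test, batch_size=32)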
43 changes: 25 additions & 18 deletions machine/evaluator/evaluator.py
@@ -1,4 +1,5 @@
from __future__ import print_function, division
import copy

import torch
import torchtext
@@ -14,16 +15,17 @@ class Evaluator(object):
Args:
loss (machine.loss, optional): loss for evaluator (default: machine.loss.NLLLoss)
batch_size (int, optional): batch size for evaluator (default: 64)
metrics (machine.metrics, optional): metrics for evaluator (default
machine.metrics.WordAccuracy and SequenceAccuracy )
"""

def __init__(self, loss=[NLLLoss()], metrics=[
WordAccuracy(), SequenceAccuracy()], batch_size=64):
WordAccuracy(), SequenceAccuracy()]):
self.losses = loss
self.metrics = metrics
self.batch_size = batch_size

def update_batch_metrics(self, metrics, other, target_variable):
@staticmethod
def update_batch_metrics(metrics, other, target_variable):
"""
Update a list with metrics for current batch.
@@ -67,7 +69,8 @@ def compute_batch_loss(self, decoder_outputs,

return losses

def update_loss(self, losses, decoder_outputs,
@staticmethod
def update_loss(losses, decoder_outputs,
decoder_hidden, other, target_variable):
"""
Update a list with losses for current batch
@@ -88,39 +91,43 @@ def update_loss(self, losses, decoder_outputs,

return losses

def evaluate(self, model, data, get_batch_data):
def evaluate(self, model, data_iterator, get_batch_data):
""" Evaluate a model on given dataset and return performance.
Args:
model (machine.models): model to evaluate
data (machine.dataset.dataset.Dataset): dataset to evaluate against
data_iterator (torchtext.data.Iterator): data iterator to evaluate against
Returns:
loss (float): loss of the given model on the given dataset
accuracy (float): accuracy of the given model on the given dataset
"""
# If the model was in train mode before this method was called, we make sure it still is
# after this method.

# Since we are passing data_iterator
# We evaluate on whole batches - so exhaust all batches first
# and store the initial point
# data_iterator_reset = False
initial_iteration = data_iterator.iterations
if initial_iteration > 1 and initial_iteration != len(data_iterator):
raise Warning("Passed in data_iterator in middle of iterations")

previous_train_mode = model.training
model.eval()

losses = self.losses
for loss in losses:
for loss in self.losses:
loss.reset()
losses = copy.deepcopy(self.losses)

metrics = self.metrics
for metric in metrics:
for metric in self.metrics:
metric.reset()

# create batch iterator
batch_iterator = torchtext.data.BucketIterator(
dataset=data, batch_size=self.batch_size,
sort=True, sort_key=lambda x: len(x.src),
device=device, train=False)
metrics = copy.deepcopy(self.metrics)

# loop over batches
with torch.no_grad():
for batch in batch_iterator:
for batch in data_iterator:

input_variable, input_lengths, target_variable = get_batch_data(
batch)

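
Note: with this refactor the caller builds the batch iterator (e.g. via get_standard_iter) and passes it to evaluate() together with the model and a get_batch_data function, as in evaluate.py above. A condensed sketch (not part of the commit); the import paths for the loss and metrics are assumptions based on the docstring:

from machine.dataset import get_standard_iter
from machine.evaluator import Evaluator
from machine.loss import NLLLoss
from machine.metrics import WordAccuracy, SequenceAccuracy

test_iterator = get_standard_iter(test, batch_size=64)   # 'test' is a torchtext Dataset
evaluator = Evaluator(loss=[NLLLoss()], metrics=[WordAccuracy(), SequenceAccuracy()])
losses, metrics = evaluator.evaluate(model, test_iterator, get_batch_data)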
2 changes: 1 addition & 1 deletion machine/loss/loss.py
@@ -140,7 +140,7 @@ def __init__(self, ignore_index=-1, size_average=True):

super(NLLLoss, self).__init__(
self._NAME, self._SHORTNAME, self._INPUTS, self._TARGETS,
nn.NLLLoss(ignore_index=ignore_index, reduction='elementwise_mean' if size_average else 'sum'))
nn.NLLLoss(ignore_index=ignore_index, reduction='mean' if size_average else 'sum'))

def get_loss(self):
if isinstance(self.acc_loss, int):
19 changes: 15 additions & 4 deletions machine/models/EncoderRNN.py
@@ -45,28 +45,39 @@ def __init__(self, vocab_size, max_len, hidden_size, embedding_size,
self.variable_lengths = variable_lengths
self.embedding = nn.Embedding(vocab_size, embedding_size)
self.rnn = self.rnn_cell(embedding_size, hidden_size, n_layers,
batch_first=True, bidirectional=bidirectional, dropout=dropout_p)
batch_first=True, bidirectional=bidirectional,
dropout=dropout_p)

def forward(self, input_var, input_lengths=None):
def forward(self, input_var, hidden=None, input_lengths=None):
"""
Applies a multi-layer RNN to an input sequence.
Args:
input_var (batch, seq_len): tensor containing the features of the input sequence.
input_lengths (list of int, optional): A list that contains the lengths of sequences
in the mini-batch
**hidden** : Tuple of (h_0, c_0), each of shape (num_layers * num_directions, batch, hidden_size)
where h_0 is a tensor containing the initial hidden state, and c_0 is a tensor
containing the initial cell state for each element in the batch.
If none is provided then defaults to zero
Returns: output, hidden
- **output** (batch, seq_len, hidden_size): variable containing the encoded features of the input sequence
- **hidden** (num_layers * num_directions, batch, hidden_size): variable containing the features in the hidden state h
"""
embedded = self.embedding(input_var)
embedded = self.input_dropout(embedded)

if self.variable_lengths:
embedded = nn.utils.rnn.pack_padded_sequence(
embedded, input_lengths, batch_first=True)
output, hidden = self.rnn(embedded)

if hidden is not None:
output, hidden = self.rnn(embedded, hidden)
else:
output, hidden = self.rnn(embedded)

if self.variable_lengths:
output, _ = nn.utils.rnn.pad_packed_sequence(
output, batch_first=True)

return output, hidden
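
Note: a rough illustration of the new optional hidden argument (not part of the commit). The constructor keyword arguments, including the LSTM cell choice, are assumptions:

import torch

encoder = EncoderRNN(vocab_size=1000, max_len=50, hidden_size=128,
                     embedding_size=128, n_layers=1, rnn_cell='lstm')
tokens = torch.randint(0, 1000, (4, 10))          # (batch, seq_len) of token IDs

output, hidden = encoder(tokens)                  # initial state defaults to zeros
output, hidden = encoder(tokens, hidden=hidden)   # carry the returned state into the next call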
74 changes: 69 additions & 5 deletions machine/models/LanguageModel.py
@@ -1,15 +1,79 @@


from .baseModel import BaseModel
from .EncoderRNN import EncoderRNN

import torch.nn as nn


class LanguageModel(BaseModel):
"""
Implements a language model
Args:
encoder_module (EncoderRNN): Encoder to use
tie_weights (bool, optional): Whether to tie embedding weights to decoder weights
dropout_p_decoder (float, optional): dropout prob of decoder
Inputs: inputs, hidden
- **inputs**: list of sequences, whose length is the batch size and within which each sequence is a list of token IDs.
- **hidden** : Tuple of (h_0, c_0), each of shape (num_layers * num_directions, batch, hidden_size)
where h_0 is a tensor containing the initial hidden state, and c_0 is a tensor
containing the initial cell state for each element in the batch.
Outputs: output, hidden
- **output** (batch, seq_len, hidden_size): tensor containing the decoded features of the input sequence
- **hidden** (num_layers * num_directions, batch, hidden_size): tensor containing the features in the hidden state `h`
"""

def __init__(self, encoder_module, tie_weights=False, dropout_p_decoder=0.5):

super(LanguageModel, self).__init__(encoder_module=encoder_module)

self.decoder_dropout = nn.Dropout(dropout_p_decoder)
self.decoder = nn.Linear(
self.encoder_module.hidden_size, self.encoder_module.vocab_size)

# Optionally tie weights as in:
# "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
# https://arxiv.org/abs/1608.05859
# and
# "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
# https://arxiv.org/abs/1611.01462
if tie_weights:
if self.encoder_module.embedding_size != self.encoder_module.hidden_size:
raise ValueError(
'When using the tied flag, encoder embedding_size must be equal to hidden_size')
self.decoder.weight = self.encoder_module.embedding.weight

self.init_weights()

self.hidden_size = self.encoder_module.hidden_size
self.n_layers = self.encoder_module.n_layers

def flatten_parameters(self):
raise NotImplementedError("Function should be implemented")
"""
Flatten parameters of all recurrent components in the model.
"""
self.encoder_module.rnn.flatten_parameters()

def init_weights(self):
"""
Standard weight initialization
"""
initrange = 0.1
self.encoder_module.embedding.weight.data.uniform_(
-initrange, initrange)
self.decoder.bias.data.zero_()
self.decoder.weight.data.uniform_(-initrange, initrange)

def forward(self, input, hidden):
output, hidden = self.encoder_module(input, hidden=hidden)
output = self.decoder_dropout(output)
decoded = self.decoder(output.contiguous().view(-1, output.size(2)))

return decoded.view(output.size(0), output.size(1),
decoded.size(1)), hidden

def forward(self, inputs, input_lengths=None):
raise NotImplementedError("Language model should be implemented")
def init_hidden(self, batch_size):
weight = next(self.parameters())
return (weight.new_zeros(self.n_layers, batch_size, self.hidden_size),
weight.new_zeros(self.n_layers, batch_size, self.hidden_size))
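
Note: a minimal sketch of driving the new LanguageModel (not part of the commit). It assumes EncoderRNN is importable from machine.models and accepts these keyword arguments; embedding_size must equal hidden_size because of the tie_weights check above:

import torch
from machine.models import EncoderRNN, LanguageModel

encoder = EncoderRNN(vocab_size=1000, max_len=50, hidden_size=128,
                     embedding_size=128, rnn_cell='lstm')
lm = LanguageModel(encoder, tie_weights=True)

batch = torch.randint(0, 1000, (4, 10))   # (batch, seq_len) of token IDs
hidden = lm.init_hidden(batch_size=4)     # zero (h_0, c_0) pair sized for the encoder
logits, hidden = lm(batch, hidden)        # logits: (batch, seq_len, vocab_size)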
34 changes: 19 additions & 15 deletions machine/models/TopKDecoder.py
@@ -297,8 +297,8 @@ def _backtrack(self, nw_output, nw_hidden, predecessors,
# Indices of the EOS symbol for both variables
# with b*k as the first dimension, and b, k for
# the first two dimensions
idx = eos_indices[i]
b_idx = int(idx[0] / self.k)
idx = eos_indices[i].item()
b_idx = int(idx / self.k)
# The indices of the replacing position
# according to the replacement strategy noted above
res_k_idx = self.k - (batch_eos_found[b_idx] % self.k) - 1
@@ -307,21 +307,25 @@

# Replace the old information in return variables
# with the new ended sequence information
t_predecessors[res_idx] = predecessors[t][idx[0]]
current_output[res_idx, :] = nw_output[t][idx[0], :]

# TODO: Check this still works (this if was added for
# torch 1.0 but might have unforeseen consequences)
if t_predecessors.dim() > 0:
t_predecessors[res_idx] = predecessors[t][idx]
else:
t_predecessors = predecessors[t][idx]

current_output[res_idx, :] = nw_output[t][idx, :]
if lstm:
current_hidden[0][:, res_idx,
:] = nw_hidden[t][0][:, idx[0], :]
current_hidden[1][:, res_idx,
:] = nw_hidden[t][1][:, idx[0], :]
h_n[0][:, res_idx, :] = nw_hidden[t][0][:, idx[0], :].data
h_n[1][:, res_idx, :] = nw_hidden[t][1][:, idx[0], :].data
current_hidden[0][:, res_idx, :] = nw_hidden[t][0][:, idx, :]
current_hidden[1][:, res_idx, :] = nw_hidden[t][1][:, idx, :]
h_n[0][:, res_idx, :] = nw_hidden[t][0][:, idx, :].data
h_n[1][:, res_idx, :] = nw_hidden[t][1][:, idx, :].data
else:
current_hidden[:, res_idx,
:] = nw_hidden[t][:, idx[0], :]
h_n[:, res_idx, :] = nw_hidden[t][:, idx[0], :].data
current_symbol[res_idx, :] = symbols[t][idx[0]]
s[b_idx, res_k_idx] = scores[t][idx[0]].item()
current_hidden[:, res_idx, :] = nw_hidden[t][:, idx, :]
h_n[:, res_idx, :] = nw_hidden[t][:, idx, :].data
current_symbol[res_idx, :] = symbols[t][idx]
s[b_idx, res_k_idx] = scores[t][idx].item()
l[b_idx][res_k_idx] = t + 1

# record the back tracked results
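
Note: the switch from idx[0] to idx.item() reflects PyTorch 1.0 behaviour, where indexing a tensor of indices returns a tensor rather than a plain int. A small standalone illustration (the nonzero() setup mirrors how eos_indices is built elsewhere in this method, which is an assumption here):

import torch

symbols = torch.tensor([2, 0, 7, 0])
eos_indices = (symbols == 0).nonzero()   # shape (2, 1): positions where EOS (id 0) occurs
idx = eos_indices[0].item()              # plain Python int instead of a one-element tensor
b_idx = int(idx / 4)                     # analogous to int(idx / self.k) above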
1 change: 1 addition & 0 deletions machine/models/__init__.py
@@ -3,3 +3,4 @@
from .TopKDecoder import TopKDecoder
from .seq2seq import Seq2seq
from .baseModel import BaseModel
from .LanguageModel import LanguageModel
2 changes: 1 addition & 1 deletion machine/models/seq2seq.py
@@ -25,7 +25,7 @@ def forward(self, inputs, input_lengths=None, targets={},
target_output = targets.get('decoder_output', None)

encoder_outputs, encoder_hidden = self.encoder_module(
inputs, input_lengths)
inputs, input_lengths=input_lengths)
result = self.decoder_module(inputs=target_output,
encoder_hidden=encoder_hidden,
encoder_outputs=encoder_outputs,
