Merge pull request #61 from gautierdag/docs
BUG + DOC: Updated documentation and changed train_model to use latest
dieuwkehupkes committed Feb 11, 2019
2 parents fc20a1c + 3e4b8b9 commit feb7883
Showing 16 changed files with 87 additions and 76 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -29,5 +29,5 @@ script:
# Unit test
- python -m unittest discover -v
- coverage run -m unittest discover
- coverage report -m
- coverage report --skip-covered --omit '*/virtualenv/*'
- sh integration_test.sh
32 changes: 23 additions & 9 deletions README.md
@@ -4,19 +4,19 @@

# Introduction

This is a pytorch implementation of a sequence to sequence learning toolkit for the i-machine-think project. This repository is a fork from the pytorch-seq2seq library developed by IBM, but has substantially diverged from it after heavy development. For the original implementation, visit [https://github.com/IBM/pytorch-seq2seq](https://github.com/IBM/pytorch-seq2seq).
This is a pytorch implementation of a sequence to sequence learning toolkit for the i-machine-think project. This repository was originally a fork from the pytorch-seq2seq library developed by IBM, but has substantially diverged from it after heavy development. For the original implementation, visit [https://github.com/IBM/pytorch-seq2seq](https://github.com/IBM/pytorch-seq2seq).

# Requirements

This library runs with PyTorch 0.4.0. We refer to the [PyTorch website](http://pytorch.org/) to install the right version for your environment.
This library runs with PyTorch 1.0.0. We refer to the [PyTorch website](http://pytorch.org/) to install the right version for your environment.
To install additional requirements (including numpy and torchtext), run:

`pip install -r requirements.txt`
`pip3 install -r requirements.txt`


# Quickstart

To use machine, clone the repository and do an editable install with pip.
Although machine is compatible with python2.7, we recommend you use python3.
To use machine, clone the repository and do an editable install with pip. Note that `machine` requires python 3.5 or higher (python 2.7 is no longer supported).

```
git clone https://github.com/i-machine-think/machine.git $path_to_machine
@@ -35,11 +35,25 @@ Furthermore, we included 3 commandline tools in the repository that demonstrate

The script `train_model.py` can be used to train a new model, resume the training of an existing model from a checkpoint, or retrain an existing model from a checkpoint. E.g. to train a model from scratch:

# Train a simple model with hidden layer size 128 and embedding size 128
`python train_model.py --train $train_path --dev $dev_path --output_dir $expt_dir --embedding_size 128 --hidden_size 256 --rnn_cell gru --epoch 20
```
# Train a simple model with hidden layer size 128 and embedding size 128
python train_model.py --train $train_path --dev $dev_path --output_dir $expt_dir --embedding_size 128 --hidden_size 256 --rnn_cell gru --epoch 20
```
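
Resuming or retraining from a checkpoint uses the same script. A minimal sketch, assuming a checkpoint has already been written to `$expt_dir` (the flags follow `integration_test.sh` in this repository; use the script's `--help` to confirm them for your version):

```
# Resume training from the most recent checkpoint in $expt_dir
python train_model.py --train $train_path --dev $dev_path --output_dir $expt_dir --resume-training --load_checkpoint $(ls -t $expt_dir/ | head -1) --epoch 20

# Retrain from an existing checkpoint without resuming the old training state
python train_model.py --train $train_path --dev $dev_path --output_dir $expt_dir --load_checkpoint $(ls -t $expt_dir/ | head -1) --epoch 20
```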

Several options are available from the command line, including changing the optimizer, batch size, using attention/bidirectionality and using teacher forcing. For a complete overview, use the *help* function of the script.

### Tasks

Some tasks come included in machine (they will download automatically when called). Task objects can be thought of as wrappers for the metadata of datasets (filenames etc.). You can import a task such as the lookup task by doing the following:

```
from machine.tasks import get_task
T = get_task("lookup", is_mini=True)
```

Please see `machine.tasks` for more info on how this works. Default parameters for tasks can be set in `.yaml` files, which allows for quick replication of results and tracking of changes. These parameters are loaded into the task object, so you can call `T.default_params['name_of_parameter_set']`, where the name of the parameter set depends on what is in the `.yaml` file. This allows different default parameter sets for the same task.
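
As a rough illustration, assuming `default_params` behaves like a dictionary keyed by parameter-set name (the key `'baseline'` below is hypothetical and depends on the task's `.yaml` file):

```
from machine.tasks import get_task

# Load the mini lookup task; the data is downloaded automatically if needed
T = get_task("lookup", is_mini=True)

# List the parameter sets defined in the task's .yaml file
print(list(T.default_params))

# 'baseline' is a hypothetical name; pick one of the names printed above
params = T.default_params['baseline']
```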

Several options are available from the command line, including changing the optimizer, batch size, using attention/bidirectionality and using teacher forcing.
For a complete overview, use the *help* function of the script.
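
As a rough illustration, the flags below are the ones used by `example.sh` and `integration_test.sh` in this repository (the exact flag for bidirectionality is not shown there, so check `--help`):

```
python train_model.py --train $train_path --dev $dev_path --output_dir $expt_dir \
    --optim rmsprop --batch_size 32 \
    --attention 'pre-rnn' --attention_method 'mlp' \
    --teacher_forcing 0.5
```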

## Evaluation and inference

4 changes: 2 additions & 2 deletions evaluate.py
@@ -74,8 +74,8 @@

############################################################################
# Prepare dataset and loss
src = SourceField()
tgt = TargetField(output_eos_used)
src = SourceField(batch_first=True)
tgt = TargetField(output_eos_used, batch_first=True)

tabular_data_fields = [('src', src), ('tgt', tgt)]

6 changes: 3 additions & 3 deletions example.sh
@@ -15,10 +15,10 @@ TF=0.5

# Start training
echo "Train model on example data"
python train_model.py --train $TRAIN_PATH --output_dir $EXPT_DIR --print_every $PRINT_EVERY --embedding_size $EMB_SIZE --hidden_size $H_SIZE --rnn_cell $CELL --n_layers $N_LAYERS --epoch $EPOCH --print_every $PRINT_EVERY --teacher_forcing $TF --attention 'pre-rnn' --attention_method 'mlp'
python3 train_model.py --train $TRAIN_PATH --output_dir $EXPT_DIR --print_every $PRINT_EVERY --embedding_size $EMB_SIZE --hidden_size $H_SIZE --rnn_cell $CELL --n_layers $N_LAYERS --epoch $EPOCH --print_every $PRINT_EVERY --teacher_forcing $TF --attention 'pre-rnn' --attention_method 'mlp'

echo "\n\nEvaluate model on test data"
python evaluate.py --checkpoint_path $EXPT_DIR/$(ls -t $EXPT_DIR/ | head -1) --test_data $TRAIN_PATH
python3 evaluate.py --checkpoint_path $EXPT_DIR/$(ls -t $EXPT_DIR/ | head -1) --test_data $TRAIN_PATH

echo "\n\nRun in inference mode"
python infer.py --checkpoint_path $EXPT_DIR/$(ls -t $EXPT_DIR/ | head -1)
python3 infer.py --checkpoint_path $EXPT_DIR/$(ls -t $EXPT_DIR/ | head -1)
7 changes: 1 addition & 6 deletions infer.py
@@ -7,11 +7,6 @@
from machine.evaluator import Predictor
from machine.util.checkpoint import Checkpoint

try:
raw_input # Python 2
except NameError:
raw_input = input # Python 3

parser = argparse.ArgumentParser()

parser.add_argument('--checkpoint_path',
@@ -52,7 +47,7 @@
exit()

while True:
seq_str = raw_input("\n\nType in a source sequence: ")
seq_str = input("\n\nType in a source sequence: ")
if seq_str == 'q':
exit()
seq = seq_str.strip().split()
6 changes: 3 additions & 3 deletions integration_test.sh
@@ -29,20 +29,20 @@ rm $EXPT_DIR/log_test

# Resume training
echo "\n\nTest resume training"
python3 train_model.py --train $TRAIN_PATH --dev $DEV_PATH --resume --output_dir $EXPT_DIR --print_every 50 --embedding_size $EMB_SIZE --hidden_size $H_SIZE --rnn_cell $CELL --epoch $EPOCH --load_checkpoint $(ls -t $EXPT_DIR | head -1) --save_every $CP_EVERY --optim rmsprop --batch_size 6
python3 train_model.py --train $TRAIN_PATH --dev $DEV_PATH --resume-training --output_dir $EXPT_DIR --print_every 50 --embedding_size $EMB_SIZE --hidden_size $H_SIZE --rnn_cell $CELL --epoch $EPOCH --load_checkpoint $(ls -t $EXPT_DIR | head -1) --save_every $CP_EVERY --optim rmsprop --batch_size 6
ERR=$((ERR+$?)); EX=$((EX+1))

echo "\n\nTest train from checkpoint"
# Load checkpoint
python3 train_model.py --train $TRAIN_PATH --dev $DEV_PATH --output_dir $EXPT_DIR --print_every 50 --epoch $EPOCH --embedding_size $EMB_SIZE --hidden_size $H_SIZE --rnn_cell $CELL --load_checkpoint $(ls -t $EXPT_DIR/ | head -1) --save_every $CP_EVERY
ERR=$((ERR+$?)); EX=$((EX+1))

# evaluate.py
# # evaluate.py
echo "\n\nTest evaluator"
python3 evaluate.py --checkpoint_path $EXPT_DIR/$(ls -t $EXPT_DIR/ | head -1) --test_data $DEV_PATH --batch_size 15
ERR=$((ERR+$?)); EX=$((EX+1))

# test training without dev set
#test training without dev set
echo "\n\nTest training without dev set"
python3 train_model.py --train $TRAIN_PATH --output_dir $EXPT_DIR --print_every 10 --embedding_size $EMB_SIZE --hidden_size $H_SIZE --rnn_cell $CELL --epoch $EPOCH --save_every $CP_EVERY
ERR=$((ERR+$?)); EX=$((EX+1))
6 changes: 3 additions & 3 deletions machine/trainer/supervised_trainer.py
@@ -135,7 +135,7 @@ def _train_epoches(self, data, n_epochs,
def train(self, model, data,
dev_data,
num_epochs=5,
resume=False,
resume_training=False,
monitor_data={},
optimizer=None,
teacher_forcing_ratio=0,
@@ -156,7 +156,7 @@ def train(self, model, data,
overwritten by the model loaded from the latest checkpoint.
data (torchtext.data.Iterator): torchtext iterator object to train on
num_epochs (int, optional): number of epochs to run (default 5)
resume(bool, optional): resume training with the latest checkpoint, (default False)
resume_training(bool, optional): resume training with the latest checkpoint up until the number of epochs (default False)
dev_data (torchtext.data.Iterator): dev/validation set iterator
Note: must not pass in the train iterator here as this gets evaluated during training (in between batches)
If you want to evaluate on the full train set during training, make two iterators and pass the second one here
@@ -180,7 +180,7 @@ def train(self, model, data,
self.set_local_parameters(random_seed, losses, metrics,
loss_weights, checkpoint_every, print_every)
# If training is set to resume
if resume:
if resume_training:
resume_checkpoint = Checkpoint.load(checkpoint_path)
model = resume_checkpoint.model
self.model = model
11 changes: 8 additions & 3 deletions machine/util/callbacks/callback_container.py
@@ -22,12 +22,13 @@ def set_trainer(self, trainer):
callback.set_trainer(trainer)

def set_info(self, start_step, start_epoch,
steps_per_epoch, total_steps):
steps_per_epoch, total_steps,
step_elapsed=0):
self.info['start_step'] = start_step
self.info['step'] = start_step
self.info['start_epoch'] = start_epoch
self.info['epoch'] = start_epoch
self.info['step_elapsed'] = 0
self.info['step_elapsed'] = step_elapsed
self.info['steps_per_epoch'] = steps_per_epoch
self.info['total_steps'] = total_steps
self.info['print'] = False
@@ -109,11 +110,15 @@ def on_train_end(self):
callback.on_train_end(self.info)

# Gets log object from History call back
if getattr(callback, logs):
if hasattr(callback, 'logs'):
logs = callback.logs
return logs

def _evaluate_model_on_validation(self):
# No dev_set
if self.trainer.val_data is None:
return [], []

return self.trainer.evaluator.evaluate(self.trainer.model,
self.trainer.val_data,
self.trainer.get_batch_data)
6 changes: 4 additions & 2 deletions machine/util/callbacks/logger.py
@@ -28,13 +28,15 @@ def on_epoch_begin(self, info=None):
self.logger.info("Epoch: %d, Step: %d" % (info['epoch'], info['step']))

def on_epoch_end(self, info=None):

for loss in self.trainer.losses:
self.epoch_loss_avg[loss.log_name] = \
self.epoch_loss_total[loss.log_name] \
/ min(info['steps_per_epoch'], info['step'] - info['start_step'])
/ max(min(info['steps_per_epoch'], info['step'] - info['start_step']), 1)
self.epoch_loss_total[loss.log_name] = 0

if info['step_elapsed'] < 1:
self.logger.warning("0 Steps elapsed so avg. loss is 0")

loss_msg = ' '.join(
['%s: %.4f' % (loss.log_name, self.epoch_loss_avg[loss.log_name]) for loss in self.trainer.losses])

4 changes: 2 additions & 2 deletions test/test_checkpoint.py
@@ -46,12 +46,12 @@ def test_save_checkpoint_calls_torch_save(
os.path.join(chk_point.path, Checkpoint.TRAINER_STATE_NAME))
mock_torch.save.assert_any_call(mock_model,
os.path.join(chk_point.path, Checkpoint.MODEL_NAME))
self.assertEquals(2, mock_open.call_count)
self.assertEqual(2, mock_open.call_count)
mock_open.assert_any_call(os.path.join(
path, Checkpoint.INPUT_VOCAB_FILE), ANY)
mock_open.assert_any_call(os.path.join(
path, Checkpoint.OUTPUT_VOCAB_FILE), ANY)
self.assertEquals(2, mock_dill.dump.call_count)
self.assertEqual(2, mock_dill.dump.call_count)
mock_dill.dump.assert_any_call(mock_vocab,
mock_open.return_value.__enter__.return_value)

4 changes: 2 additions & 2 deletions test/test_evaluator.py
@@ -18,8 +18,8 @@ class TestPredictor(unittest.TestCase):

def setUp(self):
test_path = os.path.dirname(os.path.realpath(__file__))
src = SourceField()
tgt = TargetField()
src = SourceField(batch_first=True)
tgt = TargetField(batch_first=True)
self.dataset = torchtext.data.TabularDataset(
path=os.path.join(test_path, 'data/eng-fra.txt'), format='tsv',
fields=[('src', src), ('tgt', tgt)],
6 changes: 3 additions & 3 deletions test/test_fields.py
@@ -9,7 +9,7 @@
class TestField(unittest.TestCase):

def test_sourcefield(self):
field = SourceField()
field = SourceField(batch_first=True)
self.assertTrue(isinstance(field, torchtext.data.Field))
self.assertTrue(field.batch_first)
self.assertTrue(field.include_lengths)
@@ -21,7 +21,7 @@ def test_sourcefield_with_wrong_setting(self):
self.assertTrue(field.include_lengths)

def test_targetfield(self):
field = TargetField()
field = TargetField(batch_first=True)
self.assertTrue(isinstance(field, torchtext.data.Field))
self.assertTrue(field.batch_first)

@@ -40,7 +40,7 @@ def test_targetfield_with_other_setting(self):
def test_targetfield_specials(self):
test_path = os.path.dirname(os.path.realpath(__file__))
data_path = os.path.join(test_path, 'data/eng-fra.txt')
field = TargetField()
field = TargetField(batch_first=True)
train = torchtext.data.TabularDataset(
path=data_path, format='tsv',
fields=[('src', torchtext.data.Field()), ('trg', field)]
4 changes: 2 additions & 2 deletions test/test_predictor.py
@@ -15,8 +15,8 @@ class TestPredictor(unittest.TestCase):
@classmethod
def setUpClass(self):
test_path = os.path.dirname(os.path.realpath(__file__))
src = SourceField()
trg = TargetField()
src = SourceField(batch_first=True)
trg = TargetField(batch_first=True)
dataset = torchtext.data.TabularDataset(
path=os.path.join(test_path, 'data/eng-fra.txt'), format='tsv',
fields=[('src', src), ('trg', trg)],
8 changes: 4 additions & 4 deletions test/test_supervised_trainer.py
@@ -14,8 +14,8 @@ class TestSupervisedTrainer(unittest.TestCase):

def setUp(self):
test_path = os.path.dirname(os.path.realpath(__file__))
src = SourceField()
tgt = TargetField()
src = SourceField(batch_first=True)
tgt = TargetField(batch_first=True)
self.dataset = torchtext.data.TabularDataset(
path=os.path.join(test_path, 'data/eng-fra.txt'), format='tsv',
fields=[('src', src), ('tgt', tgt)],
@@ -87,13 +87,13 @@ def test_loading_optimizer(
trainer = SupervisedTrainer()

trainer.train(mock_model, self.data_iterator, n_epoches,
resume=True, checkpoint_path='dummy', optimizer='sgd')
resume_training=True, checkpoint_path='dummy', optimizer='sgd')

self.assertFalse(
sgd.called, "Failed to not call Optimizer() when optimizer should be loaded from checkpoint")

trainer.train(mock_model, self.data_iterator, n_epoches,
resume=False, checkpoint_path='dummy', optimizer='sgd')
resume_training=False, checkpoint_path='dummy', optimizer='sgd')

sgd.assert_called()

4 changes: 2 additions & 2 deletions test_tasks.py
@@ -107,8 +107,8 @@ def init_logging(parameters):


def prepare_iters(parameters, train_path, test_paths, valid_path, batch_size, eval_batch_size=512):
src = SourceField()
tgt = TargetField(include_eos=False)
src = SourceField(batch_first=True)
tgt = TargetField(include_eos=False, batch_first=True)
tabular_data_fields = [('src', src), ('tgt', tgt)]

max_len = parameters['max_len']