automatically load Crop info from disk if available
jcmgray committed Jun 19, 2019
1 parent d31363c commit 1eb2b1d
Showing 3 changed files with 62 additions and 10 deletions.
1 change: 1 addition & 0 deletions docs/changelog.rst
@@ -14,6 +14,7 @@ v0.3.2 (Unreleased)
- Add PBS support to :meth:`xyzpy.Crop.qsub_grow` for distributed crop growing
- Add :func:`xyzpy.save_merge_ds` for manually aggregating datasets to disk
- Add ``allow_incomplete=True`` option to :meth:`xyzpy.Crop.reap` for gathering data even if the crop is not fully grown (:issue:`7`)
- Make new :class:`~xyzpy.Crop` instances automatically load information from disk by default if they have already been prepared/sown (:issue:`7`)


.. _whats-new.0.3.1:
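A minimal sketch of the autoload behaviour described in the last changelog entry above, assuming a crop named 'fn' has already been sown (and possibly partially grown) under a hypothetical directory '/path/to/crops':

from xyzpy import Crop

# instantiating a Crop afresh by name and location should now pick up
# the previously sown settings from disk automatically
crop = Crop(name='fn', parent_dir='/path/to/crops')

# an incomplete crop can then be gathered directly; missing batches are
# represented with np.nan (the exact return type depends on whether a
# runner/harvester was saved alongside the crop)
results = crop.reap(allow_incomplete=True)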
36 changes: 35 additions & 1 deletion tests/test_gen/test_batch.py
@@ -6,7 +6,7 @@
import xarray as xr
from numpy.testing import assert_allclose

from xyzpy import combo_runner, combo_runner_to_ds
from xyzpy import combo_runner, combo_runner_to_ds, Runner, Harvester
from xyzpy.gen.batch import (
XYZError,
Crop,
@@ -288,3 +288,37 @@ def test_reap_to_ds_allow_incomplete(self, fn, var_names, var_dims):
ds = crop.reap_combos_to_ds(var_names=var_names, var_dims=var_dims,
allow_incomplete=True)
assert ds.identical(ds_exp)

def test_new_ds_crop_loads_info_incomplete(self):
def fn(a, b):
return xr.Dataset({'sum': a + b, 'diff': a - b})

with TemporaryDirectory() as tdir:
disk_ds = os.path.join(tdir, 'test.h5')

combos = dict(a=[1], b=[1, 2, 3])
runner = Runner(fn, var_names=None)
harvester = Harvester(runner, disk_ds)
crop = harvester.Crop(name='fn', batchsize=1, parent_dir=tdir)
crop.sow_combos(combos)
for i in range(1, 3):
crop.grow(i)

# try creating the crop afresh
c = Crop(name='fn', parent_dir=tdir)
# crop's harvester should be loaded from disk
assert c.harvester is not None
assert c.harvester is not harvester
ds = c.reap(allow_incomplete=True)
assert isinstance(ds, xr.Dataset)
assert ds['diff'].isnull().sum() == 1
assert harvester.full_ds['diff'].isnull().sum() == 1

# try creating crop from harvester
c = harvester.Crop('fn', parent_dir=tdir)
# crop's harvester should still be harvester
assert c.harvester is not None
assert c.harvester is harvester
ds = c.reap(allow_incomplete=True)
assert isinstance(ds, xr.Dataset)
assert ds['diff'].isnull().sum() == 1
35 changes: 26 additions & 9 deletions xyzpy/gen/batch.py
@@ -141,6 +141,15 @@ def calc_clean_up_default_res(crop, clean_up, allow_incomplete):
return clean_up, default_result


def check_ready_to_reap(crop, allow_incomplete, wait):
if not (allow_incomplete or wait or crop.is_ready_to_reap()):
raise XYZError("This crop is not ready to reap yet - results are "
"missing. You can reap only finished batches by setting"
" ``allow_incomplete=True``, but be aware this will "
"represent all missing batches with ``np.nan`` and thus"
" might effect data-types.")


class Crop(object):
"""Encapsulates all the details describing a single 'crop', that is,
its location, name, and batch size/number. Also allows tracking of
@@ -214,6 +223,10 @@ def __init__(self, *,
self.location, self.name, self.parent_dir = \
parse_crop_details(self._fn, self.name, self.parent_dir)

# try loading crop information if it exists
if autoload and self.is_prepared():
self._sync_info_from_disk()

# Save function so it can be automatically loaded with all deps?
if (fn is None) and (save_fn is True):
raise ValueError("Must specify a function for it to be saved!")
@@ -301,7 +314,7 @@ def load_info(self):
else:
return joblib.load(sfile)

def _sync_info_from_disk(self):
def _sync_info_from_disk(self, only_missing=True):
"""Load information about the saved cases.
"""
settings = self.load_info()
@@ -314,10 +327,18 @@ def _sync_info_from_disk(self):
runner_pkl = settings['runner']
runner = None if runner_pkl is None else pickle.loads(runner_pkl)

self._fn, self.runner, self.harvester = \
fn, runner, harvester = \
parse_fn_runner_harvester(None, runner, harvester)

self.load_function()
# if the crop already has a harvester/runner (e.g. it was instantiated
# from one), by default don't overwrite it with the version from disk
if (self.runner is None) or (not only_missing):
self.runner = runner
if (self.harvester is None) or (not only_missing):
self.harvester = harvester

if self.fn is None:
self.load_function()

def save_function_to_disk(self):
"""Save the base function to disk using cloudpickle
@@ -510,9 +531,7 @@ def reap_combos(self, wait=False, clean_up=None, allow_incomplete=False):
results : nested tuple
'N-dimensional' tuple containing the results.
"""
if not (allow_incomplete or wait or self.is_ready_to_reap()):
raise XYZError("This crop is not ready to reap "
"yet - results are missing.")
check_ready_to_reap(self, allow_incomplete, wait)

clean_up, default_result = calc_clean_up_default_res(
self, clean_up, allow_incomplete
@@ -579,9 +598,7 @@ def reap_combos_to_ds(self,
xarray.Dataset
Multidimensional labelled dataset containing all the results.
"""
if not (allow_incomplete or wait or self.is_ready_to_reap()):
raise XYZError("This crop is not ready to reap "
"yet - results are missing.")
check_ready_to_reap(self, allow_incomplete, wait)

clean_up, default_result = calc_clean_up_default_res(
self, clean_up, allow_incomplete
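Both reap methods now delegate their readiness check to the shared check_ready_to_reap helper above, so reaping an unfinished crop raises the more informative XYZError. A small sketch of how calling code might handle this, using a placeholder crop name and directory:

from xyzpy.gen.batch import Crop, XYZError

crop = Crop(name='fn', parent_dir='/path/to/crops')

try:
    # raises XYZError while any batches are still missing
    results = crop.reap_combos()
except XYZError:
    # gather only the finished batches; missing results become np.nan,
    # which may change (e.g. upcast) the resulting data-types
    results = crop.reap_combos(allow_incomplete=True)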
