Add an option to merge on matching column names.

kalekundert · Dec 9, 2019 · 3ccc294 · 3ccc294
1 parent f6af979
commit 3ccc294
Show file tree

Hide file tree

Showing 4 changed files with 89 additions and 41 deletions.
diff --git a/bio96/load.py b/bio96/load.py
@@ -36,10 +36,9 @@ def load(toml_path, data_loader=None, merge_cols=None,
     (which is the most typical use-case), that data frame will also contain 
     columns for any data associated with each well.
 
-    :param toml_path:
+    :param str,pathlib.Path toml_path:
       The path to a file describing the layout of one or more plates.  See the 
       :doc:`file_format` section for details about this file.
-    :type toml_path: str or pathlib.Path
 
     :param callable data_loader:
        Indicates that `load()` should attempt to load the actual data 
@@ -49,7 +48,7 @@ def load(toml_path, data_loader=None, merge_cols=None,
        Note that specifying a data loader implies that **path_required** is 
        True.
 
-    :param dict merge_cols:
+    :param bool,dict merge_cols:
        Indicates that `load()` should attempt to merge the plate layout and the 
        actual data associated with it into a single data frame.  This 
        functionality requires several conditions to be met:
@@ -67,20 +66,27 @@ def load(toml_path, data_loader=None, merge_cols=None,
        
        __ http://vita.had.co.nz/papers/tidy-data.html
 
-       The **merge_cols** argument itself specifies which columns to use when 
+       The **merge_cols** argument specifies which columns to use when 
        merging the data frames representing the layout and the actual data 
        (i.e. the two data frames that would be returned if **data_loader** was 
-       specified but **merge_cols** was not) into one.  The argument is a 
-       dictionary in which the key-value pairs are the names of columns that 
-       identify wells in the same way between the two data frames.
-
-       Each key should be one of the columns from the data frame representing 
-       the layout loaded from the TOML file by ``bio96``.  This data frame has 
-       8 columns which identify the wells: ``plate``, ``path``, ``well``, 
-       ``well0``, ``row``, ``col``, ``row_i``, ``row_j``.  See the "Returns" 
-       section below for more details on the differences between these columns.  
-       Note that the ``path`` column is included in the merge automatically and 
-       never has to be specified.
+       specified but **merge_cols** was not) into one.  The argument can either 
+       be a bool or a dictionary:
+
+       If *False* (or falsey, e.g. `None`, `{}`, etc.), the data frames will be 
+       returned separately and not be merged.  This is the default behavior.
+
+       If *True*, the data frames will be merged using any columns that share 
+       the same name between the two data frames.  For example, the layout has 
+       a column named ``well``, so if the actual data also has a column named 
+       ``well``, the merge would happen on those columns.
+
+       If a dictionary, each key should be one of the columns from the data 
+       frame representing the layout loaded from the TOML file by ``bio96``.  
+       This data frame has 8 columns which identify the wells: ``plate``, 
+       ``path``, ``well``, ``well0``, ``row``, ``col``, ``row_i``, ``col_j``.  
+       See the "Returns" section below for more details on the differences 
+       between these columns.  Note that the ``path`` column is included in the 
+       merge automatically and never has to be specified.
 
        Each value should be one of the columns from the data frame representing 
        the actual data.  This data frame will have whatever columns were 
@@ -219,24 +225,28 @@ def add_extras(*args):
            data = data.append(df, sort=False)
 
         ## Merge the layout and the data into a single data frame:
-        if merge_cols is None:
+        if not merge_cols:
             return add_extras(layout, data)
 
-        def check_merge_cols(cols, known_cols, attrs):
-            unknown_cols = set(cols) - set(known_cols)
-            if unknown_cols:
-                raise ValueError(f"Cannot merge on {quoted_join(unknown_cols)}.  Allowed {attrs} of the `merge_cols` dict: {quoted_join(known_cols)}.")
-            return list(cols)
-
-        left_ok = 'well', 'well0', 'row', 'col', 'row_i', 'col_i', 'plate'
-        left_on = check_merge_cols(merge_cols.keys(), left_ok, 'keys')
-        right_on = check_merge_cols(merge_cols.values(), data.columns, 'values')
-
-        merged = pd.merge(
-                layout, data,
-                left_on=left_on + ['path'],
-                right_on=right_on + ['path'],
-        )
+        if merge_cols is True:
+            # No explicit keys: pandas merges on every shared column name.
+            kwargs = {}
+        else:
+            def check_merge_cols(cols, known_cols, attrs):
+                """Return *cols* as a list; raise if any is not in *known_cols*."""
+                unknown_cols = set(cols) - set(known_cols)
+                if unknown_cols:
+                    raise ValueError(f"Cannot merge on {quoted_join(unknown_cols)}.  Allowed {attrs} of the `merge_cols` dict: {quoted_join(known_cols)}.")
+                return list(cols)
+
+            # Layout columns usable as keys; the column index is `col_j`.
+            left_ok = 'well', 'well0', 'row', 'col', 'row_i', 'col_j', 'plate'
+            left_on = check_merge_cols(merge_cols.keys(), left_ok, 'keys')
+            right_on = check_merge_cols(merge_cols.values(), data.columns, 'values')
+            # `path` is always part of the merge key on both sides.
+            kwargs = {'left_on': ['path'] + left_on, 'right_on': ['path'] + right_on}
+
+        merged = pd.merge(layout, data, **kwargs)
         return add_extras(merged)
 
     except ConfigError as err:
@@ -622,3 +632,5 @@ def user(self):
                 if k not in self.special.values()
         }
 
+
+
diff --git a/docs/file_format.rst b/docs/file_format.rst
@@ -64,7 +64,7 @@ and are included as columns in the data frame returned by ``load()``.
     A block containing any options that apply to the whole layout, but not any 
     wells in particular.  For example, this would be a good place to specify 
     plotting parameters or data analysis algorithms.  The entries in this block 
-    can be anything.  Thw whole block is simply parsed into a dictionary and 
+    can be anything.  The whole block is simply parsed into a dictionary and 
     returned by the `load()` function.
 
   ``alert``

diff --git a/tests/pytest.ini b/tests/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+filterwarnings = ignore::DeprecationWarning
diff --git a/tests/test_load.py b/tests/test_load.py
@@ -10,6 +10,10 @@
 
 DIR = Path(__file__).parent/'toml'
 
+def read_excel_and_rename(path):
+    # Data loader used by the `merge_cols=True` tests: reads the spreadsheet
+    # at *path* and passes {'Well': 'well'} to `DataFrame.rename()`.
+    # NOTE(review): a positional mapper is applied to the index by default,
+    # so the 'Well' column appears to be left as-is (the expected rows in
+    # this file still include `Well`); `columns=` may be intended -- confirm.
+    return pd.read_excel(path).rename({'Well': 'well'})
+
+
 def test_one_well():
     labels = bio96.load(DIR/'one_well_xy.toml')
     assert row(labels, 'well == "A1"') == dict(
@@ -75,6 +79,24 @@ def test_one_well():
             Data='xy',
     )
 
+    df = bio96.load(
+            DIR/'one_well_xy.toml',
+            data_loader=read_excel_and_rename,
+            merge_cols=True,
+            path_guess='{0.stem}.xlsx',
+    )
+    assert row(df, 'well == "A1"') == dict(
+            path=DIR/'one_well_xy.xlsx',
+            well='A1',
+            Well='A1',
+            well0='A01',
+            row='A', col='1',
+            row_i=0, col_j=0,
+            x=1,
+            y=1,
+            Data='xy',
+    )
+
 @pytest.mark.parametrize(
         "extras_arg,expected", [
             ('extras', {'a': 1, 'b': 2}),
@@ -119,15 +141,7 @@ def test_one_well_with_extras(extras_arg, expected):
     )
 
     # Merged data
-    df, extras = bio96.load(
-            DIR/'one_well_xy_extras.toml',
-            data_loader=pd.read_excel,
-            merge_cols={'well': 'Well'},
-            path_guess='{0.stem}.xlsx',
-            extras=extras_arg,
-    )
-    assert extras == expected
-    assert row(df, 'well == "A1"') == dict(
+    a1_expected = dict(
             path=DIR/'one_well_xy_extras.xlsx',
             well='A1',
             Well='A1',
@@ -139,6 +153,26 @@ def test_one_well_with_extras(extras_arg, expected):
             Data='xy',
     )
 
+    df, extras = bio96.load(
+            DIR/'one_well_xy_extras.toml',
+            data_loader=pd.read_excel,
+            merge_cols={'well': 'Well'},
+            path_guess='{0.stem}.xlsx',
+            extras=extras_arg,
+    )
+    assert extras == expected
+    assert row(df, 'well == "A1"') == a1_expected
+
+    df, extras = bio96.load(
+            DIR/'one_well_xy_extras.toml',
+            data_loader=read_excel_and_rename,
+            merge_cols=True,
+            path_guess='{0.stem}.xlsx',
+            extras=extras_arg,
+    )
+    assert extras == expected
+    assert row(df, 'well == "A1"') == a1_expected
+
 def test_one_plate():
     labels = bio96.load(DIR/'one_plate.toml')
     assert row(labels, 'well == "A1"') == dict(