Skip to content

Commit

Permalink
Add an option to merge on matching column names.
Browse files Browse the repository at this point in the history
  • Loading branch information
kalekundert committed Dec 9, 2019
1 parent f6af979 commit 3ccc294
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 41 deletions.
74 changes: 43 additions & 31 deletions bio96/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,9 @@ def load(toml_path, data_loader=None, merge_cols=None,
(which is the most typical use-case), that data frame will also contain
columns for any data associated with each well.
:param toml_path:
:param str,pathlib.Path toml_path:
The path to a file describing the layout of one or more plates. See the
:doc:`file_format` section for details about this file.
:type toml_path: str or pathlib.Path
:param callable data_loader:
Indicates that `load()` should attempt to load the actual data
Expand All @@ -49,7 +48,7 @@ def load(toml_path, data_loader=None, merge_cols=None,
Note that specifying a data loader implies that **path_required** is
True.
:param dict merge_cols:
:param bool,dict merge_cols:
Indicates that `load()` should attempt to merge the plate layout and the
actual data associated with it into a single data frame. This
functionality requires several conditions to be met:
Expand All @@ -67,20 +66,27 @@ def load(toml_path, data_loader=None, merge_cols=None,
__ http://vita.had.co.nz/papers/tidy-data.html
The **merge_cols** argument itself specifies which columns to use when
The **merge_cols** argument specifies which columns to use when
merging the data frames representing the layout and the actual data
(i.e. the two data frames that would be returned if **data_loader** was
specified but **merge_cols** was not) into one. The argument is a
dictionary in which the key-value pairs are the names of columns that
identify wells in the same way between the two data frames.
Each key should be one of the columns from the data frame representing
the layout loaded from the TOML file by ``bio96``. This data frame has
8 columns which identify the wells: ``plate``, ``path``, ``well``,
``well0``, ``row``, ``col``, ``row_i``, ``row_j``. See the "Returns"
section below for more details on the differences between these columns.
Note that the ``path`` column is included in the merge automatically and
never has to be specified.
specified but **merge_cols** was not) into one. The argument can be either
a bool or a dictionary:
If *False* (or falsey, e.g. `None`, `{}`, etc.), the data frames will be
returned separately and not be merged. This is the default behavior.
If *True*, the data frames will be merged using any columns that share
the same name between the two data frames. For example, the layout has
a column named ``well``, so if the actual data also has a column named
``well``, the merge would happen on those columns.
If a dictionary, each key should be one of the columns from the data
frame representing the layout loaded from the TOML file by ``bio96``.
This data frame has 8 columns which identify the wells: ``plate``,
``path``, ``well``, ``well0``, ``row``, ``col``, ``row_i``, ``col_j``.
See the "Returns" section below for more details on the differences
between these columns. Note that the ``path`` column is included in the
merge automatically and never has to be specified.
Each value should be one of the columns from the data frame representing
the actual data. This data frame will have whatever columns were
Expand Down Expand Up @@ -219,24 +225,28 @@ def add_extras(*args):
data = data.append(df, sort=False)

## Merge the layout and the data into a single data frame:
if merge_cols is None:
if not merge_cols:
return add_extras(layout, data)

def check_merge_cols(cols, known_cols, attrs):
unknown_cols = set(cols) - set(known_cols)
if unknown_cols:
raise ValueError(f"Cannot merge on {quoted_join(unknown_cols)}. Allowed {attrs} of the `merge_cols` dict: {quoted_join(known_cols)}.")
return list(cols)

left_ok = 'well', 'well0', 'row', 'col', 'row_i', 'col_i', 'plate'
left_on = check_merge_cols(merge_cols.keys(), left_ok, 'keys')
right_on = check_merge_cols(merge_cols.values(), data.columns, 'values')

merged = pd.merge(
layout, data,
left_on=left_on + ['path'],
right_on=right_on + ['path'],
)
if merge_cols is True:
# Let pandas choose which columns to merge on.
kwargs = {}
else:
def check_merge_cols(cols, known_cols, attrs):
unknown_cols = set(cols) - set(known_cols)
if unknown_cols:
raise ValueError(f"Cannot merge on {quoted_join(unknown_cols)}. Allowed {attrs} of the `merge_cols` dict: {quoted_join(known_cols)}.")
return list(cols)

left_ok = 'well', 'well0', 'row', 'col', 'row_i', 'col_i', 'plate'
kwargs = {
'left_on': ['path'] + check_merge_cols(
merge_cols.keys(), left_ok, 'keys'),
'right_on': ['path'] + check_merge_cols(
merge_cols.values(), data.columns, 'values'),
}

merged = pd.merge(layout, data, **kwargs)
return add_extras(merged)

except ConfigError as err:
Expand Down Expand Up @@ -622,3 +632,5 @@ def user(self):
if k not in self.special.values()
}



2 changes: 1 addition & 1 deletion docs/file_format.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ and are included as columns in the data frame returned by ``load()``.
A block containing any options that apply to the whole layout, but not any
wells in particular. For example, this would be a good place to specify
plotting parameters or data analysis algorithms. The entries in this block
can be anything. Thw whole block is simply parsed into a dictionary and
can be anything. The whole block is simply parsed into a dictionary and
returned by the `load()` function.

``alert``
Expand Down
2 changes: 2 additions & 0 deletions tests/pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[pytest]
filterwarnings = ignore::DeprecationWarning
52 changes: 43 additions & 9 deletions tests/test_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@

DIR = Path(__file__).parent/'toml'

def read_excel_and_rename(path):
    """
    Data loader used by the ``merge_cols=True`` tests.

    Reads the spreadsheet at *path* with `pd.read_excel()` and returns the
    result of calling ``rename({'Well': 'well'})`` on it.

    NOTE(review): the mapping is passed positionally, without ``columns=``
    or ``axis=``.  Per the pandas documentation, `DataFrame.rename()` then
    applies the mapping to the index labels, so the ``Well`` column header
    is presumably left untouched.  The assertions in this file expect both
    ``well`` and ``Well`` in the merged row, so confirm what is intended
    before switching to ``columns=``.
    """
    frame = pd.read_excel(path)
    return frame.rename({'Well': 'well'})


def test_one_well():
labels = bio96.load(DIR/'one_well_xy.toml')
assert row(labels, 'well == "A1"') == dict(
Expand Down Expand Up @@ -75,6 +79,24 @@ def test_one_well():
Data='xy',
)

df = bio96.load(
DIR/'one_well_xy.toml',
data_loader=read_excel_and_rename,
merge_cols=True,
path_guess='{0.stem}.xlsx',
)
assert row(df, 'well == "A1"') == dict(
path=DIR/'one_well_xy.xlsx',
well='A1',
Well='A1',
well0='A01',
row='A', col='1',
row_i=0, col_j=0,
x=1,
y=1,
Data='xy',
)

@pytest.mark.parametrize(
"extras_arg,expected", [
('extras', {'a': 1, 'b': 2}),
Expand Down Expand Up @@ -119,15 +141,7 @@ def test_one_well_with_extras(extras_arg, expected):
)

# Merged data
df, extras = bio96.load(
DIR/'one_well_xy_extras.toml',
data_loader=pd.read_excel,
merge_cols={'well': 'Well'},
path_guess='{0.stem}.xlsx',
extras=extras_arg,
)
assert extras == expected
assert row(df, 'well == "A1"') == dict(
a1_expected = dict(
path=DIR/'one_well_xy_extras.xlsx',
well='A1',
Well='A1',
Expand All @@ -139,6 +153,26 @@ def test_one_well_with_extras(extras_arg, expected):
Data='xy',
)

df, extras = bio96.load(
DIR/'one_well_xy_extras.toml',
data_loader=pd.read_excel,
merge_cols={'well': 'Well'},
path_guess='{0.stem}.xlsx',
extras=extras_arg,
)
assert extras == expected
assert row(df, 'well == "A1"') == a1_expected

df, extras = bio96.load(
DIR/'one_well_xy_extras.toml',
data_loader=read_excel_and_rename,
merge_cols=True,
path_guess='{0.stem}.xlsx',
extras=extras_arg,
)
assert extras == expected
assert row(df, 'well == "A1"') == a1_expected

def test_one_plate():
labels = bio96.load(DIR/'one_plate.toml')
assert row(labels, 'well == "A1"') == dict(
Expand Down

0 comments on commit 3ccc294

Please sign in to comment.