Skip to content

Commit

Permalink
Allow specification of subset sizes as percentage (#264)
Browse files Browse the repository at this point in the history
  • Loading branch information
jnothman committed Dec 29, 2023
1 parent da64c81 commit 1573a90
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 7 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@ What's new in version 0.9
shading of rows in the intersection matrix, and bars in the totals plot.
(:issue:`261` with thanks to :user:`Marcel Albus <maralbus>`).
- Ability to disable totals plot with `totals_plot_elements=0`. (:issue:`246`)
- Ability to set totals y axis label (:issue:`243`)
- Added ``max_subset_rank`` to get only n most populous subsets. (:issue:`253`)
- Added support for ``min_subset_size`` and ``max_subset_size`` specified as
a percentage. (:issue:`264`)

What's new in version 0.8
-------------------------
Expand Down
26 changes: 22 additions & 4 deletions upsetplot/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,17 +191,27 @@ class UpSet:
If `subset_size='sum'` or `'auto'`, then the intersection size is the
sum of the specified field in the `data` DataFrame. If a Series, only
None is supported and its value is summed.
min_subset_size : int, optional
min_subset_size : int or "number%", optional
Minimum size of a subset to be shown in the plot. All subsets with
a size smaller than this threshold will be omitted from plotting.
This may be specified as a percentage
using a string, like "50%".
Size may be a sum of values, see `subset_size`.
.. versionadded:: 0.5
max_subset_size : int, optional
.. versionchanged:: 0.9
Support percentages
max_subset_size : int or "number%", optional
Maximum size of a subset to be shown in the plot. All subsets with
a size greater than this threshold will be omitted from plotting.
This may be specified as a percentage
using a string, like "50%".
.. versionadded:: 0.5
.. versionchanged:: 0.9
Support percentages
max_subset_rank : int, optional
Limit to the top N ranked subsets in descending order of size.
All tied subsets are included.
Expand Down Expand Up @@ -379,10 +389,18 @@ def style_subsets(
absent : str or list of str, optional
Category or categories that must not be present in subsets for
styling.
min_subset_size : int, optional
min_subset_size : int or "number%", optional
Minimum size of a subset to be styled.
max_subset_size : int, optional
This may be specified as a percentage using a string, like "50%".
.. versionchanged:: 0.9
Support percentages
max_subset_size : int or "number%", optional
Maximum size of a subset to be styled.
This may be specified as a percentage using a string, like "50%".
.. versionchanged:: 0.9
Support percentages
max_subset_rank : int, optional
Limit to the top N ranked subsets in descending order of size.
All tied subsets are included.
Expand Down
28 changes: 25 additions & 3 deletions upsetplot/reformat.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,19 @@ def _scalar_to_list(val):
return val


def _check_percent(value, agg):
if not isinstance(value, str):
return value
try:
if value.endswith("%") and 0 <= float(value[:-1]) <= 100:
return float(value[:-1]) / 100 * agg.sum()
except ValueError:
pass
raise ValueError(
f"String value must be formatted as percentage between 0 and 100. Got {value}"
)


def _get_subset_mask(
agg,
min_subset_size,
Expand All @@ -104,6 +117,8 @@ def _get_subset_mask(
absent,
):
"""Get a mask over subsets based on size, degree or category presence"""
min_subset_size = _check_percent(min_subset_size, agg)
max_subset_size = _check_percent(max_subset_size, agg)
subset_mask = True
if min_subset_size is not None:
subset_mask = np.logical_and(subset_mask, agg >= min_subset_size)
Expand Down Expand Up @@ -235,13 +250,20 @@ def query(
absent : str or list of str, optional
Category or categories that must not be present in subsets for
styling.
min_subset_size : int, optional
min_subset_size : int or "number%", optional
Minimum size of a subset to be reported. All subsets with
a size smaller than this threshold will be omitted from
category_totals and data.
category_totals and data. This may be specified as a percentage
using a string, like "50%".
Size may be a sum of values, see `subset_size`.
max_subset_size : int, optional
.. versionchanged:: 0.9
Support percentages
max_subset_size : int or "number%", optional
Maximum size of a subset to be reported. This may be specified as a
percentage using a string, like "50%".
.. versionchanged:: 0.9
Support percentages
max_subset_rank : int, optional
Limit to the top N ranked subsets in descending order of size.
All tied subsets are included.
Expand Down
23 changes: 23 additions & 0 deletions upsetplot/tests/test_upsetplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -768,6 +768,13 @@ def test_index_must_be_bool(x):
(True, True, True): 990,
},
),
(
{"min_subset_size": "15%", "max_subset_size": "30.1%"},
{
(True, False, False): 884,
(True, True, True): 990,
},
),
(
{"min_degree": 2},
{
Expand Down Expand Up @@ -853,6 +860,22 @@ def test_filter_subsets_max_subset_rank_tie():
assert cur.shape[0] == full.shape[0]


@pytest.mark.parametrize("value", ["1", "-1%", "1%%", "%1", "hello"])
def test_bad_percentages(value):
    """Constructing UpSet with a malformed percentage string raises ValueError."""
    samples = generate_samples(seed=0, n_samples=5, n_categories=3)
    # The error message is expected to mention "percentage".
    with pytest.raises(ValueError, match="percentage"):
        UpSet(samples, min_subset_size=value)


@pytest.mark.parametrize(
"x",
[
Expand Down

0 comments on commit 1573a90

Please sign in to comment.