
Changes in functions for generation and an example of vertical plot with additional charts #182

Open · wants to merge 24 commits into base: master

Commits (24)
83a15bb
changes in functions generate_samples and generate_counts to allow ge…
ennanco Feb 24, 2022
5dab608
added unit tests for generate_samples and generate_counts functions
ennanco Feb 28, 2022
3eb9c65
Repaired problems with some tests
ennanco Mar 2, 2022
e40d6b9
Repaired several examples due to the inclusion of the new generate_s…
ennanco Mar 2, 2022
7f5f918
Change the string format to make it compatible with Python v2
ennanco Mar 2, 2022
e0d9df0
Adding compatibility in generate_samples for python v2
ennanco Mar 2, 2022
4e18668
Adding adaptations to make it backward compatible with the examples
ennanco Mar 19, 2022
b0c9c7b
Fixing style
ennanco Mar 19, 2022
3374d0d
Fixing test_data.py according to python style sheet
ennanco Mar 21, 2022
9c546e0
Fixing indentation
ennanco Mar 21, 2022
7805d3f
Fixing indentation
ennanco Mar 21, 2022
4cab536
Fixing docstring in generate_counts and changing generate_samples for …
ennanco Mar 22, 2022
b1d0ec6
Fixing spacing style in some comments
ennanco Mar 22, 2022
f64fb19
Adding unit test for generate_data
ennanco Mar 22, 2022
5443acb
Adding unit test for generate_data
ennanco Mar 22, 2022
5731bc6
Adding unit test for generate_data
ennanco Mar 22, 2022
ede49b5
Merge branch 'jnothman:master' into master
ennanco Jun 20, 2022
062e337
Update upsetplot/data.py
ennanco Jan 2, 2023
3d884c4
Update upsetplot/data.py
ennanco Jan 2, 2023
b000f15
Update upsetplot/data.py
ennanco Jan 2, 2023
684be8c
Update upsetplot/data.py
ennanco Jan 2, 2023
ce55bd0
Update examples/plot_vertical.py
ennanco Jan 2, 2023
35ef9bf
Merge branch 'jnothman:master' into master
ennanco Jan 3, 2023
746f679
Merge branch 'jnothman:master' into master
ennanco Jan 11, 2023
15 changes: 14 additions & 1 deletion examples/plot_vertical.py
@@ -7,7 +7,7 @@
"""

from matplotlib import pyplot as plt
from upsetplot import generate_counts, plot
from upsetplot import generate_counts, plot, plotting

example = generate_counts()
plot(example, orientation='vertical')
@@ -26,3 +26,16 @@
show_percentages=True)
plt.suptitle('With counts and percentages shown')
plt.show()

#########################################################################
# A vertical UpSet plot with additional plots
# and some visual parameters tuned
example = generate_counts(extra_columns=2)
Owner comment: I think using generate_samples here makes more sense? But maybe 10k samples is a lot for three swarm plots.

fig = plotting.UpSet(example, orientation='vertical',
show_counts=True, facecolor="grey",
element_size=75)
fig.add_catplot('swarm', 'value', palette='colorblind')
fig.add_catplot('swarm', 'value1', palette='colorblind')
fig.add_catplot('swarm', 'value2', palette='colorblind')
fig.plot()
plt.show()
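
A variant along the lines of the owner's comment above could look like the sketch below. This is an editor's illustration, not part of the PR: it swaps generate_counts for generate_samples, uses an arbitrary smaller n_samples (500) so the three swarm plots stay legible, and passes subset_size='count' so the intersection bars count samples.

# Editor's sketch, not part of this PR: per-sample data instead of
# pre-aggregated counts, with a reduced sample size.
from matplotlib import pyplot as plt
from upsetplot import generate_samples, plotting

example = generate_samples(n_samples=500, extra_columns=2)
fig = plotting.UpSet(example, orientation='vertical',
                     subset_size='count',
                     show_counts=True, facecolor="grey",
                     element_size=75)
fig.add_catplot('swarm', 'value', palette='colorblind')
fig.add_catplot('swarm', 'value1', palette='colorblind')
fig.add_catplot('swarm', 'value2', palette='colorblind')
fig.plot()
plt.show()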
42 changes: 31 additions & 11 deletions upsetplot/data.py
@@ -8,7 +8,7 @@
import numpy as np


def generate_samples(seed=0, n_samples=10000, n_categories=3):
def generate_samples(seed=0, n_samples=10000, n_categories=3, extra_columns=0):
"""Generate artificial samples assigned to set intersections

Parameters
@@ -19,12 +19,16 @@ def generate_samples(seed=0, n_samples=10000, n_categories=3):
Number of samples to generate
n_categories : int
Number of categories (named "cat0", "cat1", ...) to generate
extra_columns : int
Number of additional value columns (named "value1", "value2", ...)
to generate for each sample

Returns
-------
DataFrame
Field 'value' is a weight or score for each element.
Field 'index' is a unique id for each element.
Fields 'value1', 'value2', ... are additional values, present when extra_columns > 0.
Index includes a boolean indicator mask for each category.

Note: Further fields may be added in future versions.
@@ -34,19 +34,25 @@
generate_counts : Generates the counts for each subset of categories
corresponding to these samples.
"""
assert extra_columns >= 0, 'extra_columns parameter should be non-negative'
rng = np.random.RandomState(seed)
df = pd.DataFrame({'value': np.zeros(n_samples)})
len_samples = 1 + extra_columns
df = pd.DataFrame(np.zeros((n_samples, len_samples)))
valuename_lst = [f'value{i}' if i > 0 else 'value' for i in
range(len_samples)]
df.columns = valuename_lst

Owner comment: Can we just call this variable columns or column_names?

for i in range(n_categories):
r = rng.rand(n_samples)
df['cat%d' % i] = r > rng.rand()
df['value'] += r
r = rng.rand(n_samples, len_samples)
df[f'cat{i}'] = r[:, 0] > rng.rand()
df[valuename_lst] += r

Owner comment (on lines +50 to +51): This puzzles me. We're only using the first column of a random matrix of values, and extra_columns is unused.

Owner comment: Don't worry about making the values correlate with the categories. Just put in the docstring that the extra column values may change in a future version so we have licence to do it later.

df.reset_index(inplace=True)
df.set_index(['cat%d' % i for i in range(n_categories)], inplace=True)
df.set_index([f'cat{i}' for i in range(n_categories)], inplace=True)
return df


def generate_counts(seed=0, n_samples=10000, n_categories=3):
def generate_counts(seed=0, n_samples=10000, n_categories=3, extra_columns=0):
"""Generate artificial counts corresponding to set intersections

Parameters
@@ -57,20 +67,30 @@ def generate_counts(seed=0, n_samples=10000, n_categories=3):
Number of samples to generate statistics over
n_categories : int
Number of categories (named "cat0", "cat1", ...) to generate
extra_columns : int
Number of additional value columns (named "value1", "value2", ...)
used to generate each sample

Returns
-------
Series
Counts indexed by boolean indicator mask for each category.
Series or DataFrame
A Series of counts indexed by boolean indicator mask for each category,
when ``extra_columns`` is 0. Otherwise a DataFrame with column ``value``
equivalent to the value produced when ``extra_columns`` is 0, as well as
further random variables ``value1``, ``value2``, ... for the extra columns.

See Also
--------
generate_samples : Generates a DataFrame of samples that these counts are
derived from.
"""
assert extra_columns >= 0, 'extra_columns parameter should be non-negative'
df = generate_samples(seed=seed, n_samples=n_samples,
n_categories=n_categories)
return df.value.groupby(level=list(range(n_categories))).count()
n_categories=n_categories,
extra_columns=extra_columns)
df.drop('index', axis=1, inplace=True)
df = df if extra_columns > 0 else df.value
return df.groupby(level=list(range(n_categories))).count()
Owner comment: I don't think counting is meaningful for the extra columns. Maybe we should use a different aggregate?

Owner comment: Or maybe we shouldn't offer this functionality in generate_counts, making things somewhat simpler.



def generate_data(seed=0, n_samples=10000, n_sets=3, aggregated=False):
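One possible way to act on the aggregate comments above, sketched by the editor purely for illustration (the helper name generate_counts_sketch is hypothetical and does not exist in the PR): keep counting the base 'value' column, but average the extra columns instead of counting them.

# Editor's sketch, not part of this PR: count 'value' per intersection,
# aggregate the extra columns with a mean instead of a count.
from upsetplot.data import generate_samples

def generate_counts_sketch(seed=0, n_samples=10000, n_categories=3,
                           extra_columns=0):
    df = generate_samples(seed=seed, n_samples=n_samples,
                          n_categories=n_categories,
                          extra_columns=extra_columns)
    # Drop the per-sample 'index' column and group by the category mask.
    grouped = df.drop(columns='index').groupby(
        level=list(range(n_categories)))
    if extra_columns == 0:
        return grouped['value'].count()
    aggregates = {'value': 'count'}
    aggregates.update({'value%d' % i: 'mean'
                       for i in range(1, extra_columns + 1)})
    return grouped.agg(aggregates)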
80 changes: 75 additions & 5 deletions upsetplot/tests/test_data.py
@@ -3,10 +3,11 @@
import pandas as pd
import numpy as np
from distutils.version import LooseVersion
from pandas.util.testing import (assert_series_equal, assert_frame_equal,
assert_index_equal)
from pandas.testing import (assert_series_equal, assert_frame_equal,
assert_index_equal)
from upsetplot import (from_memberships, from_contents, from_indicators,
generate_data)
from upsetplot.data import (generate_samples, generate_counts)


@pytest.mark.parametrize('typ', [set, list, tuple, iter])
@@ -207,6 +208,75 @@ def test_from_indicators_equivalence(indicators, data):
from_memberships([[], ["cat1"], []], data))


def test_generate_data_warning():
with pytest.warns(DeprecationWarning):
generate_data()
class TestGenerateData:
def test_generate_data_warning(self):
'''
Check the DeprecationWarning raised by the function
'''
with pytest.warns(DeprecationWarning):
generate_data()

def test_generate_default(self):
'''
Check that data generated with the default arguments has the
correct dimensions
'''
result = generate_data()
assert len(result.index[0]) == 3
assert result.shape == (10_000,)

def test_generate_samples_reproducibility(self):
'''
Check that results are reproducible when the same random
seed is set
'''
seed = np.random.randint(0, 100)
assert generate_samples(seed=seed).equals(generate_samples(seed=seed))

@pytest.mark.parametrize("n_samples", [100, 1_000, 10_000])
@pytest.mark.parametrize("n_categories", [1, 3])
@pytest.mark.parametrize("extra_columns", [0, 2])
def test_generate_samples_shapes(self, n_samples, n_categories,
extra_columns):
'''
Check sample generation for different sample sizes and arguments.
NOTE: generate_samples returns one extra 'index' column in
addition to the value columns, hence extra_columns + 2 below
'''
result = generate_samples(n_samples=n_samples,
n_categories=n_categories,
extra_columns=extra_columns)

if type(result.index[0]) is tuple:
assert len(result.index[0]) == n_categories
else:
assert result.index.is_boolean()

assert result.shape == (n_samples, extra_columns + 2)

@pytest.mark.parametrize("n_samples", [100, 1_000, 10_000])
@pytest.mark.parametrize("extra_columns", [0, 2])
def test_generate_counts(self, n_samples, extra_columns):
'''
Test of the function generate_counts
which internally uses generate_samples
'''
result = generate_counts(n_samples=n_samples,
extra_columns=extra_columns)
if extra_columns:
assert len(result.columns) == extra_columns + 1
assert (result.sum(axis=0) == n_samples).all()

@pytest.mark.parametrize("aggregated", [True, False])
def test_generate_data(self, aggregated):
'''
Test the return value of the deprecated function
generate_data
'''
data = generate_data(aggregated=aggregated)
if aggregated:
assert data.equals(generate_counts())
else:
assert data.equals(generate_samples().value)
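
For reference, the shapes these tests assert can be checked quickly on this branch; a small editor's sketch (the comments reflect the PR's behaviour of keeping the 'index' column in generate_samples and dropping it in generate_counts):

# Editor's sketch: quick interactive check of the shapes asserted above.
from upsetplot.data import generate_samples, generate_counts

samples = generate_samples(n_samples=100, extra_columns=2)
print(samples.shape)           # (100, 4): 'index', 'value', 'value1', 'value2'
print(list(samples.columns))   # ['index', 'value', 'value1', 'value2']

counts = generate_counts(n_samples=100, extra_columns=2)
print(list(counts.columns))    # ['value', 'value1', 'value2']
print(int(counts['value'].sum()))   # 100, i.e. n_samples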
1 change: 0 additions & 1 deletion upsetplot/tests/test_upsetplot.py
@@ -43,7 +43,6 @@ def get_all_texts(mpl_artist):
'sort_categories_by',
[None, 'input', '-input', 'cardinality', '-cardinality'])
def test_process_data_series(x, sort_by, sort_categories_by):
assert x.name == 'value'
for subset_size in ['auto', 'sum', 'count']:
for sum_over in ['abc', False]:
with pytest.raises(ValueError, match='sum_over is not applicable'):