Skip to content

Commit

Permalink
Add geom_sina and stat_sina
Browse files Browse the repository at this point in the history
closes #221
  • Loading branch information
has2k1 committed Apr 22, 2019
1 parent 3ee8e8e commit 358cea0
Show file tree
Hide file tree
Showing 13 changed files with 350 additions and 5 deletions.
2 changes: 2 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ distinct visual aspects of the representation are controlled by the
geom_ribbon
geom_rug
geom_segment
geom_sina
geom_smooth
geom_spoke
geom_step
Expand Down Expand Up @@ -152,6 +153,7 @@ with a ``geom`` that can represent all or some of the computations.
stat_qq
stat_qq_line
stat_quantile
stat_sina
stat_smooth
stat_sum
stat_summary
Expand Down
1 change: 1 addition & 0 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ New Features
- :class:`~plotnine.geoms.geom_text` gained the ``adjust_text`` parameter,
and can now repel text.
- Added :class:`~plotnine.annotate.annotation_logticks`.
- Added :class:`~plotnine.geoms.geom_sina` and :class:`~plotnine.stats.stat_sina`.

Bug Fixes
*********
Expand Down
21 changes: 21 additions & 0 deletions plotnine/aes.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
}


# Sentinel compared against the 'group' column to decide whether
# the data has groups (see has_groups)
NO_GROUP = -1

# Calculated aesthetics searchers
# STAT_RE finds the opening of a ``stat(`` call; DOTS_RE captures the
# name wrapped in double dots, e.g. ``..count..``
STAT_RE = re.compile(r'\bstat\(')
DOTS_RE = re.compile(r'\.\.([a-zA-Z0-9_]+)\.\.')
Expand Down Expand Up @@ -424,3 +426,22 @@ def is_valid_aesthetic(value, ae):
# for special cases to be discovered and then coded
# for appropriately.
return False


def has_groups(data):
    """
    Check if data is grouped

    Parameters
    ----------
    data : dataframe
        Data. Must have a ``group`` column.

    Returns
    -------
    out : bool
        If True, the data has groups. Empty data has no groups.
    """
    if len(data) == 0:
        return False

    # The group column is all-or-nothing: if any row is equal to
    # NO_GROUP then all of them are, so inspecting one row is enough.
    # The row is looked up by position because the index is not
    # guaranteed to contain the label 0 (e.g. after filtering).
    return data['group'].iloc[0] != NO_GROUP
3 changes: 2 additions & 1 deletion plotnine/geoms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from .geom_ribbon import geom_ribbon
from .geom_rug import geom_rug
from .geom_segment import geom_segment
from .geom_sina import geom_sina
from .geom_smooth import geom_smooth
from .geom_spoke import geom_spoke
from .geom_step import geom_step
Expand All @@ -61,7 +62,7 @@
'geom_pointrange', 'geom_pointrange',
'geom_quantile', 'geom_qq', 'geom_qq_line',
'geom_polygon', 'geom_rect',
'geom_ribbon', 'geom_rug', 'geom_segment',
'geom_ribbon', 'geom_rug', 'geom_segment', 'geom_sina',
'geom_smooth', 'geom_spoke', 'geom_step', 'geom_text',
'geom_tile', 'geom_violin', 'geom_vline',
# other
Expand Down
34 changes: 34 additions & 0 deletions plotnine/geoms/geom_sina.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from ..doctools import document
from .geom_point import geom_point


@document
class geom_sina(geom_point):
    """
    Draw a sina plot

    A sina plot is a data visualization chart suitable for plotting
    any single variable in a multiclass dataset. It is an enhanced
    jitter strip chart, where the width of the jitter is controlled
    by the density distribution of the data within each class.

    {usage}

    Parameters
    ----------
    {common_parameters}

    See Also
    --------
    plotnine.stats.stat_sina

    References
    ----------
    Sidiropoulos, N., S. H. Sohi, T. L. Pedersen, B. T. Porse, O. Winther,
    N. Rapin, and F. O. Bagger. 2018.
    "SinaPlot: An Enhanced Chart for Simple and Truthful Representation of
    Single Observations over Multiple Classes."
    J. Comp. Graph. Stat 27: 673–76.
    """
    # Points are positioned by stat_sina and dodged by default
    DEFAULT_PARAMS = {
        'stat': 'sina',
        'position': 'dodge',
        'na_rm': False,
    }
5 changes: 1 addition & 4 deletions plotnine/layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from .utils import array_kind, ninteraction
from .utils import check_required_aesthetics, defaults
from .aes import aes, get_calculated_aes, stat, make_labels
from .aes import strip_calculated_markers
from .aes import strip_calculated_markers, NO_GROUP

_TPL_EVAL_FAIL = """\
Could not evaluate the '{}' mapping: '{}' \
Expand Down Expand Up @@ -471,9 +471,6 @@ def finish_statistics(self):
self.stat.finish_layer(self.data, self.stat.params)


NO_GROUP = -1


def add_group(data):
if len(data) == 0:
return data
Expand Down
2 changes: 2 additions & 0 deletions plotnine/stats/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from .stat_qq import stat_qq
from .stat_qq_line import stat_qq_line
from .stat_quantile import stat_quantile
from .stat_sina import stat_sina
from .stat_smooth import stat_smooth
from .stat_sum import stat_sum
from .stat_summary import stat_summary
Expand All @@ -30,6 +31,7 @@
'stat_identity',
'stat_qq', 'stat_qq_line',
'stat_quantile',
'stat_sina',
'stat_smooth', 'stat_sum',
'stat_summary', 'stat_summary_bin',
'stat_unique', 'stat_ydensity']
222 changes: 222 additions & 0 deletions plotnine/stats/stat_sina.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
import numpy as np
import pandas as pd
from scipy.interpolate import interp1d

from ..aes import has_groups
from ..doctools import document
from ..exceptions import PlotnineError
from ..utils import array_kind, jitter, resolution
from .binning import breaks_from_bins, breaks_from_binwidth
from .stat import stat
from .stat_density import compute_density


@document
class stat_sina(stat):
    """
    Compute Sina plot values

    {usage}

    Parameters
    ----------
    {common_parameters}
    binwidth : float
        The width of the bins. The default is to use bins that
        cover the range of the data. You should always override this
        value, exploring multiple widths to find the best to
        illustrate the stories in your data.
    bins : int (default: 50)
        Number of bins. Overridden by binwidth.
    method : 'density' or 'counts'
        Choose the method to spread the samples within the same bin
        along the x-axis. ``'density'`` uses a kernel density
        estimate; any other value selects the bin based
        (``'counts'``) estimate.
    maxwidth : float
        Control the maximum width the points can spread into.
        Values should be in the range (0, 1).
    adjust : float, optional (default: 1)
        Adjusts the bandwidth of the density kernel when
        ``method='density'`` (see density).
    bw : str or float, optional (default: 'normal_reference')
        The bandwidth to use, If a float is given, it is the bandwidth.
        The :py:`str` choices are::

            'normal_reference'
            'scott'
            'silverman'

    bin_limit : int (default: 1)
        If the samples within the same y-axis bin are more
        than `bin_limit`, the samples' X coordinates will be adjusted.
        This parameter is effective only when :py:`method='counts'`
    random_state : int or ~numpy.random.RandomState, optional
        Seed or Random number generator to use. If ``None``, then
        numpy global generator :class:`numpy.random` is used.
    scale : str (default: area)
        How to scale the sina groups. The options are::

            'area'  # Scale by the largest density/bin among the
                    # different sinas
            'count' # areas are scaled proportionally to the number
                    # of points
            'width' # Only scale according to the maxwidth parameter.

    See Also
    --------
    plotnine.geoms.geom_sina
    """

    _aesthetics_doc = """
    {aesthetics_table}

    .. rubric:: Options for computed aesthetics

    ::

        'scaled'  # density relative to the maximum density of the group

    Calculated aesthetics are accessed using the `stat` function.
    e.g. :py:`'stat(scaled)'`.
    """

    REQUIRED_AES = {'x', 'y'}
    DEFAULT_AES = {'xend': 'stat(scaled)'}
    DEFAULT_PARAMS = {'geom': 'sina', 'position': 'dodge',
                      'na_rm': False, 'binwidth': None, 'bins': None,
                      'method': 'density',
                      'bw': 'normal_reference',
                      'maxwidth': None, 'adjust': 1, 'bin_limit': 1,
                      'random_state': None, 'scale': 'area'
                      }
    CREATES = {'scaled'}

    def setup_data(self, data):
        """
        Reject ungrouped data whose continuous x takes several values

        Raises
        ------
        TypeError
            If x is continuous, the data has no groups and x is not
            constant.
        """
        # Compare against the first x value by position; the index is
        # not guaranteed to contain the label 0.
        if (array_kind.continuous(data['x']) and
                not has_groups(data) and
                (data['x'] != data['x'].iloc[0]).any()):
            raise TypeError("Continuous x aesthetic -- did you forget "
                            "aes(group=...)?")
        return data

    def setup_params(self, data):
        """
        Fill in the parameters that depend on the data or have
        derived defaults

        Works on a copy of ``self.params`` and returns that copy.
        """
        params = self.params.copy()
        random_state = params['random_state']

        if params['maxwidth'] is None:
            params['maxwidth'] = resolution(data['x'], False) * 0.9

        # Neither way of specifying the bins was given
        if params['binwidth'] is None and params['bins'] is None:
            params['bins'] = 50

        # Normalise to an object with a ``uniform`` method. Anything
        # that is neither None nor an int is passed through unchanged.
        if random_state is None:
            params['random_state'] = np.random
        elif isinstance(random_state, int):
            params['random_state'] = np.random.RandomState(random_state)

        # Required by compute_density
        params['kernel'] = 'gau'  # It has to be a gaussian kernel
        params['cut'] = 0
        params['gridsize'] = None
        params['clip'] = (-np.inf, np.inf)
        params['n'] = 512
        return params

    @classmethod
    def compute_panel(cls, data, scales, **params):
        """
        Compute the sina values for all the groups in a panel

        Turns the ``bins``/``binwidth`` parameter into bin breaks over
        the y scale, delegates the computation to the parent class'
        ``compute_panel`` and then adds the columns ``sinawidth``,
        ``xmin``, ``xmax``, ``x_diff`` and ``width`` to the result.

        Raises
        ------
        PlotnineError
            If the ``scale`` parameter is not one of 'area', 'count'
            or 'width'.
        """
        maxwidth = params['maxwidth']
        random_state = params['random_state']

        # From here on params['bins'] holds the bin breaks, computed
        # over the range of the y scale shifted by 1e-8
        y_range = np.array(scales.y.dimension()) + 1e-8
        if params['binwidth'] is not None:
            params['bins'] = breaks_from_binwidth(
                y_range, params['binwidth'])
        else:
            params['bins'] = breaks_from_bins(y_range, params['bins'])

        # super(stat_sina, cls) resolves the parent relative to this
        # class and stays valid when cls is a subclass of stat_sina.
        data = super(stat_sina, cls).compute_panel(
            data, scales, **params)

        if not len(data):
            return data

        if params['scale'] == 'area':
            data['sinawidth'] = data['density']/data['density'].max()
        elif params['scale'] == 'count':
            data['sinawidth'] = (data['density'] /
                                 data['density'].max() *
                                 data['n']/data['n'].max())
        elif params['scale'] == 'width':
            data['sinawidth'] = data['scaled']
        else:
            msg = "Unknown scale value '{}'"
            raise PlotnineError(msg.format(params['scale']))

        data['xmin'] = data['x'] - maxwidth/2
        data['xmax'] = data['x'] + maxwidth/2
        # Random horizontal offset of each point, at most half of
        # maxwidth and proportional to the sinawidth at that point
        data['x_diff'] = (random_state.uniform(-1, 1, len(data)) *
                          maxwidth *
                          data['sinawidth']/2
                          )
        data['width'] = maxwidth

        # jitter y values if the input is integer
        if (data['y'] == np.floor(data['y'])).all():
            data['y'] = jitter(data['y'], random_state=random_state)

        return data

    @classmethod
    def compute_group(cls, data, scales, **params):
        """
        Compute the density at every point of a single group

        Adds the columns ``density``, ``scaled``, ``width`` and ``n``
        to the data, and sets ``x`` to the midpoint of the x values of
        the group. Empty data yields an empty dataframe.
        """
        maxwidth = params['maxwidth']
        bins = params['bins']
        bin_limit = params['bin_limit']
        weight = None

        if len(data) == 0:
            return pd.DataFrame()

        elif len(data) < 3:
            # Too few points for an estimate
            data['density'] = 0
            data['scaled'] = 1
        elif params['method'] == 'density':
            # density kernel estimation
            range_y = data['y'].min(), data['y'].max()
            dens = compute_density(data['y'], weight, range_y, **params)
            # Evaluate the estimated density at the observed y values
            densf = interp1d(dens['x'], dens['density'],
                             bounds_error=False, fill_value='extrapolate')
            data['density'] = densf(data['y'])
            data['scaled'] = data['density']/dens['density'].max()
        else:
            # bin based estimation
            bin_index = pd.cut(
                data['y'], bins, include_lowest=True, labels=False)
            # Each point gets the number of points sharing its bin
            data['density'] = (pd.Series(bin_index)
                               .groupby(bin_index)
                               .apply(len)[bin_index]
                               .values)
            data.loc[data['density'] <= bin_limit, 'density'] = 0
            data['scaled'] = data['density']/data['density'].max()

        x_min, x_max = data['x'].min(), data['x'].max()

        # Compute width if x has multiple values. The range is taken as
        # max - min since Series.ptp is not available in pandas >= 1.0.
        if len(data['x'].unique()) > 1:
            width = (x_max - x_min) * maxwidth
        else:
            width = maxwidth

        data['width'] = width
        data['n'] = len(data)
        data['x'] = np.mean([x_max, x_min])

        return data

    def finish_layer(self, data, params):
        """
        Apply the random horizontal offsets to the x positions
        """
        # Rescale x in case positions have been adjusted
        x_mod = (data['xmax'] - data['xmin']) / data['width']
        data['x'] = data['x'] + data['x_diff'] * x_mod
        return data
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 358cea0

Please sign in to comment.