Add geom_sina and stat_sina

closes #221
has2k1 · Apr 22, 2019 · 358cea0 · 358cea0
1 parent 3ee8e8e
commit 358cea0
Show file tree

Hide file tree

Showing 13 changed files with 350 additions and 5 deletions.
diff --git a/doc/api.rst b/doc/api.rst
@@ -80,6 +80,7 @@ distinct visual aspects of the representation are controlled by the
    geom_ribbon
    geom_rug
    geom_segment
+   geom_sina
    geom_smooth
    geom_spoke
    geom_step
@@ -152,6 +153,7 @@ with a ``geom`` that can represent all or some of the computations.
    stat_qq
    stat_qq_line
    stat_quantile
+   stat_sina
    stat_smooth
    stat_sum
    stat_summary

diff --git a/doc/changelog.rst b/doc/changelog.rst
@@ -18,6 +18,7 @@ New Features
 - :class:`~plotnine.geoms.geom_text` gained the ``adjust_text`` parameter,
   and can now repel text.
 - Added :class:`~plotnine.annotate.annotation_logticks`.
+- Added :class:`~plotnine.geoms.geom_sina`.
 
 Bug Fixes
 *********

diff --git a/plotnine/aes.py b/plotnine/aes.py
@@ -17,6 +17,8 @@
 }
 
 
+NO_GROUP = -1
+
 # Calculated aesthetics searchers
 STAT_RE = re.compile(r'\bstat\(')
 DOTS_RE = re.compile(r'\.\.([a-zA-Z0-9_]+)\.\.')
@@ -424,3 +426,22 @@ def is_valid_aesthetic(value, ae):
     # for special cases to be discovered and then coded
     # for appropriately.
     return False
+
+
+def has_groups(data):
+    """
+    Check if data is grouped
+
+    Parameters
+    ----------
+    data : dataframe
+        Data. It must have a ``group`` column and at least one row.
+
+    Returns
+    -------
+    out : bool
+        If True, the data has groups.
+    """
+    # If any row in the group column is equal to NO_GROUP, then
+    # all of them are and the data has no groups. The first row is
+    # looked up by position and not by label, since the index of
+    # the dataframe is not guaranteed to contain the label 0.
+    return data['group'].iloc[0] != NO_GROUP
diff --git a/plotnine/geoms/__init__.py b/plotnine/geoms/__init__.py
@@ -36,6 +36,7 @@
 from .geom_ribbon import geom_ribbon
 from .geom_rug import geom_rug
 from .geom_segment import geom_segment
+from .geom_sina import geom_sina
 from .geom_smooth import geom_smooth
 from .geom_spoke import geom_spoke
 from .geom_step import geom_step
@@ -61,7 +62,7 @@
            'geom_pointrange', 'geom_pointrange',
            'geom_quantile', 'geom_qq', 'geom_qq_line',
            'geom_polygon', 'geom_rect',
-           'geom_ribbon', 'geom_rug', 'geom_segment',
+           'geom_ribbon', 'geom_rug', 'geom_segment', 'geom_sina',
            'geom_smooth', 'geom_spoke', 'geom_step', 'geom_text',
            'geom_tile', 'geom_violin', 'geom_vline',
            # other

diff --git a/plotnine/geoms/geom_sina.py b/plotnine/geoms/geom_sina.py
@@ -0,0 +1,34 @@
+from ..doctools import document
+from .geom_point import geom_point
+
+
+@document
+class geom_sina(geom_point):
+    """
+    Draw a sina plot
+
+    The sina plot is an enhanced jitter strip chart for showing a
+    single variable across the classes of a multiclass dataset.
+    Within each class, the width of the jitter follows the density
+    distribution of the data.
+
+    {usage}
+
+    Parameters
+    ----------
+    {common_parameters}
+
+    See Also
+    --------
+    plotnine.stats.stat_sina
+
+    References
+    ----------
+    Sidiropoulos, N., S. H. Sohi, T. L. Pedersen, B. T. Porse, O. Winther,
+    N. Rapin, and F. O. Bagger. 2018.
+    "SinaPlot: An Enhanced Chart for Simple and Truthful Representation of
+    Single Observations over Multiple Classes."
+    J. Comp. Graph. Stat 27: 673–76.
+    """
+    # The points are positioned by stat_sina and dodged by default
+    DEFAULT_PARAMS = {
+        'stat': 'sina',
+        'position': 'dodge',
+        'na_rm': False,
+    }
diff --git a/plotnine/layer.py b/plotnine/layer.py
@@ -11,7 +11,7 @@
 from .utils import array_kind, ninteraction
 from .utils import check_required_aesthetics, defaults
 from .aes import aes, get_calculated_aes, stat, make_labels
-from .aes import strip_calculated_markers
+from .aes import strip_calculated_markers, NO_GROUP
 
 _TPL_EVAL_FAIL = """\
 Could not evaluate the '{}' mapping: '{}' \
@@ -471,9 +471,6 @@ def finish_statistics(self):
         self.stat.finish_layer(self.data, self.stat.params)
 
 
-NO_GROUP = -1
-
-
 def add_group(data):
     if len(data) == 0:
         return data

diff --git a/plotnine/stats/__init__.py b/plotnine/stats/__init__.py
@@ -13,6 +13,7 @@
 from .stat_qq import stat_qq
 from .stat_qq_line import stat_qq_line
 from .stat_quantile import stat_quantile
+from .stat_sina import stat_sina
 from .stat_smooth import stat_smooth
 from .stat_sum import stat_sum
 from .stat_summary import stat_summary
@@ -30,6 +31,7 @@
            'stat_identity',
            'stat_qq', 'stat_qq_line',
            'stat_quantile',
+           'stat_sina',
            'stat_smooth', 'stat_sum',
            'stat_summary', 'stat_summary_bin',
            'stat_unique', 'stat_ydensity']
diff --git a/plotnine/stats/stat_sina.py b/plotnine/stats/stat_sina.py
@@ -0,0 +1,222 @@
+import numpy as np
+import pandas as pd
+from scipy.interpolate import interp1d
+
+from ..aes import has_groups
+from ..doctools import document
+from ..exceptions import PlotnineError
+from ..utils import array_kind, jitter, resolution
+from .binning import breaks_from_bins, breaks_from_binwidth
+from .stat import stat
+from .stat_density import compute_density
+
+
+@document
+class stat_sina(stat):
+    """
+    Compute Sina plot values
+
+    {usage}
+
+    Parameters
+    ----------
+    {common_parameters}
+    binwidth : float
+        The width of the bins. The default is to use bins that
+        cover the range of the data. You should always override this
+        value, exploring multiple widths to find the best to
+        illustrate the stories in your data.
+    bins : int (default: 50)
+        Number of bins. Overridden by binwidth.
+    method : 'density' or 'counts'
+        Choose the method to spread the samples within the same bin
+        along the x-axis. ``'density'`` uses a kernel density
+        estimate. Any other value uses the number of samples in
+        each y-axis bin, so the name must be spelt out in full.
+    maxwidth : float
+        Control the maximum width the points can spread into.
+        Values should be in the range (0, 1).
+    adjust : float, optional (default: 1)
+        Adjusts the bandwidth of the density kernel when
+        ``method='density'`` (see density).
+    bw : str or float, optional (default: 'normal_reference')
+        The bandwidth to use. If a float is given, it is the bandwidth.
+        The :py:`str` choices are::
+
+            'normal_reference'
+            'scott'
+            'silverman'
+
+    bin_limit : int (default: 1)
+        If the samples within the same y-axis bin are more
+        than `bin_limit`, the samples' x coordinates will be adjusted.
+        This parameter is effective only when :py:`method='counts'`
+    random_state : int or ~numpy.random.RandomState, optional
+        Seed or Random number generator to use. If ``None``, then
+        numpy global generator :class:`numpy.random` is used.
+    scale : str (default: area)
+        How to scale the sina groups. The options are::
+
+            'area'   # Scale by the largest density/bin among the different
+                     # sinas
+
+            'count'  # areas are scaled proportionally to the number of points
+
+            'width'  # Only scale according to the maxwidth parameter.
+
+    See Also
+    --------
+    plotnine.geoms.geom_sina
+    """
+
+    _aesthetics_doc = """
+    {aesthetics_table}
+
+    .. rubric:: Options for computed aesthetics
+
+    ::
+
+         'scaled'   # density relative to the largest density in the group
+         'density'  # density estimate (or bin count) at each point
+         'n'        # number of points in the group
+
+    Calculated aesthetics are accessed using the `stat` function.
+    e.g. :py:`'stat(scaled)'`.
+    """
+
+    REQUIRED_AES = {'x', 'y'}
+    DEFAULT_AES = {'xend': 'stat(scaled)'}
+    DEFAULT_PARAMS = {'geom': 'sina', 'position': 'dodge',
+                      'na_rm': False, 'binwidth': None, 'bins': None,
+                      'method': 'density',
+                      'bw': 'normal_reference',
+                      'maxwidth': None, 'adjust': 1, 'bin_limit': 1,
+                      'random_state': None, 'scale': 'area'
+                      }
+    CREATES = {'scaled'}
+
+    def setup_data(self, data):
+        """
+        Check that a continuous x aesthetic comes with a grouping
+
+        Raises
+        ------
+        TypeError
+            If x is continuous, the data has no groups and x takes
+            more than one value.
+        """
+        # The first x value is looked up by position; the index of
+        # the dataframe need not contain the label 0.
+        if (array_kind.continuous(data['x']) and
+                not has_groups(data) and
+                (data['x'] != data['x'].iloc[0]).any()):
+            raise TypeError("Continuous x aesthetic -- did you forget "
+                            "aes(group=...)?")
+        return data
+
+    def setup_params(self, data):
+        """
+        Fill in the parameters that were left as None
+
+        Works on a copy of ``self.params`` and also adds the fixed
+        parameters required by ``compute_density``.
+        """
+        params = self.params.copy()
+        random_state = params['random_state']
+
+        if params['maxwidth'] is None:
+            params['maxwidth'] = resolution(data['x'], False) * 0.9
+
+        # Fall back to a fixed number of bins only when the user
+        # specified neither a binwidth nor a number of bins
+        if params['binwidth'] is None and params['bins'] is None:
+            params['bins'] = 50
+
+        if random_state is None:
+            params['random_state'] = np.random
+        elif isinstance(random_state, int):
+            params['random_state'] = np.random.RandomState(random_state)
+
+        # Required by compute_density
+        params['kernel'] = 'gau'  # It has to be a gaussian kernel
+        params['cut'] = 0
+        params['gridsize'] = None
+        params['clip'] = (-np.inf, np.inf)
+        params['n'] = 512
+        return params
+
+    @classmethod
+    def compute_panel(cls, data, scales, **params):
+        """
+        Compute the sina values for a panel
+
+        The bin breaks are computed from the dimension of the y scale
+        and passed on to the per group computation. The results are
+        then scaled across the groups as chosen by the ``scale``
+        parameter, and a random horizontal offset (``x_diff``) is
+        generated for every point.
+
+        Raises
+        ------
+        PlotnineError
+            If the ``scale`` parameter has an unknown value.
+        """
+        maxwidth = params['maxwidth']
+        random_state = params['random_state']
+
+        # From here on params['bins'] holds the bin breaks
+        y_range = np.array(scales.y.dimension()) + 1e-8
+        if params['binwidth'] is not None:
+            params['bins'] = breaks_from_binwidth(
+                y_range, params['binwidth'])
+        else:
+            params['bins'] = breaks_from_bins(y_range, params['bins'])
+
+        # super(stat_sina, cls) also resolves correctly when this
+        # method is invoked on a subclass of stat_sina
+        data = super(stat_sina, cls).compute_panel(data, scales, **params)
+
+        if not len(data):
+            return data
+
+        if params['scale'] == 'area':
+            data['sinawidth'] = data['density']/data['density'].max()
+        elif params['scale'] == 'count':
+            data['sinawidth'] = (data['density'] /
+                                 data['density'].max() *
+                                 data['n']/data['n'].max())
+        elif params['scale'] == 'width':
+            data['sinawidth'] = data['scaled']
+        else:
+            msg = "Unknown scale value '{}'"
+            raise PlotnineError(msg.format(params['scale']))
+
+        data['xmin'] = data['x'] - maxwidth/2
+        data['xmax'] = data['x'] + maxwidth/2
+        # Random offset from the centre, limited by the sina width
+        data['x_diff'] = (random_state.uniform(-1, 1, len(data)) *
+                          maxwidth *
+                          data['sinawidth']/2
+                          )
+        data['width'] = maxwidth
+
+        # jitter y values if all of them are whole numbers
+        if (data['y'] == np.floor(data['y'])).all():
+            data['y'] = jitter(data['y'], random_state=random_state)
+
+        return data
+
+    @classmethod
+    def compute_group(cls, data, scales, **params):
+        """
+        Compute the density at every point of a single group
+
+        Adds the ``density``, ``scaled``, ``width`` and ``n`` columns
+        to the data and sets ``x`` to the centre of the group.
+        """
+        maxwidth = params['maxwidth']
+        bins = params['bins']
+        bin_limit = params['bin_limit']
+        weight = None
+
+        if len(data) == 0:
+            return pd.DataFrame()
+
+        elif len(data) < 3:
+            # No density is estimated for fewer than 3 points
+            data['density'] = 0
+            data['scaled'] = 1
+        elif params['method'] == 'density':
+            # density kernel estimation
+            range_y = data['y'].min(), data['y'].max()
+            dens = compute_density(data['y'], weight, range_y, **params)
+            # Interpolate the estimate at the y value of each point
+            densf = interp1d(dens['x'], dens['density'],
+                             bounds_error=False, fill_value='extrapolate')
+            data['density'] = densf(data['y'])
+            data['scaled'] = data['density']/dens['density'].max()
+        else:
+            # bin based estimation
+            bin_index = pd.cut(
+                data['y'], bins, include_lowest=True, labels=False)
+            # For every point, the number of points in its bin
+            data['density'] = (pd.Series(bin_index)
+                               .groupby(bin_index)
+                               .apply(len)[bin_index]
+                               .values)
+            data.loc[data['density'] <= bin_limit, 'density'] = 0
+            data['scaled'] = data['density']/data['density'].max()
+
+        # Compute width if x has multiple values
+        x_min, x_max = data['x'].min(), data['x'].max()
+        if len(data['x'].unique()) > 1:
+            # max - min is used in place of Series.ptp, which is no
+            # longer available in recent versions of pandas
+            width = (x_max - x_min) * maxwidth
+        else:
+            width = maxwidth
+
+        data['width'] = width
+        data['n'] = len(data)
+        data['x'] = np.mean([x_max, x_min])
+
+        return data
+
+    def finish_layer(self, data, params):
+        """
+        Move the points away from the centre of their sina
+
+        The random offset ``x_diff`` is multiplied by the ratio of
+        the current ``xmax - xmin`` span to ``width`` before it is
+        added to ``x``.
+        """
+        # Rescale x in case positions have been adjusted
+        x_mod = (data['xmax'] - data['xmin']) / data['width']
+        data['x'] = data['x'] + data['x_diff'] * x_mod
+        return data
diff --git a/plotnine/tests/baseline_images/test_geom_sina/method_counts.png b/plotnine/tests/baseline_images/test_geom_sina/method_counts.png
diff --git a/plotnine/tests/baseline_images/test_geom_sina/scale_area+coord_flip.png b/plotnine/tests/baseline_images/test_geom_sina/scale_area+coord_flip.png
diff --git a/plotnine/tests/baseline_images/test_geom_sina/scale_area.png b/plotnine/tests/baseline_images/test_geom_sina/scale_area.png
diff --git a/plotnine/tests/baseline_images/test_geom_sina/scale_count.png b/plotnine/tests/baseline_images/test_geom_sina/scale_count.png