Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding 2d histogram function #5249

Merged
merged 21 commits into from Feb 8, 2019
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions hail/python/hail/docs/plot.rst
Expand Up @@ -21,12 +21,14 @@ Plot functions in Hail accept data in the form of either Python objects or :clas

histogram
cumulative_histogram
histogram2d
scatter
qq
manhattan

.. autofunction:: histogram
.. autofunction:: cumulative_histogram
.. autofunction:: histogram2d
.. autofunction:: scatter
.. autofunction:: qq
.. autofunction:: manhattan
26 changes: 19 additions & 7 deletions hail/python/hail/docs/tutorials/plotting.ipynb
Expand Up @@ -171,6 +171,25 @@
"show(gridplot([p, p2], ncols=2, plot_width=400, plot_height=400))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2-D histogram\n",
"\n",
"For visualizing relationships between variables in large datasets (where scatter plots may be less informative since they highlight outliers), the `histogram_2d()` function will create a heatmap with the number of observations in each section of a 2-d grid based on two variables."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"p = hl.plot.histogram2d(pca_scores.scores[0], pca_scores.scores[1])\n",
"show(p)"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -228,13 +247,6 @@
"p = hl.plot.manhattan(gwas.p_value, hover_fields=hover_fields, collect_all=True)\n",
"show(p)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
3 changes: 2 additions & 1 deletion hail/python/hail/plot/__init__.py
@@ -1,9 +1,10 @@
from .plots import output_notebook, show, histogram, cumulative_histogram, scatter, qq, manhattan
from .plots import output_notebook, show, histogram, cumulative_histogram, histogram2d, scatter, qq, manhattan

__all__ = ['output_notebook',
'show',
'histogram',
'cumulative_histogram',
'scatter',
'histogram2d',
'qq',
'manhattan']
163 changes: 163 additions & 0 deletions hail/python/hail/plot/plots.py
@@ -1,6 +1,7 @@
from math import log, isnan, log10

import numpy as np
import bokeh
import bokeh.io
from bokeh.models import *
from bokeh.plotting import figure
Expand Down Expand Up @@ -146,6 +147,168 @@ def cumulative_histogram(data, range=None, bins=50, legend=None, title=None, nor
return p


@typecheck(x=expr_numeric, y=expr_numeric, bins=oneof(int, sequenceof(int)),
range=nullable(sized_tupleof(nullable(sized_tupleof(numeric, numeric)),
nullable(sized_tupleof(numeric, numeric)))),
title=nullable(str), width=int, height=int,
font_size=str, colors=sequenceof(str))
def histogram2d(x, y, bins=40, range=None,
title=None, width=600, height=600, font_size='7pt',
colors=bokeh.palettes.all_palettes['Blues'][7][::-1]):
"""Plot a two-dimensional histogram.

``x`` and ``y`` must both be a :class:`NumericExpression` from the same :class:`Table`.

If ``x_range`` or ``y_range`` are not provided, the function will do a pass through the data to determine
min and max of each variable.

Examples
--------

>>> ht = hail.utils.range_table(1000).annotate(x=hail.rand_norm(), y=hail.rand_norm())
>>> p_hist = hail.plot.histogram2d(ht.x, ht.y)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add Examples section illustrating how the parameters are used

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We usually give examples in a section that starts with:

Examples
---------

e.g.
https://hail.is/docs/0.2/methods/impex.html#hail.methods.import_bgen

>>> ht = hail.utils.range_table(1000).annotate(x=hail.rand_norm(), y=hail.rand_norm())
>>> p_hist = hail.plot.histogram2d(ht.x, ht.y, bins=10, range=[[0, 1], None])

Parameters
----------
x : :class:`.NumericExpression`
Expression for x-axis (from a Hail table).
y : :class:`.NumericExpression`
Expression for y-axis (from the same Hail table as ``x``).
bins : int or [int, int]
The bin specification:
- If int, the number of bins for the two dimensions (nx = ny = bins).
- If [int, int], the number of bins in each dimension (nx, ny = bins).
The default value is 40.
range : None or [[float, float], [float, float]]
The leftmost and rightmost edges of the bins along each dimension:
[[xmin, xmax], [ymin, ymax]]. All values outside of this range will be considered outliers
and not tallied in the histogram. If this value is None, or either of the inner lists is None,
the range will be computed from the data.
width : int
Plot width (default 600px).
height : int
Plot height (default 600px).
title : str
Title of the plot.
font_size : str
String of font size in points (default '7pt').
colors : List[str]
List of colors (hex codes, or strings as described
`here <https://bokeh.pydata.org/en/latest/docs/reference/colors.html>`__). Compatible with one of the many
built-in palettes available `here <https://bokeh.pydata.org/en/latest/docs/reference/palettes.html>`__.

Returns
-------
:class:`bokeh.plotting.figure.Figure`
"""
source = x._indices.source
y_source = y._indices.source

if source is None or y_source is None:
raise ValueError("histogram_2d expects two expressions of 'Table', found scalar expression")
if isinstance(source, hail.MatrixTable):
raise ValueError("histogram_2d requires source to be Table, not MatrixTable")
if source != y_source:
raise ValueError(f"histogram_2d expects two expressions from the same 'Table', found {source} and {y_source}")
check_row_indexed('histogram_2d', x)
check_row_indexed('histogram_2d', y)
if isinstance(bins, int):
x_bins = y_bins = bins
else:
x_bins, y_bins = bins
if range is None:
x_range = y_range = None
else:
x_range, y_range = range
if x_range is None or y_range is None:
warnings.warn('At least one range was not defined in histogram_2d. Doing two passes...')
ranges = source.aggregate(hail.struct(x_stats=hail.agg.stats(x),
y_stats=hail.agg.stats(y)))
if x_range is None:
x_range = (ranges.x_stats.min, ranges.x_stats.max)
if y_range is None:
y_range = (ranges.y_stats.min, ranges.y_stats.max)
else:
warnings.warn('If x_range or y_range are specified in histogram_2d, and there are points '
'outside of these ranges, they will not be plotted')
x_range = list(map(float, x_range))
y_range = list(map(float, y_range))
x_spacing = (x_range[1] - x_range[0]) / x_bins
y_spacing = (y_range[1] - y_range[0]) / y_bins

def frange(start, stop, step):
from itertools import count, takewhile
return takewhile(lambda x: x <= stop, count(start, step))

x_levels = hail.literal(list(frange(x_range[0], x_range[1], x_spacing))[::-1])
y_levels = hail.literal(list(frange(y_range[0], y_range[1], y_spacing))[::-1])

grouped_ht = source.group_by(
x=hail.str(x_levels.find(lambda w: x >= w)),
y=hail.str(y_levels.find(lambda w: y >= w))
).aggregate(c=hail.agg.count())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this the right indentation?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My PyCharm claims it is but happy to change if there's another preferred one

data = grouped_ht.filter(hail.is_defined(grouped_ht.x) & (grouped_ht.x != str(x_range[1])) &
hail.is_defined(grouped_ht.y) & (grouped_ht.y != str(y_range[1]))).to_pandas()

mapper = LinearColorMapper(palette=colors, low=data.c.min(), high=data.c.max())

x_axis = sorted(set(data.x), key=lambda z: float(z))
y_axis = sorted(set(data.y), key=lambda z: float(z))
p = figure(title=title,
x_range=x_axis, y_range=y_axis,
x_axis_location="above", plot_width=width, plot_height=height,
tools="hover,save,pan,box_zoom,reset,wheel_zoom", toolbar_location='below')

p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_standoff = 0
p.axis.major_label_text_font_size = font_size
import math
p.xaxis.major_label_orientation = math.pi / 3

p.rect(x='x', y='y', width=1, height=1,
source=data,
fill_color={'field': 'c', 'transform': mapper},
line_color=None)

color_bar = ColorBar(color_mapper=mapper, major_label_text_font_size=font_size,
ticker=BasicTicker(desired_num_ticks=6),
label_standoff=6, border_line_color=None, location=(0, 0))
p.add_layout(color_bar, 'right')

def set_font_size(p, font_size: str = '12pt'):
"""Set most of the font sizes in a bokeh figure

Parameters
----------
p : :class:`bokeh.plotting.figure.Figure`
Input figure.
font_size : str
String of font size in points (e.g. '12pt').

Returns
-------
:class:`bokeh.plotting.figure.Figure`
"""
p.title.text_font_size = font_size
p.legend.label_text_font_size = font_size
p.xaxis.axis_label_text_font_size = font_size
p.yaxis.axis_label_text_font_size = font_size
p.xaxis.major_label_text_font_size = font_size
p.yaxis.major_label_text_font_size = font_size
if hasattr(p.xaxis, 'group_text_font_size'):
p.xaxis.group_text_font_size = font_size
return p

p.select_one(HoverTool).tooltips = [('x', '@x'), ('y', '@y',), ('count', '@c')]
p = set_font_size(p, font_size)
return p


@typecheck(x=oneof(sequenceof(numeric), expr_float64), y=oneof(sequenceof(numeric), expr_float64),
label=oneof(nullable(str), expr_str, sequenceof(str)), title=nullable(str),
xlabel=nullable(str), ylabel=nullable(str), size=int, legend=bool,
Expand Down