Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[query] add less biased unsmoothed pdf to ggplot #13608

Merged
merged 12 commits into from Sep 21, 2023
196 changes: 165 additions & 31 deletions hail/python/hail/ggplot/geoms.py
@@ -1,5 +1,6 @@
from typing import Dict, Any, Optional
import abc
import math
import numpy as np
import plotly.graph_objects as go

Expand Down Expand Up @@ -484,6 +485,102 @@ def geom_histogram(mapping=aes(), *, min_val=None, max_val=None, bins=None, fill
return GeomHistogram(mapping, min_val=min_val, max_val=max_val, bins=bins, fill=fill, color=color, alpha=alpha, position=position, size=size)


def _max_entropy_cdf(min_x, max_x, x, y, e):
ehigham marked this conversation as resolved.
Show resolved Hide resolved
def compare(x1, y1, x2, y2):
return x1 * y2 - x2 * y1

new_y = np.full_like(x, 0.0, dtype=np.float64)
keep = np.full_like(x, False, dtype=np.bool_)

fx = min_x # fixed x
fy = 0 # fixed y
li = 0 # index of lower slope
ui = 0 # index of upper slope
ldx = x[li] - fx
udx = x[ui] - fx
ldy = y[li + 1] - e - fy
udy = y[ui] + e - fy
j = 1
while ui < len(x) and li < len(x):
if j == len(x):
ub = 1
lb = 1
xj = max_x
else:
ub = y[j] + e
lb = y[j + 1] - e
xj = x[j]
dx = xj - fx
judy = ub - fy
jldy = lb - fy
if compare(ldx, ldy, dx, judy) < 0:
# line must bend down at j
fx = x[li]
fy = y[li + 1] - e
new_y[li] = fy
keep[li] = True
j = li + 1
if j >= len(x):
break
li = j
ldx = x[li] - fx
ldy = y[li + 1] - e - fy
ui = j
udx = x[ui] - fx
udy = y[ui] + e - fy
j += 1
continue
elif compare(udx, udy, dx, jldy) > 0:
# line must bend up at j
fx = x[ui]
fy = y[ui] + e
new_y[ui] = fy
keep[ui] = True
j = ui + 1
if j >= len(x):
break
li = j
ldx = x[li] - fx
ldy = y[li + 1] - e - fy
ui = j
udx = x[ui] - fx
udy = y[ui] + e - fy
j += 1
continue
if j >= len(x):
break
if compare(udx, udy, dx, judy) < 0:
ui = j
udx = x[ui] - fx
udy = y[ui] + e - fy
if compare(ldx, ldy, dx, jldy) > 0:
li = j
ldx = x[li] - fx
ldy = y[li + 1] - e - fy
j += 1
return new_y, keep


def _cdf_error(cdf, failure_prob):
s = 0
for i in range(len(cdf._compaction_counts)):
s += cdf._compaction_counts[i] << (2 * i)
s = s / (cdf.ranks[-1] ** 2)

if s == 0:
# no compactions ergo no error
return 0

def update_grid_size(p):
return 4 * math.sqrt(math.log(2 * p / failure_prob) / (2 * s))

p = 1 / failure_prob
for i in range(5):
p = update_grid_size(p)

return 1 / p + math.sqrt(math.log(2 * p / failure_prob) * s / 2)


class GeomDensity(Geom):
aes_to_arg = {
"fill": ("marker_color", "black"),
Expand All @@ -493,46 +590,79 @@ class GeomDensity(Geom):
"alpha": ("marker_opacity", None)
}

def __init__(self, aes, k=1000, smoothing=0.5, fill=None, color=None, alpha=None):
def __init__(self, aes, k=1000, smoothing=0.5, fill=None, color=None, alpha=None, smoothed=False):
super().__init__(aes)
self.k = k
self.smoothing = smoothing
self.fill = fill
self.color = color
self.alpha = alpha
self.smoothed = smoothed

def apply_to_fig(self, grouped_data, fig_so_far: go.Figure, precomputed, facet_row, facet_col, legend_cache, is_faceted: bool):
def plot_group(df, idx):
slope = 1.0 / (df.attrs['max'] - df.attrs['min'])
n = df.attrs['n']
min = df.attrs['min']
max = df.attrs['max']
values = df.value.to_numpy()
weights = df.weight.to_numpy()

def f(x, prev):
inv_scale = (np.sqrt(n * slope) / self.smoothing) * np.sqrt(prev / weights)
diff = x[:, np.newaxis] - values
grid = (3 / (4 * n)) * weights * np.maximum(0, inv_scale - np.power(diff, 2) * np.power(inv_scale, 3))
return np.sum(grid, axis=1)

round1 = f(values, np.full(len(values), slope))
x_d = np.linspace(min, max, 1000)
final = f(x_d, round1)
data = df.attrs['data']

if self.smoothed:
n = data['ranks'][-1]
weights = np.diff(data['ranks'][1:-1])
min = data['values'][0]
max = data['values'][-1]
values = np.array(data['values'][1:-1])
slope = 1 / (max - min)

def f(x, prev):
ehigham marked this conversation as resolved.
Show resolved Hide resolved
inv_scale = (np.sqrt(n * slope) / self.smoothing) * np.sqrt(prev / weights)
diff = x[:, np.newaxis] - values
grid = (3 / (4 * n)) * weights * np.maximum(0, inv_scale - np.power(diff, 2) * np.power(inv_scale, 3))
return np.sum(grid, axis=1)

round1 = f(values, np.full(len(values), slope))
x_d = np.linspace(min, max, 1000)
final = f(x_d, round1)

trace_args = {
"x": x_d,
"y": final,
"mode": "lines",
"fill": "tozeroy",
"row": facet_row,
"col": facet_col
}

trace_args = {
"x": x_d,
"y": final,
"mode": "lines",
"fill": "tozeroy",
"row": facet_row,
"col": facet_col
}
self._add_aesthetics_to_trace_args(trace_args, df)
self._update_legend_trace_args(trace_args, legend_cache)

self._add_aesthetics_to_trace_args(trace_args, df)
self._update_legend_trace_args(trace_args, legend_cache)
fig_so_far.add_scatter(**trace_args)
else:
confidence = 5

fig_so_far.add_scatter(**trace_args)
y = np.array(data['ranks'][1:-1]) / data['ranks'][-1]
x = np.array(data['values'][1:-1])
min_x = data['values'][0]
max_x = data['values'][-1]
err = _cdf_error(data, 10 ** (-confidence))

new_y, keep = _max_entropy_cdf(min_x, max_x, x, y, err)
slopes = np.diff([0, *new_y[keep], 1]) / np.diff([min_x, *x[keep], max_x])

left = np.concatenate([[min_x], x[keep]])
right = np.concatenate([x[keep], [max_x]])
widths = right - left

trace_args = {
"x": [min_x, *x[keep]],
"y": slopes,
"row": facet_row,
"col": facet_col,
"width": widths,
"offset": 0
}

self._add_aesthetics_to_trace_args(trace_args, df)
self._update_legend_trace_args(trace_args, legend_cache)

fig_so_far.add_bar(**trace_args)

for idx, group_df in enumerate(grouped_data):
plot_group(group_df, idx)
Expand All @@ -541,7 +671,7 @@ def get_stat(self):
return StatCDF(self.k)


def geom_density(mapping=aes(), *, k=1000, smoothing=0.5, fill=None, color=None, alpha=None):
def geom_density(mapping=aes(), *, k=1000, smoothing=0.5, fill=None, color=None, alpha=None, smoothed=False):
"""Creates a smoothed density plot.

This method uses the `hl.agg.approx_cdf` aggregator to compute a sketch
Expand All @@ -568,13 +698,17 @@ def geom_density(mapping=aes(), *, k=1000, smoothing=0.5, fill=None, color=None,
A single line color for all density plots, overrides ``color`` aesthetic.
alpha: `float`
A measure of transparency between 0 and 1.
smoothed: `boolean`
If true, attempts to fit a smooth kernel density estimator.
If false, uses a custom method do generate a variable width histogram
directly from the approx_cdf results.

Returns
Returns
-------
:class:`FigureAttribute`
The geom to be applied.
"""
return GeomDensity(mapping, k, smoothing, fill, color, alpha)
return GeomDensity(mapping, k, smoothing, fill, color, alpha, smoothed)


class GeomHLine(Geom):
Expand Down
10 changes: 2 additions & 8 deletions hail/python/hail/ggplot/stats.py
Expand Up @@ -5,7 +5,6 @@
from pandas import DataFrame

import pandas as pd
import numpy as np

import hail as hl
from hail.utils.java import warning
Expand Down Expand Up @@ -150,14 +149,9 @@ def listify(self, agg_result) -> List[DataFrame]:
result = []

for grouped_struct, data in agg_result.items():
n = data['ranks'][-1]
weights = np.diff(data['ranks'][1:-1])
min = data['values'][0]
max = data['values'][-1]
values = np.array(data['values'][1:-1])
df = pd.DataFrame({'value': values, 'weight': weights})
df = pd.DataFrame()
df.attrs.update(**grouped_struct)
df.attrs.update({'min': min, 'max': max, 'n': n})
df.attrs.update({"data": data})

result.append(df)
return result
2 changes: 1 addition & 1 deletion hail/python/hail/plot/plots.py
Expand Up @@ -223,7 +223,7 @@ def _max_entropy_cdf(min_x, max_x, x, y, e):
def compare(x1, y1, x2, y2):
return x1 * y2 - x2 * y1

new_y = np.full_like(x, 0)
new_y = np.full_like(x, 0.0, dtype=np.float64)
keep = np.full_like(x, False, dtype=np.bool_)

fx = min_x # fixed x
Expand Down