Skip to content

Commit

Permalink
Weighted quantiles shared prework (~7.5 times speedup) (#22)
Browse files Browse the repository at this point in the history
  • Loading branch information
sbrugman authored and tomcis committed Jun 10, 2020
1 parent d6c1729 commit 51dea92
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 34 deletions.
45 changes: 23 additions & 22 deletions popmon/analysis/profiling/hist_profiler.py
Expand Up @@ -11,15 +11,9 @@
DEFAULT_STATS = {
"mean": pm_np.mean,
"std": pm_np.std,
"min": lambda x, w: pm_np.quantile(x, q=0.00, weights=w),
"max": lambda x, w: pm_np.quantile(x, q=1.00, weights=w),
"p01": lambda x, w: pm_np.quantile(x, q=0.01, weights=w),
"p05": lambda x, w: pm_np.quantile(x, q=0.05, weights=w),
"p16": lambda x, w: pm_np.quantile(x, q=0.16, weights=w),
"p50": lambda x, w: pm_np.quantile(x, q=0.50, weights=w),
"p84": lambda x, w: pm_np.quantile(x, q=0.84, weights=w),
"p95": lambda x, w: pm_np.quantile(x, q=0.95, weights=w),
"p99": lambda x, w: pm_np.quantile(x, q=0.99, weights=w),
"min,max,p01,p05,p16,p50,p84,p95,p99": lambda x, w: pm_np.quantile(
x, q=[0.0, 1.0, 0.01, 0.05, 0.16, 0.50, 0.84, 0.95, 0.99], weights=w
),
}
NUM_NS_DAY = 24 * 3600 * int(1e9)

Expand Down Expand Up @@ -78,7 +72,7 @@ def __init__(

self.stats_functions = stats_functions
if self.stats_functions is None:
self.stats_functions = dict(DEFAULT_STATS)
self.stats_functions = DEFAULT_STATS
self.logger.debug(
f"No stats function dict is provided. {self.stats_functions.keys()} is set as default"
)
Expand Down Expand Up @@ -113,13 +107,21 @@ def _profile_1d_histogram(self, name, hc):
profile["most_probable_value"] = mpv if not is_ts else pd.Timestamp(mpv)

if is_num and profile["filled"] > 0:
for f_name, func in self.stats_functions.items():
profile[f_name] = func(bin_labels, bin_counts)
for f_names, func in self.stats_functions.items():
names = f_names.split(",")
results = func(bin_labels, bin_counts)
if len(names) == 1:
results = [results]

if is_ts:
pf = profile[f_name]
profile[f_name] = (
pd.Timedelta(pf) if f_name == "std" else pd.Timestamp(pf)
)
results = [
pd.Timedelta(result)
if f_name == "std"
else pd.Timestamp(result)
for f_name, result in zip(name, results)
]

profile.update({k: v for k, v in zip(names, results)})
elif not is_num:
profile["fraction_true"] = pm_np.fraction_of_true(bin_labels, bin_counts)

Expand All @@ -143,14 +145,13 @@ def _profile_2d_histogram(self, name, hc):
try:
phi_k = phik.phik_from_hist2d(observed=grid)
# p, Z = significance.significance_from_hist2d(values=grid, significance_method='asymptotic')
profile = dict(phik=phi_k)
except AssertionError:
except ValueError:
self.logger.debug(
f"Not enough values in the 2d `{name}` time-split histogram to apply the phik test."
)
profile = dict(phik=np.nan)
phi_k = np.nan

return {"count": sume, **profile}
return {"count": sume, "phik": phi_k}

def _profile_hist(self, split, hist_name):
if len(split) == 0:
Expand All @@ -162,11 +163,11 @@ def _profile_hist(self, split, hist_name):
is_num = hist0.is_num

# these are the profiled quantities we will monitor
fields = list()
fields = []
if dimension == 1:
fields = list(self.general_stats_1d)
fields += (
[key for key, value in self.stats_functions.items()]
[v for key in self.stats_functions.keys() for v in key.split(",")]
if is_num
else list(self.category_stats_1d)
)
Expand Down
27 changes: 15 additions & 12 deletions popmon/stats/numpy.py
Expand Up @@ -115,7 +115,7 @@ def median(a, weights=None, axis=None, keepdims=False):
return quantile(a, q=0.5, weights=weights, axis=axis, keepdims=keepdims)


def quantile(a, q, weights=None, axis=None, keepdims=False):
def quantile(a, q, weights=None, axis=None, keepdims: bool = False):
"""
Compute the weighted quantiles along the specified axis
Expand All @@ -140,7 +140,7 @@ def quantile(a, q, weights=None, axis=None, keepdims=False):
"""
q = q if not hasattr(q, "__iter__") else q[0] if len(q) == 1 else tuple(q)
if weights is None:
return np.quantile(a, q, axis=axis, keepdims=keepdims, interpolation="linear")
return np.quantile(a, q, axis=axis, keepdims=keepdims)
elif axis is None:
raveled_data = np.ravel(a)
idx = np.argsort(raveled_data)
Expand All @@ -149,10 +149,10 @@ def quantile(a, q, weights=None, axis=None, keepdims=False):
Sn = np.cumsum(sorted_weights)
Pn = (Sn - 0.5 * sorted_weights) / Sn[-1]
y = np.interp(q, Pn, sorted_data)
if keepdims is True:
return y.reshape((*y.shape, *(1,) * np.ndim(a)))
else:
return y
if keepdims:
y = y.reshape((*y.shape, *(1,) * np.ndim(a)))

return y
else:
# Move the dimensions which are reduced to the back
axis = [axis] if not hasattr(axis, "__iter__") else axis
Expand All @@ -167,12 +167,15 @@ def quantile(a, q, weights=None, axis=None, keepdims=False):
w = np.moveaxis(weights, source=axis, destination=destination).reshape(shape)

# Determine the quantiles and reshape backwards
y = np.array([quantile(x, q, u, keepdims=False) for x, u in zip(a_shaped, w)]).T
shape = (
(*y.shape[:-1], *[1 if i in axis else x for i, x in enumerate(a.shape)])
if keepdims is True
else (*y.shape[:-1], *a_moved.shape[: -len(destination)])
)
y = np.array([quantile(x, q, u) for x, u in zip(a_shaped, w)]).T
if keepdims:
shape = (
*y.shape[:-1],
*[1 if i in axis else x for i, x in enumerate(a.shape)],
)
else:
shape = *y.shape[:-1], *a_moved.shape[: -len(destination)]

y = y.reshape(shape)
return y

Expand Down

0 comments on commit 51dea92

Please sign in to comment.