Skip to content

Commit

Permalink
Weighted quantiles: share the prework by computing all quantiles in a single call (~7.5× speedup)
Browse files Browse the repository at this point in the history
  • Loading branch information
sbrugman committed May 29, 2020
1 parent 5058746 commit 9b64704
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 31 deletions.
41 changes: 19 additions & 22 deletions popmon/analysis/profiling/hist_profiler.py
Expand Up @@ -7,17 +7,6 @@
import popmon.stats.numpy as pm_np


DEFAULT_STATS = {"mean": pm_np.mean,
"std": pm_np.std,
"min": lambda x, w: pm_np.quantile(x, q=0.00, weights=w),
"max": lambda x, w: pm_np.quantile(x, q=1.00, weights=w),
"p01": lambda x, w: pm_np.quantile(x, q=0.01, weights=w),
"p05": lambda x, w: pm_np.quantile(x, q=0.05, weights=w),
"p16": lambda x, w: pm_np.quantile(x, q=0.16, weights=w),
"p50": lambda x, w: pm_np.quantile(x, q=0.50, weights=w),
"p84": lambda x, w: pm_np.quantile(x, q=0.84, weights=w),
"p95": lambda x, w: pm_np.quantile(x, q=0.95, weights=w),
"p99": lambda x, w: pm_np.quantile(x, q=0.99, weights=w)}
NUM_NS_DAY = 24 * 3600 * int(1e9)


Expand Down Expand Up @@ -57,7 +46,11 @@ def __init__(self, read_key, store_key, features=None, ignore_features=None, var

self.stats_functions = stats_functions
if self.stats_functions is None:
self.stats_functions = dict(DEFAULT_STATS)
self.stats_functions = {
"mean": pm_np.mean,
"std": pm_np.std,
"min,max,p01,p05,p16,p50,p84,p95,p99": lambda x, w: pm_np.quantile(x, q=[0.0, 1.0, 0.01, 0.05, 0.16, 0.50, 0.84, 0.95, 0.99], weights=w),
}
self.logger.debug(f"No stats function dict is provided. {self.stats_functions.keys()} is set as default")

def _profile_1d_histogram(self, name, hc):
Expand Down Expand Up @@ -86,11 +79,16 @@ def _profile_1d_histogram(self, name, hc):
profile["most_probable_value"] = mpv if not is_ts else pd.Timestamp(mpv)

if is_num and profile["filled"] > 0:
for f_name, func in self.stats_functions.items():
profile[f_name] = func(bin_labels, bin_counts)
for f_names, func in self.stats_functions.items():
names = f_names.split(",")
results = func(bin_labels, bin_counts)
if len(names) == 1:
results = [results]

if is_ts:
pf = profile[f_name]
profile[f_name] = pd.Timedelta(pf) if f_name == "std" else pd.Timestamp(pf)
# Pair every result with its own statistic name from `names` (the comma-split stats key),
# not with the elements of the method's `name` argument: zipping over `name` never yields
# "std" as a statistic name and truncates the results to len(name).
# "std" is a spread and therefore a Timedelta; every other statistic is a Timestamp.
results = [pd.Timedelta(result) if f_name == "std" else pd.Timestamp(result) for f_name, result in zip(names, results)]

profile.update({k: v for k, v in zip(names, results)})
elif not is_num:
profile['fraction_true'] = pm_np.fraction_of_true(bin_labels, bin_counts)

Expand All @@ -112,12 +110,11 @@ def _profile_2d_histogram(self, name, hc):
try:
phi_k = phik.phik_from_hist2d(observed=grid)
# p, Z = significance.significance_from_hist2d(values=grid, significance_method='asymptotic')
profile = dict(phik=phi_k)
except AssertionError:
except ValueError:
self.logger.debug(f'Not enough values in the 2d `{name}` time-split histogram to apply the phik test.')
profile = dict(phik=np.nan)
phi_k = np.nan

return {'count': sume, **profile}
return {'count': sume, 'phik': phi_k}

def _profile_hist(self, split, hist_name):
if len(split) == 0:
Expand All @@ -129,10 +126,10 @@ def _profile_hist(self, split, hist_name):
is_num = hist0.is_num

# these are the profiled quantities we will monitor
fields = list()
fields = []
if dimension == 1:
fields = list(self.general_stats_1d)
fields += [key for key, value in self.stats_functions.items()] if is_num else list(self.category_stats_1d)
fields += [v for key in self.stats_functions.keys() for v in key.split(",")] if is_num else list(self.category_stats_1d)
elif dimension == 2:
fields = list(self.general_stats_2d)

Expand Down
21 changes: 12 additions & 9 deletions popmon/stats/numpy.py
Expand Up @@ -105,7 +105,7 @@ def median(a, weights=None, axis=None, keepdims=False):
return quantile(a, q=0.5, weights=weights, axis=axis, keepdims=keepdims)


def quantile(a, q, weights=None, axis=None, keepdims=False):
def quantile(a, q, weights=None, axis=None, keepdims: bool = False):
"""
Compute the weighted quantiles along the specified axis
Expand All @@ -130,7 +130,7 @@ def quantile(a, q, weights=None, axis=None, keepdims=False):
"""
q = q if not hasattr(q, "__iter__") else q[0] if len(q) == 1 else tuple(q)
if weights is None:
return np.quantile(a, q, axis=axis, keepdims=keepdims, interpolation='linear')
return np.quantile(a, q, axis=axis, keepdims=keepdims)
elif axis is None:
raveled_data = np.ravel(a)
idx = np.argsort(raveled_data)
Expand All @@ -139,10 +139,10 @@ def quantile(a, q, weights=None, axis=None, keepdims=False):
Sn = np.cumsum(sorted_weights)
Pn = (Sn - 0.5*sorted_weights)/Sn[-1]
y = np.interp(q, Pn, sorted_data)
if keepdims is True:
return y.reshape((*y.shape, *(1, )*np.ndim(a)))
else:
return y
if keepdims:
y = y.reshape((*y.shape, *(1, )*np.ndim(a)))

return y
else:
# Move the dimensions which are reduced to the back
axis = [axis] if not hasattr(axis, "__iter__") else axis
Expand All @@ -157,9 +157,12 @@ def quantile(a, q, weights=None, axis=None, keepdims=False):
w = np.moveaxis(weights, source=axis, destination=destination).reshape(shape)

# Determine the quantiles and reshape backwards
y = np.array([quantile(x, q, u, keepdims=False) for x, u in zip(a_shaped, w)]).T
shape = (*y.shape[:-1], *[1 if i in axis else x for i, x in enumerate(a.shape)]) if keepdims is True\
else (*y.shape[:-1], *a_moved.shape[:-len(destination)])
y = np.array([quantile(x, q, u) for x, u in zip(a_shaped, w)]).T
if keepdims:
shape = *y.shape[:-1], *[1 if i in axis else x for i, x in enumerate(a.shape)]
else:
shape = *y.shape[:-1], *a_moved.shape[:-len(destination)]

y = y.reshape(shape)
return y

Expand Down

0 comments on commit 9b64704

Please sign in to comment.