diff --git a/fireant/formats.py b/fireant/formats.py index 32258e5c..fef7d49d 100644 --- a/fireant/formats.py +++ b/fireant/formats.py @@ -6,11 +6,7 @@ time, ) -from fireant.utils import ( - MAX_NUMBER, - MAX_STRING, - MAX_TIMESTAMP, -) +from fireant.slicer.totals import TOTALS_MARKERS INF_VALUE = "Inf" NULL_VALUE = 'null' @@ -68,7 +64,7 @@ def dimension_value(value): When True, dates and datetimes will be converted to ISO strings. The time is omitted for dates. When False, the datetime will be converted to a POSIX timestamp (millis-since-epoch). """ - if value in {MAX_STRING, MAX_NUMBER, MAX_TIMESTAMP}: + if value in TOTALS_MARKERS: return 'Totals' if pd.isnull(value): diff --git a/fireant/slicer/__init__.py b/fireant/slicer/__init__.py index 6ae2cb41..4345c865 100644 --- a/fireant/slicer/__init__.py +++ b/fireant/slicer/__init__.py @@ -29,6 +29,7 @@ CumSum, Operation, RollingMean, + Share, ) from .references import ( DayOverDay, diff --git a/fireant/slicer/dimensions.py b/fireant/slicer/dimensions.py index 8972fc10..70ffff1c 100644 --- a/fireant/slicer/dimensions.py +++ b/fireant/slicer/dimensions.py @@ -76,10 +76,11 @@ def is_(self, value: bool): :return: A slicer query filter used to filter a slicer query to results where this dimension is True or False. """ - return BooleanFilter(self.definition, value) + return BooleanFilter(self.key, self.definition, value) class PatternFilterableMixin: + key = None definition = None pattern_definition_attribute = 'definition' @@ -98,7 +99,7 @@ def like(self, pattern, *patterns): matches the pattern. """ definition = getattr(self, self.pattern_definition_attribute) - return PatternFilter(definition, pattern, *patterns) + return PatternFilter(self.key, definition, pattern, *patterns) def not_like(self, pattern, *patterns): """ @@ -115,7 +116,7 @@ def not_like(self, pattern, *patterns): matches the pattern. """ definition = getattr(self, self.pattern_definition_attribute) - return AntiPatternFilter(definition, pattern, *patterns) + return AntiPatternFilter(self.key, definition, pattern, *patterns) class CategoricalDimension(PatternFilterableMixin, Dimension): @@ -142,7 +143,7 @@ def isin(self, values: Iterable): A slicer query filter used to filter a slicer query to results where this dimension is one of a set of values. Opposite of #notin. """ - return ContainsFilter(self.definition, values) + return ContainsFilter(self.key, self.definition, values) def notin(self, values): """ @@ -155,7 +156,7 @@ def notin(self, values): A slicer query filter used to filter a slicer query to results where this dimension is *not* one of a set of values. Opposite of #isin. """ - return ExcludesFilter(self.definition, values) + return ExcludesFilter(self.key, self.definition, values) class _UniqueDimensionBase(PatternFilterableMixin, Dimension): @@ -170,7 +171,7 @@ def isin(self, values): A slicer query filter used to filter a slicer query to results where this dimension is one of a set of values. Opposite of #notin. """ - return ContainsFilter(self.definition, values) + return ContainsFilter(self.key, self.definition, values) def notin(self, values): """ @@ -183,7 +184,7 @@ def notin(self, values): A slicer query filter used to filter a slicer query to results where this dimension is *not* one of a set of values. Opposite of #isin. """ - return ExcludesFilter(self.definition, values) + return ExcludesFilter(self.key, self.definition, values) class UniqueDimension(_UniqueDimensionBase): @@ -288,7 +289,7 @@ def between(self, start, stop): A slicer query filter used to filter a slicer query to results where this dimension is between the values start and stop. """ - return RangeFilter(self.definition, start, stop) + return RangeFilter(self.key, self.definition, start, stop) class TotalsDimension(Dimension): diff --git a/fireant/slicer/exceptions.py b/fireant/slicer/exceptions.py index 71b28731..c7401d49 100644 --- a/fireant/slicer/exceptions.py +++ b/fireant/slicer/exceptions.py @@ -18,6 +18,10 @@ class RollupException(SlicerException): pass +class MissingTotalsForShareException(Exception): + pass + + class MetricRequiredException(SlicerException): pass diff --git a/fireant/slicer/filters.py b/fireant/slicer/filters.py index 2199e679..7ad87428 100644 --- a/fireant/slicer/filters.py +++ b/fireant/slicer/filters.py @@ -15,11 +15,15 @@ def __repr__(self): class DimensionFilter(Filter): - pass + def __init__(self, dimension_key, definition): + super().__init__(definition) + self.dimension_key = dimension_key class MetricFilter(Filter): - pass + def __init__(self, metric_key, definition): + super().__init__(definition) + self.metric_key = metric_key class ComparatorFilter(MetricFilter): @@ -31,36 +35,43 @@ class Operator(object): gte = 'gte' lte = 'lte' - def __init__(self, metric_definition, operator, value): + def __init__(self, metric_key, metric_definition, operator, value): definition = getattr(metric_definition, operator)(value) - super(ComparatorFilter, self).__init__(definition) + super(ComparatorFilter, self).__init__(metric_key, definition) class BooleanFilter(DimensionFilter): - def __init__(self, element_key, value): - definition = element_key if value else Not(element_key) - super(BooleanFilter, self).__init__(definition) + def __init__(self, dimension_key, dimension_definition, value): + definition = dimension_definition \ + if value \ + else Not(dimension_definition) + + super(BooleanFilter, self).__init__(dimension_key, definition) class ContainsFilter(DimensionFilter): - def __init__(self, dimension_definition, values): + def __init__(self, dimension_key, dimension_definition, values): definition = dimension_definition.isin(values) - super(ContainsFilter, self).__init__(definition) + super(ContainsFilter, self).__init__(dimension_key, definition) class ExcludesFilter(DimensionFilter): - def __init__(self, dimension_definition, values): + def __init__(self, dimension_key, dimension_definition, values): definition = dimension_definition.notin(values) - super(ExcludesFilter, self).__init__(definition) + super(ExcludesFilter, self).__init__(dimension_key, definition) class RangeFilter(DimensionFilter): - def __init__(self, dimension_definition, start, stop): + def __init__(self, dimension_key, dimension_definition, start, stop): definition = dimension_definition[start:stop] - super(RangeFilter, self).__init__(definition) + super(RangeFilter, self).__init__(dimension_key, definition) class PatternFilter(DimensionFilter): + def __init__(self, dimension_key, dimension_definition, pattern, *patterns): + definition = self._apply(dimension_definition, (pattern,) + patterns) + super(PatternFilter, self).__init__(dimension_key, definition) + def _apply(self, dimension_definition, patterns): definition = Lower(dimension_definition).like(Lower(patterns[0])) @@ -69,10 +80,6 @@ def _apply(self, dimension_definition, patterns): return definition - def __init__(self, dimension_definition, pattern, *patterns): - definition = self._apply(dimension_definition, (pattern,) + patterns) - super(PatternFilter, self).__init__(definition) - class AntiPatternFilter(PatternFilter): def _apply(self, dimension_definition, pattern): diff --git a/fireant/slicer/metrics.py b/fireant/slicer/metrics.py index 49d767c6..5c1dd30f 100644 --- a/fireant/slicer/metrics.py +++ b/fireant/slicer/metrics.py @@ -1,3 +1,6 @@ +from fireant.utils import ( + immutable, +) from .base import SlicerElement from .filters import ComparatorFilter @@ -30,24 +33,37 @@ def __init__(self, key, definition, label=None, precision=None, prefix=None, suf self.precision = precision self.prefix = prefix self.suffix = suffix + self._share = False def __eq__(self, other): - return ComparatorFilter(self.definition, ComparatorFilter.Operator.eq, other) + return ComparatorFilter(self.key, self.definition, ComparatorFilter.Operator.eq, other) def __ne__(self, other): - return ComparatorFilter(self.definition, ComparatorFilter.Operator.ne, other) + return ComparatorFilter(self.key, self.definition, ComparatorFilter.Operator.ne, other) def __gt__(self, other): - return ComparatorFilter(self.definition, ComparatorFilter.Operator.gt, other) + return ComparatorFilter(self.key, self.definition, ComparatorFilter.Operator.gt, other) def __ge__(self, other): - return ComparatorFilter(self.definition, ComparatorFilter.Operator.gte, other) + return ComparatorFilter(self.key, self.definition, ComparatorFilter.Operator.gte, other) def __lt__(self, other): - return ComparatorFilter(self.definition, ComparatorFilter.Operator.lt, other) + return ComparatorFilter(self.key, self.definition, ComparatorFilter.Operator.lt, other) def __le__(self, other): - return ComparatorFilter(self.definition, ComparatorFilter.Operator.lte, other) + return ComparatorFilter(self.key, self.definition, ComparatorFilter.Operator.lte, other) def __repr__(self): return "slicer.metrics.{}".format(self.key) + + @property + @immutable + def share(self): + self._share = True + return self + + @property + @immutable + def share(self): + self._share = True + return self diff --git a/fireant/slicer/operations.py b/fireant/slicer/operations.py index cdae5a6b..52308741 100644 --- a/fireant/slicer/operations.py +++ b/fireant/slicer/operations.py @@ -1,8 +1,14 @@ import numpy as np import pandas as pd -from fireant.utils import format_metric_key from fireant.slicer.references import reference_key +from fireant.slicer.totals import get_totals_marker_for_dtype +from fireant.utils import ( + format_dimension_key, + format_metric_key, + reduce_data_frame_levels, +) +from .dimensions import Dimension from .metrics import Metric @@ -194,3 +200,61 @@ def apply(self, data_frame, reference): .apply(self.rolling_mean) return self.rolling_mean(data_frame[df_key]) + + +class Share(_BaseOperation): + def __init__(self, metric: Metric, over: Dimension = None, precision=2): + super(Share, self).__init__( + key='share({},{})'.format(getattr(metric, 'key', metric), + getattr(over, 'key', over), ), + label='Share of {} over {}'.format(getattr(metric, 'label', metric), + getattr(over, 'label', over)), + prefix=None, + suffix='%', + precision=precision, + ) + + self.metric = metric + self.over = over + + @property + def metrics(self): + return [metric + for metric in [self.metric] + if isinstance(metric, Metric)] + + @property + def operations(self): + return [op_and_children + for operation in [self.metric] + if isinstance(operation, Operation) + for op_and_children in [operation] + operation.operations] + + def apply(self, data_frame, reference): + f_metric_key = format_metric_key(reference_key(self.metric, reference)) + + if self.over is None: + df = data_frame[f_metric_key] + return 100 * df / df + + if not isinstance(data_frame.index, pd.MultiIndex): + marker = get_totals_marker_for_dtype(data_frame.index.dtype) + totals = data_frame.loc[marker, f_metric_key] + return 100 * data_frame[f_metric_key] / totals + + f_over_key = format_dimension_key(self.over.key) + idx = data_frame.index.names.index(f_over_key) + group_levels = data_frame.index.names[idx:] + over_dim_value = get_totals_marker_for_dtype(data_frame.index.levels[idx].dtype) + totals_key = (slice(None),) * idx + (slice(over_dim_value, over_dim_value),) + + totals = reduce_data_frame_levels(data_frame.loc[totals_key, f_metric_key], group_levels) + + def apply_totals(df): + return 100 * reduce_data_frame_levels(df / totals, group_levels) + + return data_frame[f_metric_key] \ + .groupby(level=group_levels) \ + .apply(apply_totals) \ + .reorder_levels(order=data_frame.index.names) \ + .sort_index() diff --git a/fireant/slicer/queries/builder.py b/fireant/slicer/queries/builder.py index fce01607..9ca01e6a 100644 --- a/fireant/slicer/queries/builder.py +++ b/fireant/slicer/queries/builder.py @@ -13,24 +13,26 @@ ) from pypika import Order from . import special_cases -from .database import fetch_data +from .execution import fetch_data from .finders import ( find_and_group_references_for_dimensions, find_and_replace_reference_dimensions, find_metrics_for_widgets, find_operations_for_widgets, + find_share_dimensions, ) -from .makers import ( +from .pagination import paginate +from .sql_transformer import ( make_latest_query, make_orders_for_dimensions, make_slicer_query, - make_slicer_query_with_rollup_and_references, + make_slicer_query_with_totals_and_references, ) -from .pagination import paginate from .. import QueryException from ..base import SlicerElement from ..dimensions import Dimension from ..references import reference_key +from ..totals import scrub_totals_from_share_results def add_hints(queries, hint=None): @@ -117,6 +119,13 @@ def __init__(self, slicer): super(SlicerQueryBuilder, self).__init__(slicer, slicer.table) self._widgets = [] self._orders = [] + self.filter_totals = True + + @immutable + def __call__(self, **kwargs): + for setting in ('filter_totals',): + if setting in kwargs: + setattr(self, setting, kwargs[setting]) @immutable def widget(self, *widgets): @@ -165,6 +174,15 @@ def orderby(self, element: SlicerElement, orientation: Order = None): self._orders += [(element.definition.as_(format_key(element.key)), orientation)] + def _validate(self): + for widget in self._widgets: + if hasattr(widget, 'validate'): + widget.validate(self._dimensions) + + @property + def reference_groups(self): + return list(find_and_group_references_for_dimensions(self._references).values()) + @property def queries(self): """ @@ -176,19 +194,18 @@ def queries(self): a query for each reference is joined based on the referenced dimension shifted. """ # First run validation for the query on all widgets - for widget in self._widgets: - if hasattr(widget, 'validate'): - widget.validate(self._dimensions) + self._validate() # Optionally select all metrics for slicer to better utilize caching metrics = list(self.slicer.metrics) \ if self.slicer.always_query_all_metrics \ else find_metrics_for_widgets(self._widgets) operations = find_operations_for_widgets(self._widgets) + share_dimensions = find_share_dimensions(self._dimensions, operations) references = find_and_replace_reference_dimensions(self._references, self._dimensions) orders = (self._orders or make_orders_for_dimensions(self._dimensions)) - return make_slicer_query_with_rollup_and_references(self.slicer.database, + return make_slicer_query_with_totals_and_references(self.slicer.database, self.table, self.slicer.joins, self._dimensions, @@ -196,7 +213,9 @@ def queries(self): operations, self._filters, references, - orders) + orders, + share_dimensions=share_dimensions, + filter_totals=self.filter_totals) def fetch(self, hint=None) -> Iterable[Dict]: """ @@ -209,18 +228,23 @@ def fetch(self, hint=None) -> Iterable[Dict]: """ queries = add_hints(self.queries, hint) - reference_groups = list(find_and_group_references_for_dimensions(self._references).values()) - data_frame = fetch_data(self.slicer.database, queries, self._dimensions, reference_groups) + operations = find_operations_for_widgets(self._widgets) + share_dimensions = find_share_dimensions(self._dimensions, operations) + + data_frame = fetch_data(self.slicer.database, + queries, + self._dimensions, + share_dimensions, + self.reference_groups) # Apply operations - operations = find_operations_for_widgets(self._widgets) for operation in operations: for reference in [None] + self._references: df_key = format_metric_key(reference_key(operation, reference)) data_frame[df_key] = operation.apply(data_frame, reference) + data_frame = scrub_totals_from_share_results(data_frame, self._dimensions) data_frame = special_cases.apply_operations_to_data_frame(operations, data_frame) - data_frame = paginate(data_frame, self._widgets, orders=self._orders, @@ -356,7 +380,7 @@ def queries(self): return [query] def fetch(self, hint=None): - data = super().fetch(hint=hint).reset_index().ix[0] + data = super().fetch(hint=hint).reset_index().iloc[0] # Remove the row index as the name and trim the special dimension key characters from the dimension key data.name = None data.index = [key[3:] for key in data.index] diff --git a/fireant/slicer/queries/database.py b/fireant/slicer/queries/execution.py similarity index 84% rename from fireant/slicer/queries/database.py rename to fireant/slicer/queries/execution.py index cf10fa6f..a047b432 100644 --- a/fireant/slicer/queries/database.py +++ b/fireant/slicer/queries/execution.py @@ -10,25 +10,26 @@ Union, ) -import numpy as np import pandas as pd from fireant.database import Database +from fireant.slicer.totals import get_totals_marker_for_dtype from fireant.utils import ( - MAX_NUMBER, - MAX_STRING, - MAX_TIMESTAMP, chunks, format_dimension_key, ) -from .logger import ( +from .finders import find_totals_dimensions +from .slow_query_logger import ( query_logger, slow_query_logger, ) from ..dimensions import Dimension -def fetch_data(database: Database, queries: Union[Sized, Iterable], dimensions: Iterable[Dimension], +def fetch_data(database: Database, + queries: Union[Sized, Iterable], + dimensions: Iterable[Dimension], + share_dimensions: Iterable[Dimension] = (), reference_groups=()): iterable = [(str(query.limit(int(database.max_result_set_size))), database) for query in queries] @@ -37,7 +38,7 @@ def fetch_data(database: Database, queries: Union[Sized, Iterable], dimensions: results = pool.map(_exec, iterable) pool.close() - return _reduce_result_set(results, reference_groups, dimensions) + return _reduce_result_set(results, reference_groups, dimensions, share_dimensions) def _exec(args): @@ -92,7 +93,10 @@ def _do_fetch_data(query: str, database: Database): return pd.read_sql(query, connection, coerce_float=True, parse_dates=True) -def _reduce_result_set(results: Iterable[pd.DataFrame], reference_groups, dimensions: Iterable[Dimension]): +def _reduce_result_set(results: Iterable[pd.DataFrame], + reference_groups, + dimensions: Iterable[Dimension], + share_dimensions: Dimension): """ Reduces the result sets from individual queries into a single data frame. This effectively joins sets of references and concats the sets of totals. @@ -108,10 +112,9 @@ def _reduce_result_set(results: Iterable[pd.DataFrame], reference_groups, dimens dimension_keys = [format_dimension_key(d.key) for d in dimensions] - rollup_dimension_keys = [format_dimension_key(d.key) - for d in dimensions - if d.is_rollup] - rollup_dimension_dtypes = result_groups[0][0][rollup_dimension_keys].dtypes + totals_dimension_keys = [format_dimension_key(d.key) + for d in find_totals_dimensions(dimensions, share_dimensions)] + dimension_dtypes = result_groups[0][0][dimension_keys].dtypes # Reduce each group to one data frame per rolled up dimension group_data_frames = [] @@ -132,8 +135,8 @@ def _reduce_result_set(results: Iterable[pd.DataFrame], reference_groups, dimens # marker to indicate totals. # The data frames will be ordered so that the first group will contain the data without any rolled up # dimensions, then followed by the groups with them, ordered by the last rollup dimension first. - if rollup_dimension_keys[:i]: - reduced = _replace_nans_for_rollup_values(reduced, rollup_dimension_dtypes[-i:]) + if totals_dimension_keys[:i]: + reduced = _replace_nans_for_totals_values(reduced, dimension_dtypes[-i - 1:]) group_data_frames.append(reduced) @@ -141,19 +144,14 @@ def _reduce_result_set(results: Iterable[pd.DataFrame], reference_groups, dimens .sort_index(na_position='first') -def _replace_nans_for_rollup_values(data_frame, dtypes): - replace = { - np.dtype('