Skip to content

Commit

Permalink
Merge pull request #360 from kayak/fix-filter-set-when-dealing-with-multiple-patterns
Browse files Browse the repository at this point in the history

Fix filter set when dealing with multiple pattern criterions
  • Loading branch information
x8lucas8x committed Apr 22, 2021
2 parents 945fb52 + cc867c8 commit eab3026
Show file tree
Hide file tree
Showing 3 changed files with 152 additions and 29 deletions.
113 changes: 85 additions & 28 deletions fireant/queries/sets.py
@@ -1,5 +1,5 @@
from copy import deepcopy
from typing import List, Tuple
from typing import List, Tuple, Union

from pypika import (
Case,
Expand All @@ -21,6 +21,78 @@
)


def _definition_field_for_data_blending(
    target_dataset_definition: Union[Field, terms.Term],
    target_dataset_leaf_definition: terms.Field,
    definition: terms.Term,
) -> terms.Term:
    """
    Recursively rewrite a set-filter definition so that any pypika field inside it
    is replaced with the equivalent field of the target blended dataset.

    When using data blending, the dataset table of the set filter needs to be re-mapped to the table in the
    target dataset (i.e. primary or secondary). The easiest way to do that is to select the field in the
    target dataset directly. Otherwise table not found issues would pop up when resolving the joins.

    :param target_dataset_definition: The definition for a field in the target dataset.
    :param target_dataset_leaf_definition: The leaf definition for a field in the target dataset.
                                           Given sometimes a fireant's Field might have nested fireant's
                                           Fields.
    :param definition: A definition that might have its sub-parts (e.g. term, left, right) replaced.
                       That's likely the case for Criterion sub-classes and so on.
    :return: A term sub-class to be used in place of the provided definition argument, when applicable.
    """
    if isinstance(definition, (terms.ValueWrapper,)):
        # Constant values can be returned as is.
        return definition

    if isinstance(definition, (terms.Field,)) and definition == target_dataset_leaf_definition:
        # NOTE(review): pypika overloads `==` on terms to *build* an equality criterion
        # rather than compare objects, and a criterion object is always truthy — so this
        # branch may be taken for any pypika Field. Confirm the intended comparison.
        # NOTE(review): pypika's `replace_table` is a copy-on-write `@builder` method;
        # its return value is discarded here, which looks like a no-op — verify whether
        # this call has any effect.
        target_dataset_leaf_definition.replace_table(target_dataset_leaf_definition.table, definition.table)
        return target_dataset_definition

    # Each `hasattr` branch below mutates one kind of composite term in place,
    # recursing into its children. A single node may match several branches.

    # Function, ...
    if hasattr(definition, 'args'):
        definition.args = [
            _definition_field_for_data_blending(target_dataset_definition, target_dataset_leaf_definition, arg)
            for arg in definition.args
        ]

    # CustomFunction, ...
    if hasattr(definition, 'params'):
        definition.params = [
            _definition_field_for_data_blending(target_dataset_definition, target_dataset_leaf_definition, param)
            for param in definition.params
        ]

    # RangeCriterion, ContainsCriterion, Not, All,
    if hasattr(definition, 'term'):
        definition.term = _definition_field_for_data_blending(
            target_dataset_definition, target_dataset_leaf_definition, definition.term
        )

    # BasicCriterion, ComplexCriterion, ...
    if hasattr(definition, 'left'):
        definition.left = _definition_field_for_data_blending(
            target_dataset_definition, target_dataset_leaf_definition, definition.left
        )

    if hasattr(definition, 'right'):
        definition.right = _definition_field_for_data_blending(
            target_dataset_definition, target_dataset_leaf_definition, definition.right
        )

    # Case
    if hasattr(definition, '_cases'):
        definition._cases = [
            _definition_field_for_data_blending(target_dataset_definition, target_dataset_leaf_definition, case)
            for case in definition._cases
        ]

    if hasattr(definition, '_else'):
        definition._else = _definition_field_for_data_blending(
            target_dataset_definition, target_dataset_leaf_definition, definition._else
        )

    # Traversal mutates `definition` in place; the (possibly modified) original
    # node is returned when no direct field substitution applied.
    return definition


def _make_set_dimension(set_filter: Field, target_dataset: 'DataSet') -> Field:
"""
Returns a new dimension that uses a CASE statement as its definition, in order to represent membership to a set,
Expand All @@ -44,34 +116,19 @@ def _make_set_dimension(set_filter: Field, target_dataset: 'DataSet') -> Field:
is_metric = is_metric_field(set_dimension)

# When using data blending, the dataset table of the set filter needs to be re-mapped to the table in the
# target dataset (i.e. primary or secondary). The easiest way to do that is to select the field in the
# target dataset directly.
if (
target_dataset
and not is_metric
and isinstance(
old_definition,
(
terms.ContainsCriterion,
terms.NullCriterion,
terms.BetweenCriterion,
terms.BitwiseAndCriterion,
),
# target dataset (i.e. primary or secondary). Otherwise table not found issues would pop up when resolving
# the joins.
if target_dataset and not is_metric:
target_dataset_definition = target_dataset.fields[set_dimension.alias].definition
target_dataset_leaf_definition = target_dataset_definition

while hasattr(target_dataset_leaf_definition, 'definition'):
# Sometimes a fireant's Field can have nested fireant Fields.
target_dataset_leaf_definition = target_dataset_leaf_definition.definition

old_definition = _definition_field_for_data_blending(
target_dataset_definition, target_dataset_leaf_definition, old_definition
)
):
target_dataset_definition = deepcopy(target_dataset.fields[set_dimension.alias].definition)

if not isinstance(old_definition.term, (terms.ValueWrapper, terms.Function)):
old_definition.term = target_dataset_definition

if target_dataset and not is_metric and isinstance(old_definition, terms.BasicCriterion):
target_dataset_definition = deepcopy(target_dataset.fields[set_dimension.alias].definition)

if not isinstance(old_definition.left, (terms.ValueWrapper, terms.Function)):
old_definition.left = target_dataset_definition

if not isinstance(old_definition.right, (terms.ValueWrapper, terms.Function)):
old_definition.right = target_dataset_definition

set_term = set_filter.set_label
complement_term = set_filter.complement_label
Expand Down
37 changes: 36 additions & 1 deletion fireant/tests/queries/test_build_data_blending.py
@@ -1,7 +1,7 @@
import copy
from unittest import TestCase

from pypika import Order
from pypika import Order, functions as fn

import fireant as f
from fireant.tests.dataset.mocks import (
Expand Down Expand Up @@ -585,6 +585,8 @@ def test_dimension_filter_variations_with_sets_for_data_blending(self):
for field_alias, fltr in [
('state', mock_dataset_blender.fields.state.like("%abc%")),
('state', mock_dataset_blender.fields.state.not_like("%abc%")),
('state', mock_dataset_blender.fields.state.like("%abc%", "%cde%")),
('state', mock_dataset_blender.fields.state.not_like("%abc%", "%cde%")),
('state', mock_dataset_blender.fields.state.isin(["abc"])),
('state', mock_dataset_blender.fields.state.notin(["abc"])),
('timestamp', mock_dataset_blender.fields.timestamp.between('date1', 'date2')),
Expand Down Expand Up @@ -623,6 +625,39 @@ def test_dimension_filter_variations_with_sets_for_data_blending(self):
str(queries[0]),
)

def test_deeply_nested_dimension_filter_with_sets_for_data_blending(self):
    """A set filter whose criterion nests functions and fields several levels
    deep must be emitted verbatim inside the blended sub-query's CASE expression."""
    alias = 'state'
    dataset = mock_dataset_blender

    # Criterion nesting functions, mapped fields and a BETWEEN several levels deep.
    nested_criterion = dataset.fields.state.like(
        fn.Concat(
            fn.Upper(fn.Trim(fn.Concat('%ab', dataset.fields['candidate-id']))),
            dataset.fields.winner,
            fn.Concat(dataset.fields.timestamp.between('date1', 'date2'), 'c%'),
        )
    )

    builder = dataset.query.widget(f.Pandas(dataset.fields['candidate-spend']))
    builder = builder.dimension(dataset.fields[alias])
    builder = builder.filter(f.ResultSet(nested_criterion, set_label='set_A', complement_label='set_B'))
    sql_queries = builder.sql

    self.assertEqual(len(sql_queries), 1)

    expected_sql = (
        "SELECT "
        f'"sq0"."${alias}" "${alias}",'
        '"sq0"."$candidate-spend" "$candidate-spend" '
        'FROM ('
        'SELECT '
        f'CASE WHEN {nested_criterion} THEN \'set_A\' ELSE \'set_B\' END "${alias}",'
        'SUM("candidate_spend") "$candidate-spend" '
        'FROM "politics"."politician_spend" '
        f'GROUP BY "${alias}"'
        f') "sq0" '
        f"ORDER BY \"${alias}\" "
        "LIMIT 200000"
    )
    self.assertEqual(expected_sql, str(sql_queries[0]))

def test_apply_dimension_filter_on_mapped_dimension_field_filters_in_both_nested_query(
self,
):
Expand Down
31 changes: 31 additions & 0 deletions fireant/tests/queries/test_build_sets.py
Expand Up @@ -382,6 +382,8 @@ def test_dimension_filter_variations_with_sets(self):
for field_alias, fltr in [
('text', ds.fields.text.like("%abc%")),
('text', ds.fields.text.not_like("%abc%")),
('text', ds.fields.text.like("%abc%", "%cde%")),
('text', ds.fields.text.not_like("%abc%", "%cde%")),
('text', ds.fields.text.isin(["abc"])),
('text', ds.fields.text.notin(["abc"])),
('date', ds.fields.date.between('date1', 'date2')),
Expand Down Expand Up @@ -410,3 +412,32 @@ def test_dimension_filter_variations_with_sets(self):
"LIMIT 200000",
str(queries[0]),
)

def test_deeply_nested_dimension_filter_with_sets(self):
    """A set filter whose criterion nests functions, fields and a BETWEEN
    several levels deep must appear verbatim in the generated CASE expression."""
    alias = 'text'

    nested_criterion = ds.fields.text.like(
        fn.Concat(
            fn.Upper(fn.Trim(fn.Concat('%ab', ds.fields.number))),
            ds.fields.aggr_number,
            fn.Concat(ds.fields.date.between('date1', 'date2'), 'c%'),
        )
    )

    builder = ds.query.widget(f.Pandas(ds.fields.aggr_number))
    builder = builder.dimension(ds.fields[alias])
    builder = builder.filter(f.ResultSet(nested_criterion, set_label='set_A', complement_label='set_B'))
    sql_queries = builder.sql

    self.assertEqual(len(sql_queries), 1)

    expected_sql = (
        "SELECT "
        f"CASE WHEN {nested_criterion} THEN 'set_A' ELSE 'set_B' END \"${alias}\","
        'SUM("number") "$aggr_number" '
        'FROM "test" '
        f"GROUP BY \"${alias}\" "
        f"ORDER BY \"${alias}\" "
        "LIMIT 200000"
    )
    self.assertEqual(expected_sql, str(sql_queries[0]))

0 comments on commit eab3026

Please sign in to comment.