Skip to content

Commit

Permalink
Optimization fixes and simplification
Browse files Browse the repository at this point in the history
  • Loading branch information
gl3nn committed Sep 11, 2020
1 parent 324ae1c commit 4994f8b
Show file tree
Hide file tree
Showing 4 changed files with 133 additions and 73 deletions.
42 changes: 20 additions & 22 deletions fireant/queries/builder/dataset_blender_query_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,31 +323,27 @@ def _blend_query(dimensions, metrics, orders, field_maps, queries):

reference = base_query._references[0] if base_query._references else None

if len(queries) == 1:
# Optimization step, we don't need to do any joining as there is only one query
blender_query = base_query
else:
blender_query = _perform_join_operations(
dimensions, base_query, base_field_map, join_queries, join_field_maps
)
blender_query = _perform_join_operations(
dimensions, base_query, base_field_map, join_queries, join_field_maps
)

# WARNING: In order to make complex fields work, the get_sql for each field is monkey patched in. This must
# happen here because a complex metric by definition references values selected from the dataset subqueries.
# WARNING: In order to make complex fields work, the get_sql for each field is monkey patched in. This must
# happen here because a complex metric by definition references values selected from the dataset subqueries.

for metric in find_dataset_fields(metrics):
subquery_field = _get_sq_field_for_blender_field(
metric, queries, field_maps, reference
)
metric.get_sql = subquery_field.get_sql
for metric in find_dataset_fields(metrics):
subquery_field = _get_sq_field_for_blender_field(
metric, queries, field_maps, reference
)
metric.get_sql = subquery_field.get_sql

sq_dimensions = [
_get_sq_field_for_blender_field(d, queries, field_maps) for d in dimensions
]
sq_metrics = [
_get_sq_field_for_blender_field(m, queries, field_maps, reference)
for m in metrics
]
blender_query = blender_query.select(*sq_dimensions).select(*sq_metrics)
sq_dimensions = [
_get_sq_field_for_blender_field(d, queries, field_maps) for d in dimensions
]
sq_metrics = [
_get_sq_field_for_blender_field(m, queries, field_maps, reference)
for m in metrics
]
blender_query = blender_query.select(*sq_dimensions).select(*sq_metrics)

for field, orientation in orders:
if any(dimension is field for dimension in dimensions):
Expand Down Expand Up @@ -450,7 +446,9 @@ def sql(self):
# Second map the dimensions and find the dimensions which are unique to a dataset. Include those.
# Also save for each dimension of which datasets it is part of.
dimensions_dataset_info = []
print("HERE")
for dimension in selected_blender_dimensions:
print(dimension)
dimension_dataset_info = []

for dataset_index, dataset in enumerate(datasets):
Expand Down
28 changes: 18 additions & 10 deletions fireant/tests/queries/test_build_data_blending.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,12 @@ def test_using_fields_from_single_dataset_reduced_to_dataset_query(self):

self.assertEqual(len(queries), 1)
self.assertEqual(
"SELECT "
'TRUNC("timestamp",\'DD\') "$timestamp",'
'SUM("votes") "$votes" '
'SELECT "sq0"."$timestamp" "$timestamp","sq0"."$votes" "$votes" '
'FROM ('
'SELECT TRUNC("timestamp",\'DD\') "$timestamp",SUM("votes") "$votes" '
'FROM "politics"."politician" '
'GROUP BY "$timestamp" '
'GROUP BY "$timestamp"'
') "sq0" '
'ORDER BY "$timestamp" '
'LIMIT 200000',
str(queries[0]),
Expand Down Expand Up @@ -315,13 +316,15 @@ def test_apply_set_filter_for_dimension_that_is_also_being_fetched_in_tertiary_d

self.assertEqual(len(queries), 1)
self.assertEqual(
"SELECT "
'CASE WHEN "candidate_id"=12 THEN \'set(candidate_id=12)\' ELSE \'complement(candidate_id=12)\' END "$candidate-id",'
'SELECT "sq0"."$candidate-id" "$candidate-id","sq0"."$num_staff" "$num_staff" '
'FROM ('
'SELECT CASE WHEN "candidate_id"=12 THEN \'set(candidate_id=12)\' '
'ELSE \'complement(candidate_id=12)\' END "$candidate-id",'
'COUNT("staff_id") "$num_staff" '
'FROM "politics"."politician_staff" '
'GROUP BY "$candidate-id" '
'ORDER BY "$candidate-id" '
'LIMIT 200000',
'GROUP BY "$candidate-id"'
') "sq0" '
'ORDER BY "$candidate-id" LIMIT 200000',
str(queries[0]),
)

Expand Down Expand Up @@ -508,11 +511,16 @@ def test_dimension_filter_variations_with_sets_for_data_blending(self):

self.assertEqual(len(queries), 1)
self.assertEqual(
"SELECT "
f'"sq0"."${field_alias}" "${field_alias}",'
'"sq0"."$candidate-spend" "$candidate-spend" '
'FROM ('
'SELECT '
f'CASE WHEN {fltr} THEN \'set_A\' ELSE \'set_B\' END "${field_alias}",'
'SUM("candidate_spend") "$candidate-spend" '
'FROM "politics"."politician_spend" '
f'GROUP BY "${field_alias}" '
f'GROUP BY "${field_alias}"'
f') "sq0" '
f"ORDER BY \"${field_alias}\" "
"LIMIT 200000",
str(queries[0]),
Expand Down
130 changes: 89 additions & 41 deletions fireant/tests/queries/test_data_blending_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,9 @@ def test_select_only_a_metric_from_primary_dataset(

(query,) = sql
self.assertEqual(
"SELECT "
'"timestamp" "$timestamp",'
'SUM("metric") "$metric0" '
'FROM "test0" '
'GROUP BY "$timestamp" '
'ORDER BY "$timestamp" '
'LIMIT 200000',
'SELECT "sq0"."$timestamp" "$timestamp","sq0"."$metric0" "$metric0" '
'FROM (SELECT "timestamp" "$timestamp",SUM("metric") "$metric0" FROM "test0" GROUP BY "$timestamp") "sq0" '
'ORDER BY "$timestamp" LIMIT 200000',
str(query),
)

Expand Down Expand Up @@ -523,13 +519,9 @@ def test_do_not_include_fields_with_conflicting_aliases_in_subqueries_unless_map

(query,) = sql
self.assertEqual(
"SELECT "
'"timestamp" "$timestamp",'
'SUM("metric") "$metric0" '
'FROM "test0" '
'GROUP BY "$timestamp" '
'ORDER BY "$timestamp" '
'LIMIT 200000',
'SELECT "sq0"."$timestamp" "$timestamp","sq0"."$metric0" "$metric0" FROM '
'(SELECT "timestamp" "$timestamp",SUM("metric") "$metric0" FROM "test0" GROUP BY "$timestamp") "sq0" '
'ORDER BY "$timestamp" LIMIT 200000',
str(query),
)

Expand Down Expand Up @@ -890,6 +882,74 @@ def test_blended_references_with_order_by_on_unused_metric(self):
str(query_2),
)

def test_optimization_with_complex_blended_metric(self):
    """Check the SQL built for a blended metric derived only from secondary dataset fields.

    The primary dataset contributes just the ``timestamp`` dimension, while the
    blended metric divides two metrics that both live in the secondary dataset.
    The expected SQL reads from a single subquery (``sq0``) over ``test1`` and
    performs the division in the outer query.
    """
    database = TestDatabase()
    primary_table, secondary_table = Tables("test0", "test1")

    # Primary dataset: only exposes the shared timestamp dimension.
    primary_timestamp = Field(
        "timestamp",
        label="Timestamp",
        definition=primary_table.timestamp,
        data_type=DataType.date,
    )
    primary_ds = DataSet(
        table=primary_table,
        database=database,
        fields=[primary_timestamp],
    )

    # Secondary dataset: the timestamp plus the two metrics used by the blended metric.
    secondary_timestamp = Field(
        "timestamp",
        label="Timestamp",
        definition=secondary_table.timestamp,
        data_type=DataType.date,
    )
    secondary_metric = Field(
        "other_metric_name",
        label="Metric",
        definition=fn.Sum(secondary_table.metric),
        data_type=DataType.number,
    )
    secondary_metric_2 = Field(
        "metric_2",
        label="Metric 2",
        definition=fn.Sum(secondary_table.metric_2),
        data_type=DataType.number,
    )
    secondary_ds = DataSet(
        table=secondary_table,
        database=database,
        fields=[secondary_timestamp, secondary_metric, secondary_metric_2],
    )

    # Join both datasets on their timestamp, then attach the complex metric.
    joined = primary_ds.blend(secondary_ds).on(
        {primary_ds.fields.timestamp: secondary_ds.fields.timestamp}
    )
    blended_metric = Field(
        "blended_metric",
        label="Blended Metric",
        definition=secondary_ds.fields.other_metric_name / secondary_ds.fields.metric_2,
        data_type=DataType.number,
    )
    blend_ds = joined.extra_fields(blended_metric)

    builder = blend_ds.query()
    builder = builder.dimension(blend_ds.fields.timestamp)
    builder = builder.widget(f.Widget(blend_ds.fields.blended_metric))
    query = builder.sql[0]

    expected_sql = (
        'SELECT "sq0"."$timestamp" "$timestamp","sq0"."$other_metric_name"/"sq0"."$metric_2" "$blended_metric" '
        'FROM ('
        'SELECT "timestamp" "$timestamp",SUM("metric") "$other_metric_name",SUM("metric_2") "$metric_2" '
        'FROM "test1" '
        'GROUP BY "$timestamp"'
        ') "sq0" '
        'ORDER BY "$timestamp" '
        'LIMIT 200000'
    )
    self.assertEqual(expected_sql, str(query))

def test_blending_with_only_metric_filter_selected_in_secondary_dataset(self):
db = TestDatabase()
t0, t1 = Tables("test0", "test1")
Expand Down Expand Up @@ -1116,23 +1176,16 @@ def test_blending_with_share_operation_on_primary_metric(self):

(query_1, query_2) = sql
self.assertEqual(
"SELECT "
'"timestamp" "$timestamp",'
'SUM("metric") "$metric0" '
'FROM "test0" '
'GROUP BY "$timestamp" '
'ORDER BY "$timestamp" '
'LIMIT 200000',
'SELECT "sq0"."$timestamp" "$timestamp","sq0"."$metric0" "$metric0" FROM '
'(SELECT "timestamp" "$timestamp",SUM("metric") "$metric0" FROM "test0" GROUP BY "$timestamp") "sq0" '
'ORDER BY "$timestamp" LIMIT 200000',
str(query_1),
)

self.assertEqual(
"SELECT "
"'_FIREANT_ROLLUP_VALUE_' \"$timestamp\","
'SUM("metric") "$metric0" '
'FROM "test0" '
'ORDER BY "$timestamp" '
'LIMIT 200000',
'SELECT "sq0"."$timestamp" "$timestamp","sq0"."$metric0" "$metric0" FROM '
'(SELECT \'_FIREANT_ROLLUP_VALUE_\' "$timestamp",SUM("metric") "$metric0" FROM "test0") "sq0" '
'ORDER BY "$timestamp" LIMIT 200000',
str(query_2),
)

Expand Down Expand Up @@ -1197,23 +1250,16 @@ def test_blending_with_share_operation_on_secondary_metric(self):

(query_1, query_2) = sql
self.assertEqual(
"SELECT "
'"timestamp" "$timestamp",'
'SUM("metric") "$metric1" '
'FROM "test1" '
'GROUP BY "$timestamp" '
'ORDER BY "$timestamp" '
'LIMIT 200000',
'SELECT "sq0"."$timestamp" "$timestamp","sq0"."$metric1" "$metric1" FROM '
'(SELECT "timestamp" "$timestamp",SUM("metric") "$metric1" FROM "test1" GROUP BY "$timestamp") "sq0" '
'ORDER BY "$timestamp" LIMIT 200000',
str(query_1),
)

self.assertEqual(
"SELECT "
"'_FIREANT_ROLLUP_VALUE_' \"$timestamp\","
'SUM("metric") "$metric1" '
'FROM "test1" '
'ORDER BY "$timestamp" '
'LIMIT 200000',
'SELECT "sq0"."$timestamp" "$timestamp","sq0"."$metric1" "$metric1" FROM '
'(SELECT \'_FIREANT_ROLLUP_VALUE_\' "$timestamp",SUM("metric") "$metric1" FROM "test1") "sq0" '
'ORDER BY "$timestamp" LIMIT 200000',
str(query_2),
)

Expand Down Expand Up @@ -1420,7 +1466,9 @@ def test_selecting_just_one_metric_in_non_primary_dataset(self):
(query,) = blender.query().widget(ReactTable(blender.fields.only_metric2)).sql

self.assertEqual(
'SELECT "metric" "$metric2" FROM "test2" ORDER BY 1 LIMIT 200000',
'SELECT "sq0"."$metric2" "$only_metric2" '
'FROM (SELECT "metric" "$metric2" FROM "test2") "sq0" '
'ORDER BY 1 LIMIT 200000',
str(query),
)

Expand Down
6 changes: 6 additions & 0 deletions fireant/widgets/reacttable.py
Original file line number Diff line number Diff line change
Expand Up @@ -741,6 +741,12 @@ def transform(
if alias_selector(dimension.alias) not in hide_aliases
]

print("HEREREER")
print(result_df)
print("AH")
print(metric_aliases)
print(result_df[metric_aliases])

result_df = self.format_data_frame(result_df[metric_aliases])
result_df, is_pivoted, is_transposed = self.pivot_data_frame(result_df, pivot_dimensions, self.transpose)
dimension_columns = self.transform_index_column_headers(result_df, field_map, hide_aliases)
Expand Down

0 comments on commit 4994f8b

Please sign in to comment.