Optimization fixes and simplification

kayak · Sep 11, 2020 · 4994f8b
1 parent 324ae1c
commit 4994f8b
Show file tree

Hide file tree

Showing 4 changed files with 133 additions and 73 deletions.
diff --git a/fireant/queries/builder/dataset_blender_query_builder.py b/fireant/queries/builder/dataset_blender_query_builder.py
@@ -323,31 +323,27 @@ def _blend_query(dimensions, metrics, orders, field_maps, queries):
 
     reference = base_query._references[0] if base_query._references else None
 
-    if len(queries) == 1:
-        # Optimization step, we don't need to do any joining as there is only one query
-        blender_query = base_query
-    else:
-        blender_query = _perform_join_operations(
-            dimensions, base_query, base_field_map, join_queries, join_field_maps
-        )
+    blender_query = _perform_join_operations(
+        dimensions, base_query, base_field_map, join_queries, join_field_maps
+    )
 
-        # WARNING: In order to make complex fields work, the get_sql for each field is monkey patched in. This must
-        # happen here because a complex metric by definition references values selected from the dataset subqueries.
+    # WARNING: In order to make complex fields work, the get_sql for each field is monkey patched in. This must
+    # happen here because a complex metric by definition references values selected from the dataset subqueries.
 
-        for metric in find_dataset_fields(metrics):
-            subquery_field = _get_sq_field_for_blender_field(
-                metric, queries, field_maps, reference
-            )
-            metric.get_sql = subquery_field.get_sql
+    for metric in find_dataset_fields(metrics):
+        subquery_field = _get_sq_field_for_blender_field(
+            metric, queries, field_maps, reference
+        )
+        metric.get_sql = subquery_field.get_sql
 
-        sq_dimensions = [
-            _get_sq_field_for_blender_field(d, queries, field_maps) for d in dimensions
-        ]
-        sq_metrics = [
-            _get_sq_field_for_blender_field(m, queries, field_maps, reference)
-            for m in metrics
-        ]
-        blender_query = blender_query.select(*sq_dimensions).select(*sq_metrics)
+    sq_dimensions = [
+        _get_sq_field_for_blender_field(d, queries, field_maps) for d in dimensions
+    ]
+    sq_metrics = [
+        _get_sq_field_for_blender_field(m, queries, field_maps, reference)
+        for m in metrics
+    ]
+    blender_query = blender_query.select(*sq_dimensions).select(*sq_metrics)
 
     for field, orientation in orders:
         if any(dimension is field for dimension in dimensions):
@@ -450,7 +446,9 @@ def sql(self):
         # Second map the dimensions and find the dimensions which are unique to a dataset. Include those.
         # Also save for each dimension of which datasets it is part of.
         dimensions_dataset_info = []
+        print("HERE")
         for dimension in selected_blender_dimensions:
+            print(dimension)
             dimension_dataset_info = []
 
             for dataset_index, dataset in enumerate(datasets):

diff --git a/fireant/tests/queries/test_build_data_blending.py b/fireant/tests/queries/test_build_data_blending.py
@@ -22,11 +22,12 @@ def test_using_fields_from_single_dataset_reduced_to_dataset_query(self):
 
         self.assertEqual(len(queries), 1)
         self.assertEqual(
-            "SELECT "
-            'TRUNC("timestamp",\'DD\') "$timestamp",'
-            'SUM("votes") "$votes" '
+            'SELECT "sq0"."$timestamp" "$timestamp","sq0"."$votes" "$votes" '
+            'FROM ('
+            'SELECT TRUNC("timestamp",\'DD\') "$timestamp",SUM("votes") "$votes" '
             'FROM "politics"."politician" '
-            'GROUP BY "$timestamp" '
+            'GROUP BY "$timestamp"'
+            ') "sq0" '
             'ORDER BY "$timestamp" '
             'LIMIT 200000',
             str(queries[0]),
@@ -315,13 +316,15 @@ def test_apply_set_filter_for_dimension_that_is_also_being_fetched_in_tertiary_d
 
         self.assertEqual(len(queries), 1)
         self.assertEqual(
-            "SELECT "
-            'CASE WHEN "candidate_id"=12 THEN \'set(candidate_id=12)\' ELSE \'complement(candidate_id=12)\' END "$candidate-id",'
+            'SELECT "sq0"."$candidate-id" "$candidate-id","sq0"."$num_staff" "$num_staff" '
+            'FROM ('
+            'SELECT CASE WHEN "candidate_id"=12 THEN \'set(candidate_id=12)\' '
+            'ELSE \'complement(candidate_id=12)\' END "$candidate-id",'
             'COUNT("staff_id") "$num_staff" '
             'FROM "politics"."politician_staff" '
-            'GROUP BY "$candidate-id" '
-            'ORDER BY "$candidate-id" '
-            'LIMIT 200000',
+            'GROUP BY "$candidate-id"'
+            ') "sq0" '
+            'ORDER BY "$candidate-id" LIMIT 200000',
             str(queries[0]),
         )
 
@@ -508,11 +511,16 @@ def test_dimension_filter_variations_with_sets_for_data_blending(self):
 
                 self.assertEqual(len(queries), 1)
                 self.assertEqual(
+                    "SELECT "
+                    f'"sq0"."${field_alias}" "${field_alias}",'
+                    '"sq0"."$candidate-spend" "$candidate-spend" '
+                    'FROM ('
                     'SELECT '
                     f'CASE WHEN {fltr} THEN \'set_A\' ELSE \'set_B\' END "${field_alias}",'
                     'SUM("candidate_spend") "$candidate-spend" '
                     'FROM "politics"."politician_spend" '
-                    f'GROUP BY "${field_alias}" '
+                    f'GROUP BY "${field_alias}"'
+                    f') "sq0" '
                     f"ORDER BY \"${field_alias}\" "
                     "LIMIT 200000",
                     str(queries[0]),

diff --git a/fireant/tests/queries/test_data_blending_integration.py b/fireant/tests/queries/test_data_blending_integration.py
@@ -62,13 +62,9 @@ def test_select_only_a_metric_from_primary_dataset(
 
         (query,) = sql
         self.assertEqual(
-            "SELECT "
-            '"timestamp" "$timestamp",'
-            'SUM("metric") "$metric0" '
-            'FROM "test0" '
-            'GROUP BY "$timestamp" '
-            'ORDER BY "$timestamp" '
-            'LIMIT 200000',
+            'SELECT "sq0"."$timestamp" "$timestamp","sq0"."$metric0" "$metric0" '
+            'FROM (SELECT "timestamp" "$timestamp",SUM("metric") "$metric0" FROM "test0" GROUP BY "$timestamp") "sq0" '
+            'ORDER BY "$timestamp" LIMIT 200000',
             str(query),
         )
 
@@ -523,13 +519,9 @@ def test_do_not_include_fields_with_conflicting_aliases_in_subqueries_unless_map
 
         (query,) = sql
         self.assertEqual(
-            "SELECT "
-            '"timestamp" "$timestamp",'
-            'SUM("metric") "$metric0" '
-            'FROM "test0" '
-            'GROUP BY "$timestamp" '
-            'ORDER BY "$timestamp" '
-            'LIMIT 200000',
+            'SELECT "sq0"."$timestamp" "$timestamp","sq0"."$metric0" "$metric0" FROM '
+            '(SELECT "timestamp" "$timestamp",SUM("metric") "$metric0" FROM "test0" GROUP BY "$timestamp") "sq0" '
+            'ORDER BY "$timestamp" LIMIT 200000',
             str(query),
         )
 
@@ -890,6 +882,74 @@ def test_blended_references_with_order_by_on_unused_metric(self):
             str(query_2),
         )
 
+    def test_optimization_with_complex_blended_metric(self):
+        db = TestDatabase()
+        t0, t1 = Tables("test0", "test1")
+        primary_ds = DataSet(
+            table=t0,
+            database=db,
+            fields=[
+                Field(
+                    "timestamp",
+                    label="Timestamp",
+                    definition=t0.timestamp,
+                    data_type=DataType.date,
+                ),
+            ],
+        )
+        secondary_ds = DataSet(
+            table=t1,
+            database=db,
+            fields=[
+                Field(
+                    "timestamp",
+                    label="Timestamp",
+                    definition=t1.timestamp,
+                    data_type=DataType.date,
+                ),
+                Field(
+                    "other_metric_name",
+                    label="Metric",
+                    definition=fn.Sum(t1.metric),
+                    data_type=DataType.number,
+                ),
+                Field(
+                    "metric_2",
+                    label="Metric 2",
+                    definition=fn.Sum(t1.metric_2),
+                    data_type=DataType.number,
+                ),
+            ],
+        )
+        blend_ds = primary_ds.blend(secondary_ds).on(
+            {primary_ds.fields.timestamp: secondary_ds.fields.timestamp}
+        ).extra_fields(
+            Field(
+                "blended_metric",
+                label="Blended Metric",
+                definition=secondary_ds.fields.other_metric_name / secondary_ds.fields.metric_2,
+                data_type=DataType.number,
+            )
+        )
+
+        query = (
+            blend_ds.query()
+                .dimension(blend_ds.fields.timestamp)
+                .widget(f.Widget(blend_ds.fields.blended_metric))
+        ).sql[0]
+
+        self.assertEqual(
+            'SELECT "sq0"."$timestamp" "$timestamp","sq0"."$other_metric_name"/"sq0"."$metric_2" "$blended_metric" '
+            'FROM ('
+            'SELECT "timestamp" "$timestamp",SUM("metric") "$other_metric_name",SUM("metric_2") "$metric_2" '
+            'FROM "test1" '
+            'GROUP BY "$timestamp"'
+            ') "sq0" '
+            'ORDER BY "$timestamp" '
+            'LIMIT 200000',
+            str(query),
+        )
+
     def test_blending_with_only_metric_filter_selected_in_secondary_dataset(self):
         db = TestDatabase()
         t0, t1 = Tables("test0", "test1")
@@ -1116,23 +1176,16 @@ def test_blending_with_share_operation_on_primary_metric(self):
 
         (query_1, query_2) = sql
         self.assertEqual(
-            "SELECT "
-            '"timestamp" "$timestamp",'
-            'SUM("metric") "$metric0" '
-            'FROM "test0" '
-            'GROUP BY "$timestamp" '
-            'ORDER BY "$timestamp" '
-            'LIMIT 200000',
+            'SELECT "sq0"."$timestamp" "$timestamp","sq0"."$metric0" "$metric0" FROM '
+            '(SELECT "timestamp" "$timestamp",SUM("metric") "$metric0" FROM "test0" GROUP BY "$timestamp") "sq0" '
+            'ORDER BY "$timestamp" LIMIT 200000',
             str(query_1),
         )
 
         self.assertEqual(
-            "SELECT "
-            "'_FIREANT_ROLLUP_VALUE_' \"$timestamp\","
-            'SUM("metric") "$metric0" '
-            'FROM "test0" '
-            'ORDER BY "$timestamp" '
-            'LIMIT 200000',
+            'SELECT "sq0"."$timestamp" "$timestamp","sq0"."$metric0" "$metric0" FROM '
+            '(SELECT \'_FIREANT_ROLLUP_VALUE_\' "$timestamp",SUM("metric") "$metric0" FROM "test0") "sq0" '
+            'ORDER BY "$timestamp" LIMIT 200000',
             str(query_2),
         )
 
@@ -1197,23 +1250,16 @@ def test_blending_with_share_operation_on_secondary_metric(self):
 
         (query_1, query_2) = sql
         self.assertEqual(
-            "SELECT "
-            '"timestamp" "$timestamp",'
-            'SUM("metric") "$metric1" '
-            'FROM "test1" '
-            'GROUP BY "$timestamp" '
-            'ORDER BY "$timestamp" '
-            'LIMIT 200000',
+            'SELECT "sq0"."$timestamp" "$timestamp","sq0"."$metric1" "$metric1" FROM '
+            '(SELECT "timestamp" "$timestamp",SUM("metric") "$metric1" FROM "test1" GROUP BY "$timestamp") "sq0" '
+            'ORDER BY "$timestamp" LIMIT 200000',
             str(query_1),
         )
 
         self.assertEqual(
-            "SELECT "
-            "'_FIREANT_ROLLUP_VALUE_' \"$timestamp\","
-            'SUM("metric") "$metric1" '
-            'FROM "test1" '
-            'ORDER BY "$timestamp" '
-            'LIMIT 200000',
+            'SELECT "sq0"."$timestamp" "$timestamp","sq0"."$metric1" "$metric1" FROM '
+            '(SELECT \'_FIREANT_ROLLUP_VALUE_\' "$timestamp",SUM("metric") "$metric1" FROM "test1") "sq0" '
+            'ORDER BY "$timestamp" LIMIT 200000',
             str(query_2),
         )
 
@@ -1420,7 +1466,9 @@ def test_selecting_just_one_metric_in_non_primary_dataset(self):
         (query,) = blender.query().widget(ReactTable(blender.fields.only_metric2)).sql
 
         self.assertEqual(
-            'SELECT "metric" "$metric2" FROM "test2" ORDER BY 1 LIMIT 200000',
+            'SELECT "sq0"."$metric2" "$only_metric2" '
+            'FROM (SELECT "metric" "$metric2" FROM "test2") "sq0" '
+            'ORDER BY 1 LIMIT 200000',
             str(query),
         )
 

diff --git a/fireant/widgets/reacttable.py b/fireant/widgets/reacttable.py
@@ -741,6 +741,12 @@ def transform(
             if alias_selector(dimension.alias) not in hide_aliases
         ]
 
+        print("HEREREER")
+        print(result_df)
+        print("AH")
+        print(metric_aliases)
+        print(result_df[metric_aliases])
+
         result_df = self.format_data_frame(result_df[metric_aliases])
         result_df, is_pivoted, is_transposed = self.pivot_data_frame(result_df, pivot_dimensions, self.transpose)
         dimension_columns = self.transform_index_column_headers(result_df, field_map, hide_aliases)