From 24270734ac7da55ed219806d50ae8101d0e0b6d5 Mon Sep 17 00:00:00 2001 From: Anton Lykov Date: Mon, 4 Mar 2024 22:53:18 +0800 Subject: [PATCH] [SPARK-47070] Fix invalid aggregation after subquery rewrite ## What changes were proposed in this pull request? **tl;dr** This PR fixes a bug related to an `exists` variable being lost after an incorrect subquery rewrite when `exists` is not used neither in grouping expressions nor in aggregate functions. We wrap such variable in `first()` agg func to not lose reference to it. **Motivation** Imagine we had a plan with a subquery: ``` Aggregate [a1#0] [CASE WHEN a1#0 IN (list#3999 []) THEN Hello ELSE Hi END AS strCol#13] : +- LocalRelation , [b1#3, b2#4, b3#5] + LocalRelation , [a1#0, a2#1, a3#2] ``` During correlated subquery rewrite, the rule `RewritePredicateSubquery` would rewrite expression `a1#0 IN (list#3999 [])` into `exists#12` and replace the subquery with `ExistenceJoin`, like so: ``` Aggregate [a1#0] [CASE WHEN exists#12 THEN Hello ELSE Hi END AS strCol#13] +- Join ExistenceJoin(exists#12), (a1#0 = b1#3) +- LocalRelation , [a1#0, a2#1, a3#2] +- LocalRelation , [b1#3, b2#4, b3#5] ``` Note that `exists#12` doesn't appear neither in the grouping expressions, nor is part of any aggregate function. This is an invalid aggregation. In particular, aggregate pushdown rule rewrite this plan into: ``` Project [CASE WHEN exists#12 THEN Hello WHEN true THEN Hi END AS strCol#13] +- AggregatePart [a1#0], true +- AggregatePart [a1#0], false +- Join ExistenceJoin(exists#12), (a1#0 = b1#3) :- AggregatePart [a1#0], false : +- LocalRelation , [a1#0, a2#1, a3#2] +- AggregatePart [b1#3], false +- LocalRelation , [b1#3, b2#4, b3#5] ``` **Solution** We fix the problem by wrapping such `exists` attributes in `first()` function, which is Spark's executable of `any_value()`. Note that such `exists` is always functionally determined by grouping keys, and thus wrapping it in any aggregate function preserving its unique value is safe. Specifically, we only wrap `exists` attributes if they are referenced among aggregate expressions, but NOT within an aggregate function or its filter. Note that a new `exists` attribute cannot appear in groupingExpressions. **Original proposal (NOT used)** The decision is to fix the bug in the `RewritePredicateSubquery` by enforcing the condition that newly introduced variables, if referenced among agg expressions, must either participate in aggregate functions, or appear in the grouping keys. With the fix, the plan after `RewritePredicateSubquery` will look like: ``` Aggregate [a1#0, exists#12] [CASE WHEN exists#12 THEN Hello ELSE Hi END AS strCol#13] +- Join ExistenceJoin(exists#12), (a1#0 = b1#3) +- LocalRelation , [a1#0, a2#1, a3#2] +- LocalRelation , [b1#3, b2#4, b3#5] ``` **NOTE:** It is still possible to manually construct ExistenceJoin (e.g via dsl) and an Aggregate on top of it that violate the condition. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Query tests Closes #45133 from anton5798/subquery-exists-agg. Authored-by: Anton Lykov Signed-off-by: Wenchen Fan --- .../sql/catalyst/optimizer/subquery.scala | 65 ++++- .../exists-subquery/exists-aggregate.sql.out | 131 ++++++++++ .../subquery/in-subquery/in-group-by.sql.out | 237 ++++++++++++++++++ .../exists-subquery/exists-aggregate.sql | 39 ++- .../subquery/in-subquery/in-group-by.sql | 71 ++++++ .../exists-subquery/exists-aggregate.sql.out | 83 ++++++ .../subquery/in-subquery/in-group-by.sql.out | 117 +++++++++ 7 files changed, 736 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala index ee5b24f76ebfd..796c10b470120 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala @@ -248,24 +248,74 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper { case u: UnaryNode if u.expressions.exists( SubqueryExpression.hasInOrCorrelatedExistsSubquery) => var newChild = u.child - u.mapExpressions(expr => { - val (newExpr, p) = rewriteExistentialExpr(Seq(expr), newChild) + var introducedAttrs = Seq.empty[Attribute] + val updatedNode = u.mapExpressions(expr => { + val (newExpr, p, newAttrs) = rewriteExistentialExprWithAttrs(Seq(expr), newChild) newChild = p + introducedAttrs ++= newAttrs // The newExpr can not be None newExpr.get }).withNewChildren(Seq(newChild)) + updatedNode match { + case a: Aggregate => + // If we have introduced new `exists`-attributes that are referenced by + // aggregateExpressions within a non-aggregateFunction expression, we wrap them in + // first() aggregate function. first() is Spark's executable version of any_value() + // aggregate function. + // We do this to keep the aggregation valid, i.e avoid references outside of aggregate + // functions that are not in grouping expressions. + // Note that the same `exists` attr will never appear in groupingExpressions due to + // PullOutGroupingExpressions rule. + // Also note: the value of `exists` is functionally determined by grouping expressions, + // so applying any aggregate function is semantically safe. + val aggFunctionReferences = a.aggregateExpressions. + flatMap(extractAggregateExpressions). + flatMap(_.references).toSet + val nonAggFuncReferences = + a.aggregateExpressions.flatMap(_.references).filterNot(aggFunctionReferences.contains) + val toBeWrappedExistsAttrs = introducedAttrs.filter(nonAggFuncReferences.contains) + + // Replace all eligible `exists` by `First(exists)` among aggregateExpressions. + val newAggregateExpressions = a.aggregateExpressions.map { aggExpr => + aggExpr.transformUp { + case attr: Attribute if toBeWrappedExistsAttrs.contains(attr) => + new First(attr).toAggregateExpression() + }.asInstanceOf[NamedExpression] + } + a.copy(aggregateExpressions = newAggregateExpressions) + case _ => updatedNode + } + } + + /** + * Extract all aggregate expressions from the expression tree routed at `expr`. + */ + private def extractAggregateExpressions(expr: Expression): Seq[AggregateExpression] = { + expr match { + case a: AggregateExpression => Seq(a) + case e: Expression => e.children.flatMap(extractAggregateExpressions) + } } /** * Given a predicate expression and an input plan, it rewrites any embedded existential sub-query - * into an existential join. It returns the rewritten expression together with the updated plan. + * into an existential join. It returns the rewritten expression together with the updated plan, + * as well as the newly introduced attributes. * Currently, it does not support NOT IN nested inside a NOT expression. This case is blocked in * the Analyzer. */ private def rewriteExistentialExpr( - exprs: Seq[Expression], - plan: LogicalPlan): (Option[Expression], LogicalPlan) = { + exprs: Seq[Expression], + plan: LogicalPlan): (Option[Expression], LogicalPlan) = { + val (newExpr, newPlan, _) = rewriteExistentialExprWithAttrs(exprs, plan) + (newExpr, newPlan) + } + + private def rewriteExistentialExprWithAttrs( + exprs: Seq[Expression], + plan: LogicalPlan): (Option[Expression], LogicalPlan, Seq[Attribute]) = { var newPlan = plan + val introducedAttrs = ArrayBuffer.empty[Attribute] val newExprs = exprs.map { e => e.transformDownWithPruning(_.containsAnyPattern(EXISTS_SUBQUERY, IN_SUBQUERY)) { case Exists(sub, _, _, conditions, subHint) => @@ -275,6 +325,7 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper { newPlan = buildJoin(newPlan, rewriteDomainJoinsIfPresent(newPlan, sub, newCondition), existenceJoin, newCondition, subHint) + introducedAttrs += exists exists case Not(InSubquery(values, ListQuery(sub, _, _, _, conditions, subHint))) => val exists = AttributeReference("exists", BooleanType, nullable = false)() @@ -299,6 +350,7 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper { newPlan = Join(newPlan, rewriteDomainJoinsIfPresent(newPlan, newSub, Some(finalJoinCond)), ExistenceJoin(exists), Some(finalJoinCond), joinHint) + introducedAttrs += exists Not(exists) case InSubquery(values, ListQuery(sub, _, _, _, conditions, subHint)) => val exists = AttributeReference("exists", BooleanType, nullable = false)() @@ -309,10 +361,11 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper { val joinHint = JoinHint(None, subHint) newPlan = Join(newPlan, rewriteDomainJoinsIfPresent(newPlan, newSub, newConditions), ExistenceJoin(exists), newConditions, joinHint) + introducedAttrs += exists exists } } - (newExprs.reduceOption(And), newPlan) + (newExprs.reduceOption(And), newPlan, introducedAttrs.toSeq) } } diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-aggregate.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-aggregate.sql.out index 84caaf3c8175d..b4c9632462527 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-aggregate.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/exists-subquery/exists-aggregate.sql.out @@ -374,3 +374,134 @@ Project [emp_name#x] +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] +- SubqueryAlias EMP +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] + + +-- !query +SELECT + emp.dept_id, + EXISTS (SELECT dept.dept_id FROM dept) +FROM emp +GROUP BY emp.dept_id ORDER BY emp.dept_id +-- !query analysis +Sort [dept_id#x ASC NULLS FIRST], true ++- Aggregate [dept_id#x], [dept_id#x, exists#x [] AS exists()#x] + : +- Project [dept_id#x] + : +- SubqueryAlias dept + : +- View (`DEPT`, [dept_id#x, dept_name#x, state#x]) + : +- Project [cast(dept_id#x as int) AS dept_id#x, cast(dept_name#x as string) AS dept_name#x, cast(state#x as string) AS state#x] + : +- Project [dept_id#x, dept_name#x, state#x] + : +- SubqueryAlias DEPT + : +- LocalRelation [dept_id#x, dept_name#x, state#x] + +- SubqueryAlias emp + +- View (`EMP`, [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x]) + +- Project [cast(id#x as int) AS id#x, cast(emp_name#x as string) AS emp_name#x, cast(hiredate#x as date) AS hiredate#x, cast(salary#x as double) AS salary#x, cast(dept_id#x as int) AS dept_id#x] + +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] + +- SubqueryAlias EMP + +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] + + +-- !query +SELECT + emp.dept_id, + EXISTS (SELECT dept.dept_id FROM dept) +FROM emp +GROUP BY emp.dept_id ORDER BY emp.dept_id +-- !query analysis +Sort [dept_id#x ASC NULLS FIRST], true ++- Aggregate [dept_id#x], [dept_id#x, exists#x [] AS exists()#x] + : +- Project [dept_id#x] + : +- SubqueryAlias dept + : +- View (`DEPT`, [dept_id#x, dept_name#x, state#x]) + : +- Project [cast(dept_id#x as int) AS dept_id#x, cast(dept_name#x as string) AS dept_name#x, cast(state#x as string) AS state#x] + : +- Project [dept_id#x, dept_name#x, state#x] + : +- SubqueryAlias DEPT + : +- LocalRelation [dept_id#x, dept_name#x, state#x] + +- SubqueryAlias emp + +- View (`EMP`, [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x]) + +- Project [cast(id#x as int) AS id#x, cast(emp_name#x as string) AS emp_name#x, cast(hiredate#x as date) AS hiredate#x, cast(salary#x as double) AS salary#x, cast(dept_id#x as int) AS dept_id#x] + +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] + +- SubqueryAlias EMP + +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] + + +-- !query +SELECT + emp.dept_id, + NOT EXISTS (SELECT dept.dept_id FROM dept) +FROM emp +GROUP BY emp.dept_id ORDER BY emp.dept_id +-- !query analysis +Sort [dept_id#x ASC NULLS FIRST], true ++- Aggregate [dept_id#x], [dept_id#x, NOT exists#x [] AS (NOT exists())#x] + : +- Project [dept_id#x] + : +- SubqueryAlias dept + : +- View (`DEPT`, [dept_id#x, dept_name#x, state#x]) + : +- Project [cast(dept_id#x as int) AS dept_id#x, cast(dept_name#x as string) AS dept_name#x, cast(state#x as string) AS state#x] + : +- Project [dept_id#x, dept_name#x, state#x] + : +- SubqueryAlias DEPT + : +- LocalRelation [dept_id#x, dept_name#x, state#x] + +- SubqueryAlias emp + +- View (`EMP`, [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x]) + +- Project [cast(id#x as int) AS id#x, cast(emp_name#x as string) AS emp_name#x, cast(hiredate#x as date) AS hiredate#x, cast(salary#x as double) AS salary#x, cast(dept_id#x as int) AS dept_id#x] + +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] + +- SubqueryAlias EMP + +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] + + +-- !query +SELECT + emp.dept_id, + SUM( + CASE WHEN EXISTS (SELECT dept.dept_id FROM dept WHERE dept.dept_id = emp.dept_id) THEN 1 + ELSE 0 END) +FROM emp +GROUP BY emp.dept_id ORDER BY emp.dept_id +-- !query analysis +Sort [dept_id#x ASC NULLS FIRST], true ++- Aggregate [dept_id#x], [dept_id#x, sum(CASE WHEN exists#x [dept_id#x] THEN 1 ELSE 0 END) AS sum(CASE WHEN exists(dept_id) THEN 1 ELSE 0 END)#xL] + : +- Project [dept_id#x] + : +- Filter (dept_id#x = outer(dept_id#x)) + : +- SubqueryAlias dept + : +- View (`DEPT`, [dept_id#x, dept_name#x, state#x]) + : +- Project [cast(dept_id#x as int) AS dept_id#x, cast(dept_name#x as string) AS dept_name#x, cast(state#x as string) AS state#x] + : +- Project [dept_id#x, dept_name#x, state#x] + : +- SubqueryAlias DEPT + : +- LocalRelation [dept_id#x, dept_name#x, state#x] + +- SubqueryAlias emp + +- View (`EMP`, [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x]) + +- Project [cast(id#x as int) AS id#x, cast(emp_name#x as string) AS emp_name#x, cast(hiredate#x as date) AS hiredate#x, cast(salary#x as double) AS salary#x, cast(dept_id#x as int) AS dept_id#x] + +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] + +- SubqueryAlias EMP + +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] + + +-- !query +SELECT + cast(EXISTS (SELECT id FROM dept where dept.dept_id = emp.dept_id) AS int) +FROM emp +GROUP BY + cast(EXISTS (SELECT id FROM dept where dept.dept_id = emp.dept_id) AS int) +-- !query analysis +Aggregate [cast(exists#x [id#x && dept_id#x] as int)], [cast(exists#x [id#x && dept_id#x] as int) AS CAST(exists(id, dept_id) AS INT)#x] +: :- Project [outer(id#x)] +: : +- Filter (dept_id#x = outer(dept_id#x)) +: : +- SubqueryAlias dept +: : +- View (`DEPT`, [dept_id#x, dept_name#x, state#x]) +: : +- Project [cast(dept_id#x as int) AS dept_id#x, cast(dept_name#x as string) AS dept_name#x, cast(state#x as string) AS state#x] +: : +- Project [dept_id#x, dept_name#x, state#x] +: : +- SubqueryAlias DEPT +: : +- LocalRelation [dept_id#x, dept_name#x, state#x] +: +- Project [outer(id#x)] +: +- Filter (dept_id#x = outer(dept_id#x)) +: +- SubqueryAlias dept +: +- View (`DEPT`, [dept_id#x, dept_name#x, state#x]) +: +- Project [cast(dept_id#x as int) AS dept_id#x, cast(dept_name#x as string) AS dept_name#x, cast(state#x as string) AS state#x] +: +- Project [dept_id#x, dept_name#x, state#x] +: +- SubqueryAlias DEPT +: +- LocalRelation [dept_id#x, dept_name#x, state#x] ++- SubqueryAlias emp + +- View (`EMP`, [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x]) + +- Project [cast(id#x as int) AS id#x, cast(emp_name#x as string) AS emp_name#x, cast(hiredate#x as date) AS hiredate#x, cast(salary#x as double) AS salary#x, cast(dept_id#x as int) AS dept_id#x] + +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] + +- SubqueryAlias EMP + +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-group-by.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-group-by.sql.out index 02d22d8b081eb..2c5e9eb6733d9 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/in-subquery/in-group-by.sql.out @@ -674,3 +674,240 @@ Project [t1a#x] +- Project [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] +- SubqueryAlias t1 +- LocalRelation [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] + + +-- !query +SELECT + t1.t1a, + t1.t1a IN (SELECT t2a FROM t2) as v1 +FROM t1 +GROUP BY t1.t1a ORDER BY t1.t1a +-- !query analysis +Sort [t1a#x ASC NULLS FIRST], true ++- Aggregate [t1a#x], [t1a#x, t1a#x IN (list#x []) AS v1#x] + : +- Project [t2a#x] + : +- SubqueryAlias t2 + : +- View (`t2`, [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x]) + : +- Project [cast(t2a#x as string) AS t2a#x, cast(t2b#x as smallint) AS t2b#x, cast(t2c#x as int) AS t2c#x, cast(t2d#xL as bigint) AS t2d#xL, cast(t2e#x as float) AS t2e#x, cast(t2f#x as double) AS t2f#x, cast(t2g#x as decimal(4,0)) AS t2g#x, cast(t2h#x as timestamp) AS t2h#x, cast(t2i#x as date) AS t2i#x] + : +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] + : +- SubqueryAlias t2 + : +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] + +- SubqueryAlias t1 + +- View (`t1`, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x]) + +- Project [cast(t1a#x as string) AS t1a#x, cast(t1b#x as smallint) AS t1b#x, cast(t1c#x as int) AS t1c#x, cast(t1d#xL as bigint) AS t1d#xL, cast(t1e#x as float) AS t1e#x, cast(t1f#x as double) AS t1f#x, cast(t1g#x as decimal(4,0)) AS t1g#x, cast(t1h#x as timestamp) AS t1h#x, cast(t1i#x as date) AS t1i#x] + +- Project [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] + +- SubqueryAlias t1 + +- LocalRelation [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] + + +-- !query +SELECT + count(cast(t1.t1a IN (SELECT t2a FROM t2) as INT)), + sum(cast(t1.t1b NOT IN (SELECT t2b FROM t2) as INT)) +FROM t1 +-- !query analysis +Aggregate [count(cast(t1a#x IN (list#x []) as int)) AS count(CAST((t1a IN (listquery())) AS INT))#xL, sum(cast(NOT t1b#x IN (list#x []) as int)) AS sum(CAST((NOT (t1b IN (listquery()))) AS INT))#xL] +: :- Project [t2a#x] +: : +- SubqueryAlias t2 +: : +- View (`t2`, [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x]) +: : +- Project [cast(t2a#x as string) AS t2a#x, cast(t2b#x as smallint) AS t2b#x, cast(t2c#x as int) AS t2c#x, cast(t2d#xL as bigint) AS t2d#xL, cast(t2e#x as float) AS t2e#x, cast(t2f#x as double) AS t2f#x, cast(t2g#x as decimal(4,0)) AS t2g#x, cast(t2h#x as timestamp) AS t2h#x, cast(t2i#x as date) AS t2i#x] +: : +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +: : +- SubqueryAlias t2 +: : +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +: +- Project [t2b#x] +: +- SubqueryAlias t2 +: +- View (`t2`, [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x]) +: +- Project [cast(t2a#x as string) AS t2a#x, cast(t2b#x as smallint) AS t2b#x, cast(t2c#x as int) AS t2c#x, cast(t2d#xL as bigint) AS t2d#xL, cast(t2e#x as float) AS t2e#x, cast(t2f#x as double) AS t2f#x, cast(t2g#x as decimal(4,0)) AS t2g#x, cast(t2h#x as timestamp) AS t2h#x, cast(t2i#x as date) AS t2i#x] +: +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +: +- SubqueryAlias t2 +: +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] ++- SubqueryAlias t1 + +- View (`t1`, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x]) + +- Project [cast(t1a#x as string) AS t1a#x, cast(t1b#x as smallint) AS t1b#x, cast(t1c#x as int) AS t1c#x, cast(t1d#xL as bigint) AS t1d#xL, cast(t1e#x as float) AS t1e#x, cast(t1f#x as double) AS t1f#x, cast(t1g#x as decimal(4,0)) AS t1g#x, cast(t1h#x as timestamp) AS t1h#x, cast(t1i#x as date) AS t1i#x] + +- Project [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] + +- SubqueryAlias t1 + +- LocalRelation [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] + + +-- !query +SELECT + agg_results.t1a, + COUNT(*) + FROM (SELECT t1.t1a FROM t1 WHERE t1.t1a IN (SELECT t2a FROM t2)) AS agg_results +GROUP BY agg_results.t1a ORDER BY agg_results.t1a +-- !query analysis +Sort [t1a#x ASC NULLS FIRST], true ++- Aggregate [t1a#x], [t1a#x, count(1) AS count(1)#xL] + +- SubqueryAlias agg_results + +- Project [t1a#x] + +- Filter t1a#x IN (list#x []) + : +- Project [t2a#x] + : +- SubqueryAlias t2 + : +- View (`t2`, [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x]) + : +- Project [cast(t2a#x as string) AS t2a#x, cast(t2b#x as smallint) AS t2b#x, cast(t2c#x as int) AS t2c#x, cast(t2d#xL as bigint) AS t2d#xL, cast(t2e#x as float) AS t2e#x, cast(t2f#x as double) AS t2f#x, cast(t2g#x as decimal(4,0)) AS t2g#x, cast(t2h#x as timestamp) AS t2h#x, cast(t2i#x as date) AS t2i#x] + : +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] + : +- SubqueryAlias t2 + : +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] + +- SubqueryAlias t1 + +- View (`t1`, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x]) + +- Project [cast(t1a#x as string) AS t1a#x, cast(t1b#x as smallint) AS t1b#x, cast(t1c#x as int) AS t1c#x, cast(t1d#xL as bigint) AS t1d#xL, cast(t1e#x as float) AS t1e#x, cast(t1f#x as double) AS t1f#x, cast(t1g#x as decimal(4,0)) AS t1g#x, cast(t1h#x as timestamp) AS t1h#x, cast(t1i#x as date) AS t1i#x] + +- Project [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] + +- SubqueryAlias t1 + +- LocalRelation [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] + + +-- !query +SELECT + t1.t1a, + CASE + WHEN t1.t1a IN (SELECT t2a FROM t2) THEN 10 + ELSE -10 + END AS v1 +FROM t1 +GROUP BY t1.t1a +ORDER BY t1.t1a +-- !query analysis +Sort [t1a#x ASC NULLS FIRST], true ++- Aggregate [t1a#x], [t1a#x, CASE WHEN t1a#x IN (list#x []) THEN 10 ELSE -10 END AS v1#x] + : +- Project [t2a#x] + : +- SubqueryAlias t2 + : +- View (`t2`, [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x]) + : +- Project [cast(t2a#x as string) AS t2a#x, cast(t2b#x as smallint) AS t2b#x, cast(t2c#x as int) AS t2c#x, cast(t2d#xL as bigint) AS t2d#xL, cast(t2e#x as float) AS t2e#x, cast(t2f#x as double) AS t2f#x, cast(t2g#x as decimal(4,0)) AS t2g#x, cast(t2h#x as timestamp) AS t2h#x, cast(t2i#x as date) AS t2i#x] + : +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] + : +- SubqueryAlias t2 + : +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] + +- SubqueryAlias t1 + +- View (`t1`, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x]) + +- Project [cast(t1a#x as string) AS t1a#x, cast(t1b#x as smallint) AS t1b#x, cast(t1c#x as int) AS t1c#x, cast(t1d#xL as bigint) AS t1d#xL, cast(t1e#x as float) AS t1e#x, cast(t1f#x as double) AS t1f#x, cast(t1g#x as decimal(4,0)) AS t1g#x, cast(t1h#x as timestamp) AS t1h#x, cast(t1i#x as date) AS t1i#x] + +- Project [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] + +- SubqueryAlias t1 + +- LocalRelation [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] + + +-- !query +SELECT + t1.t1c, + SUM(CASE + WHEN t1.t1c IN (SELECT t2c FROM t2) THEN 10 + ELSE -10 + END) AS v1, + SUM(CASE + WHEN t1.t1d IN (SELECT t2c FROM t2) THEN 10 + ELSE -10 + END) AS v2, + t1.t1c + 10 IN (SELECT t2c + 2 FROM t2) AS v3, + count(t1.t1c) as ct, + count(t1.t1d) +FROM t1 +GROUP BY t1.t1c +ORDER BY t1.t1c +-- !query analysis +Sort [t1c#x ASC NULLS FIRST], true ++- Aggregate [t1c#x], [t1c#x, sum(CASE WHEN t1c#x IN (list#x []) THEN 10 ELSE -10 END) AS v1#xL, sum(CASE WHEN t1d#xL IN (list#x []) THEN 10 ELSE -10 END) AS v2#xL, (t1c#x + 10) IN (list#x []) AS v3#x, count(t1c#x) AS ct#xL, count(t1d#xL) AS count(t1d)#xL] + : :- Project [t2c#x] + : : +- SubqueryAlias t2 + : : +- View (`t2`, [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x]) + : : +- Project [cast(t2a#x as string) AS t2a#x, cast(t2b#x as smallint) AS t2b#x, cast(t2c#x as int) AS t2c#x, cast(t2d#xL as bigint) AS t2d#xL, cast(t2e#x as float) AS t2e#x, cast(t2f#x as double) AS t2f#x, cast(t2g#x as decimal(4,0)) AS t2g#x, cast(t2h#x as timestamp) AS t2h#x, cast(t2i#x as date) AS t2i#x] + : : +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] + : : +- SubqueryAlias t2 + : : +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] + : :- Project [cast(t2c#x as bigint) AS t2c#xL] + : : +- Project [t2c#x] + : : +- SubqueryAlias t2 + : : +- View (`t2`, [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x]) + : : +- Project [cast(t2a#x as string) AS t2a#x, cast(t2b#x as smallint) AS t2b#x, cast(t2c#x as int) AS t2c#x, cast(t2d#xL as bigint) AS t2d#xL, cast(t2e#x as float) AS t2e#x, cast(t2f#x as double) AS t2f#x, cast(t2g#x as decimal(4,0)) AS t2g#x, cast(t2h#x as timestamp) AS t2h#x, cast(t2i#x as date) AS t2i#x] + : : +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] + : : +- SubqueryAlias t2 + : : +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] + : +- Project [(t2c#x + 2) AS (t2c + 2)#x] + : +- SubqueryAlias t2 + : +- View (`t2`, [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x]) + : +- Project [cast(t2a#x as string) AS t2a#x, cast(t2b#x as smallint) AS t2b#x, cast(t2c#x as int) AS t2c#x, cast(t2d#xL as bigint) AS t2d#xL, cast(t2e#x as float) AS t2e#x, cast(t2f#x as double) AS t2f#x, cast(t2g#x as decimal(4,0)) AS t2g#x, cast(t2h#x as timestamp) AS t2h#x, cast(t2i#x as date) AS t2i#x] + : +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] + : +- SubqueryAlias t2 + : +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] + +- SubqueryAlias t1 + +- View (`t1`, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x]) + +- Project [cast(t1a#x as string) AS t1a#x, cast(t1b#x as smallint) AS t1b#x, cast(t1c#x as int) AS t1c#x, cast(t1d#xL as bigint) AS t1d#xL, cast(t1e#x as float) AS t1e#x, cast(t1f#x as double) AS t1f#x, cast(t1g#x as decimal(4,0)) AS t1g#x, cast(t1h#x as timestamp) AS t1h#x, cast(t1i#x as date) AS t1i#x] + +- Project [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] + +- SubqueryAlias t1 + +- LocalRelation [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] + + +-- !query +SELECT + SUM(CASE + WHEN t1.t1c IN (SELECT t2c FROM t2) THEN 10 + ELSE -10 + END) AS v1, + count(t1.t1c) as ct +FROM t1 +-- !query analysis +Aggregate [sum(CASE WHEN t1c#x IN (list#x []) THEN 10 ELSE -10 END) AS v1#xL, count(t1c#x) AS ct#xL] +: +- Project [t2c#x] +: +- SubqueryAlias t2 +: +- View (`t2`, [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x]) +: +- Project [cast(t2a#x as string) AS t2a#x, cast(t2b#x as smallint) AS t2b#x, cast(t2c#x as int) AS t2c#x, cast(t2d#xL as bigint) AS t2d#xL, cast(t2e#x as float) AS t2e#x, cast(t2f#x as double) AS t2f#x, cast(t2g#x as decimal(4,0)) AS t2g#x, cast(t2h#x as timestamp) AS t2h#x, cast(t2i#x as date) AS t2i#x] +: +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +: +- SubqueryAlias t2 +: +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] ++- SubqueryAlias t1 + +- View (`t1`, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x]) + +- Project [cast(t1a#x as string) AS t1a#x, cast(t1b#x as smallint) AS t1b#x, cast(t1c#x as int) AS t1c#x, cast(t1d#xL as bigint) AS t1d#xL, cast(t1e#x as float) AS t1e#x, cast(t1f#x as double) AS t1f#x, cast(t1g#x as decimal(4,0)) AS t1g#x, cast(t1h#x as timestamp) AS t1h#x, cast(t1i#x as date) AS t1i#x] + +- Project [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] + +- SubqueryAlias t1 + +- LocalRelation [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] + + +-- !query +SELECT + cast(t1a in (select t2a from t2) as int) + 1 as groupExpr, + sum(cast(t1a in (select t2a from t2) as int) + 1) as aggExpr, + cast(t1a in (select t2a from t2) as int) + 1 + cast(exists (select t2a from t2) as int) + as complexExpr +FROM t1 +GROUP BY + cast(t1a in (select t2a from t2) as int) + 1 +-- !query analysis +Aggregate [(cast(t1a#x IN (list#x []) as int) + 1)], [(cast(t1a#x IN (list#x []) as int) + 1) AS groupExpr#x, sum((cast(t1a#x IN (list#x []) as int) + 1)) AS aggExpr#xL, ((cast(t1a#x IN (list#x []) as int) + 1) + cast(exists#x [] as int)) AS complexExpr#x] +: :- Project [t2a#x] +: : +- SubqueryAlias t2 +: : +- View (`t2`, [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x]) +: : +- Project [cast(t2a#x as string) AS t2a#x, cast(t2b#x as smallint) AS t2b#x, cast(t2c#x as int) AS t2c#x, cast(t2d#xL as bigint) AS t2d#xL, cast(t2e#x as float) AS t2e#x, cast(t2f#x as double) AS t2f#x, cast(t2g#x as decimal(4,0)) AS t2g#x, cast(t2h#x as timestamp) AS t2h#x, cast(t2i#x as date) AS t2i#x] +: : +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +: : +- SubqueryAlias t2 +: : +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +: :- Project [t2a#x] +: : +- SubqueryAlias t2 +: : +- View (`t2`, [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x]) +: : +- Project [cast(t2a#x as string) AS t2a#x, cast(t2b#x as smallint) AS t2b#x, cast(t2c#x as int) AS t2c#x, cast(t2d#xL as bigint) AS t2d#xL, cast(t2e#x as float) AS t2e#x, cast(t2f#x as double) AS t2f#x, cast(t2g#x as decimal(4,0)) AS t2g#x, cast(t2h#x as timestamp) AS t2h#x, cast(t2i#x as date) AS t2i#x] +: : +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +: : +- SubqueryAlias t2 +: : +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +: :- Project [t2a#x] +: : +- SubqueryAlias t2 +: : +- View (`t2`, [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x]) +: : +- Project [cast(t2a#x as string) AS t2a#x, cast(t2b#x as smallint) AS t2b#x, cast(t2c#x as int) AS t2c#x, cast(t2d#xL as bigint) AS t2d#xL, cast(t2e#x as float) AS t2e#x, cast(t2f#x as double) AS t2f#x, cast(t2g#x as decimal(4,0)) AS t2g#x, cast(t2h#x as timestamp) AS t2h#x, cast(t2i#x as date) AS t2i#x] +: : +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +: : +- SubqueryAlias t2 +: : +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +: :- Project [t2a#x] +: : +- SubqueryAlias t2 +: : +- View (`t2`, [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x]) +: : +- Project [cast(t2a#x as string) AS t2a#x, cast(t2b#x as smallint) AS t2b#x, cast(t2c#x as int) AS t2c#x, cast(t2d#xL as bigint) AS t2d#xL, cast(t2e#x as float) AS t2e#x, cast(t2f#x as double) AS t2f#x, cast(t2g#x as decimal(4,0)) AS t2g#x, cast(t2h#x as timestamp) AS t2h#x, cast(t2i#x as date) AS t2i#x] +: : +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +: : +- SubqueryAlias t2 +: : +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +: +- Project [t2a#x] +: +- SubqueryAlias t2 +: +- View (`t2`, [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x]) +: +- Project [cast(t2a#x as string) AS t2a#x, cast(t2b#x as smallint) AS t2b#x, cast(t2c#x as int) AS t2c#x, cast(t2d#xL as bigint) AS t2d#xL, cast(t2e#x as float) AS t2e#x, cast(t2f#x as double) AS t2f#x, cast(t2g#x as decimal(4,0)) AS t2g#x, cast(t2h#x as timestamp) AS t2h#x, cast(t2i#x as date) AS t2i#x] +: +- Project [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] +: +- SubqueryAlias t2 +: +- LocalRelation [t2a#x, t2b#x, t2c#x, t2d#xL, t2e#x, t2f#x, t2g#x, t2h#x, t2i#x] ++- SubqueryAlias t1 + +- View (`t1`, [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x]) + +- Project [cast(t1a#x as string) AS t1a#x, cast(t1b#x as smallint) AS t1b#x, cast(t1c#x as int) AS t1c#x, cast(t1d#xL as bigint) AS t1d#xL, cast(t1e#x as float) AS t1e#x, cast(t1f#x as double) AS t1f#x, cast(t1g#x as decimal(4,0)) AS t1g#x, cast(t1h#x as timestamp) AS t1h#x, cast(t1i#x as date) AS t1i#x] + +- Project [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] + +- SubqueryAlias t1 + +- LocalRelation [t1a#x, t1b#x, t1c#x, t1d#xL, t1e#x, t1f#x, t1g#x, t1h#x, t1i#x] diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/exists-subquery/exists-aggregate.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/exists-subquery/exists-aggregate.sql index 9797acc5561bb..9dc4ed30fa00a 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/exists-subquery/exists-aggregate.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/exists-subquery/exists-aggregate.sql @@ -134,4 +134,41 @@ WHERE EXISTS ( select max(tt2.id) from EMP as tt2 where tt1.emp_name is null -); \ No newline at end of file +); + +-- Plain exists subquery with a top-level aggregation +SELECT + emp.dept_id, + EXISTS (SELECT dept.dept_id FROM dept) +FROM emp +GROUP BY emp.dept_id ORDER BY emp.dept_id; + +-- Correlated exists subquery with a top-level aggregation +SELECT + emp.dept_id, + EXISTS (SELECT dept.dept_id FROM dept) +FROM emp +GROUP BY emp.dept_id ORDER BY emp.dept_id; + +-- Correlated exists subquery with a top-level aggregation +SELECT + emp.dept_id, + NOT EXISTS (SELECT dept.dept_id FROM dept) +FROM emp +GROUP BY emp.dept_id ORDER BY emp.dept_id; + +-- Correlated exists subquery with a top-level aggregation +SELECT + emp.dept_id, + SUM( + CASE WHEN EXISTS (SELECT dept.dept_id FROM dept WHERE dept.dept_id = emp.dept_id) THEN 1 + ELSE 0 END) +FROM emp +GROUP BY emp.dept_id ORDER BY emp.dept_id; + +---- Grouping expression contains a subquery +SELECT + cast(EXISTS (SELECT id FROM dept where dept.dept_id = emp.dept_id) AS int) +FROM emp +GROUP BY + cast(EXISTS (SELECT id FROM dept where dept.dept_id = emp.dept_id) AS int) diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-group-by.sql index 92f76939f2aa2..3161ac148e221 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-group-by.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-group-by.sql @@ -246,3 +246,74 @@ select t1a from t1 where t1f IN (SELECT RANK() OVER (partition by t3c order by t2b) as s FROM t2, t3 where t2.t2c = t3.t3c and t2.t2a < t1.t1a); + +-- Plain in-subquery with a top-level aggregation +SELECT + t1.t1a, + t1.t1a IN (SELECT t2a FROM t2) as v1 +FROM t1 +GROUP BY t1.t1a ORDER BY t1.t1a; + +-- Aggregate function over expression with subquery, without explicit GROUP BY, with NOT IN +SELECT + count(cast(t1.t1a IN (SELECT t2a FROM t2) as INT)), + sum(cast(t1.t1b NOT IN (SELECT t2b FROM t2) as INT)) +FROM t1; + +-- Derived table from subquery +SELECT + agg_results.t1a, + COUNT(*) + FROM (SELECT t1.t1a FROM t1 WHERE t1.t1a IN (SELECT t2a FROM t2)) AS agg_results +GROUP BY agg_results.t1a ORDER BY agg_results.t1a; + +-- CASE statement with an in-subquery and aggregation +SELECT + t1.t1a, + CASE + WHEN t1.t1a IN (SELECT t2a FROM t2) THEN 10 + ELSE -10 + END AS v1 +FROM t1 +GROUP BY t1.t1a +ORDER BY t1.t1a; + +-- CASE statement with an in-subquery inside an agg function +SELECT + t1.t1c, + -- sums over t1.t1c + SUM(CASE + WHEN t1.t1c IN (SELECT t2c FROM t2) THEN 10 + ELSE -10 + END) AS v1, + -- sums over t1.t1d + SUM(CASE + WHEN t1.t1d IN (SELECT t2c FROM t2) THEN 10 + ELSE -10 + END) AS v2, + -- no agg function, uses t1.t1c + t1.t1c + 10 IN (SELECT t2c + 2 FROM t2) AS v3, + count(t1.t1c) as ct, + count(t1.t1d) +FROM t1 +GROUP BY t1.t1c +ORDER BY t1.t1c; + +-- CASE statement with an in-subquery inside an agg function, without group-by +SELECT + SUM(CASE + WHEN t1.t1c IN (SELECT t2c FROM t2) THEN 10 + ELSE -10 + END) AS v1, + count(t1.t1c) as ct +FROM t1; + +-- Group-by statement contains an in-subquery, and there's an additional exists in select clause. +SELECT + cast(t1a in (select t2a from t2) as int) + 1 as groupExpr, + sum(cast(t1a in (select t2a from t2) as int) + 1) as aggExpr, + cast(t1a in (select t2a from t2) as int) + 1 + cast(exists (select t2a from t2) as int) + as complexExpr +FROM t1 +GROUP BY + cast(t1a in (select t2a from t2) as int) + 1; diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-aggregate.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-aggregate.sql.out index af907b67df2f9..ddf90c1127cfd 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-aggregate.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-aggregate.sql.out @@ -219,3 +219,86 @@ emp 5 emp 6 - no dept emp 7 emp 8 + + +-- !query +SELECT + emp.dept_id, + EXISTS (SELECT dept.dept_id FROM dept) +FROM emp +GROUP BY emp.dept_id ORDER BY emp.dept_id +-- !query schema +struct +-- !query output +NULL true +10 true +20 true +30 true +70 true +100 true + + +-- !query +SELECT + emp.dept_id, + EXISTS (SELECT dept.dept_id FROM dept) +FROM emp +GROUP BY emp.dept_id ORDER BY emp.dept_id +-- !query schema +struct +-- !query output +NULL true +10 true +20 true +30 true +70 true +100 true + + +-- !query +SELECT + emp.dept_id, + NOT EXISTS (SELECT dept.dept_id FROM dept) +FROM emp +GROUP BY emp.dept_id ORDER BY emp.dept_id +-- !query schema +struct +-- !query output +NULL false +10 false +20 false +30 false +70 false +100 false + + +-- !query +SELECT + emp.dept_id, + SUM( + CASE WHEN EXISTS (SELECT dept.dept_id FROM dept WHERE dept.dept_id = emp.dept_id) THEN 1 + ELSE 0 END) +FROM emp +GROUP BY emp.dept_id ORDER BY emp.dept_id +-- !query schema +struct +-- !query output +NULL 0 +10 3 +20 1 +30 1 +70 1 +100 0 + + +-- !query +SELECT + cast(EXISTS (SELECT id FROM dept where dept.dept_id = emp.dept_id) AS int) +FROM emp +GROUP BY + cast(EXISTS (SELECT id FROM dept where dept.dept_id = emp.dept_id) AS int) +-- !query schema +struct +-- !query output +0 +1 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-group-by.sql.out index e056efa7d5609..42abf286d45dd 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-group-by.sql.out @@ -368,3 +368,120 @@ t1d t1e t1e t1e + + +-- !query +SELECT + t1.t1a, + t1.t1a IN (SELECT t2a FROM t2) as v1 +FROM t1 +GROUP BY t1.t1a ORDER BY t1.t1a +-- !query schema +struct +-- !query output +t1a false +t1b true +t1c true +t1d false +t1e true + + +-- !query +SELECT + count(cast(t1.t1a IN (SELECT t2a FROM t2) as INT)), + sum(cast(t1.t1b NOT IN (SELECT t2b FROM t2) as INT)) +FROM t1 +-- !query schema +struct +-- !query output +12 0 + + +-- !query +SELECT + agg_results.t1a, + COUNT(*) + FROM (SELECT t1.t1a FROM t1 WHERE t1.t1a IN (SELECT t2a FROM t2)) AS agg_results +GROUP BY agg_results.t1a ORDER BY agg_results.t1a +-- !query schema +struct +-- !query output +t1b 1 +t1c 1 +t1e 3 + + +-- !query +SELECT + t1.t1a, + CASE + WHEN t1.t1a IN (SELECT t2a FROM t2) THEN 10 + ELSE -10 + END AS v1 +FROM t1 +GROUP BY t1.t1a +ORDER BY t1.t1a +-- !query schema +struct +-- !query output +t1a -10 +t1b 10 +t1c 10 +t1d -10 +t1e 10 + + +-- !query +SELECT + t1.t1c, + SUM(CASE + WHEN t1.t1c IN (SELECT t2c FROM t2) THEN 10 + ELSE -10 + END) AS v1, + SUM(CASE + WHEN t1.t1d IN (SELECT t2c FROM t2) THEN 10 + ELSE -10 + END) AS v2, + t1.t1c + 10 IN (SELECT t2c + 2 FROM t2) AS v3, + count(t1.t1c) as ct, + count(t1.t1d) +FROM t1 +GROUP BY t1.t1c +ORDER BY t1.t1c +-- !query schema +struct +-- !query output +NULL -40 -20 false 0 4 +8 -20 -20 true 2 2 +12 20 -20 false 2 2 +16 40 -40 false 4 4 + + +-- !query +SELECT + SUM(CASE + WHEN t1.t1c IN (SELECT t2c FROM t2) THEN 10 + ELSE -10 + END) AS v1, + count(t1.t1c) as ct +FROM t1 +-- !query schema +struct +-- !query output +0 8 + + +-- !query +SELECT + cast(t1a in (select t2a from t2) as int) + 1 as groupExpr, + sum(cast(t1a in (select t2a from t2) as int) + 1) as aggExpr, + cast(t1a in (select t2a from t2) as int) + 1 + cast(exists (select t2a from t2) as int) + as complexExpr +FROM t1 +GROUP BY + cast(t1a in (select t2a from t2) as int) + 1 +-- !query schema +struct +-- !query output +1 7 2 +2 10 3