Skip to content

Commit

Permalink
refactor(ir): simplify expressions by not storing dtype and name
Browse files Browse the repository at this point in the history
Expression classes should only provide user facing API and the
underlying operations should hold all data. This will enable a simpler
operation hierarchy (without intermediate expressions) wrapped with a
single user facing expression.

BREAKING CHANGE: The following are breaking changes due to simplifying expression internals
  - `ibis.expr.datatypes.DataType.scalar_type` and `DataType.column_type` factory
    methods have been removed, `DataType.scalar` and `DataType.column` class
    fields can be used to directly construct a corresponding expression instance
    (though prefer to use `operation.to_expr()`)
  - `ibis.expr.types.ValueExpr._name` and `ValueExpr._dtype` fields are not
    accessible anymore. While these were not supposed to be used directly, now
    `ValueExpr.has_name()`, `ValueExpr.get_name()` and `ValueExpr.type()` methods
    are the only way to retrieve the expression's name and datatype.
  - `ibis.expr.operations.Node.output_type` is a property now not a method,
    decorate those methods with `@property`
  - `ibis.expr.operations.ValueOp` subclasses must define `output_shape` and
    `output_dtype` properties from now on (note the datatype abbreviation `dtype`
    in the property name)
  - `ibis.expr.rules.cast()`, `scalar_like()` and `array_like()` rules have been
    removed
  • Loading branch information
kszucs authored and cpcloud committed Apr 15, 2022
1 parent 54cc3c3 commit e929f85
Show file tree
Hide file tree
Showing 68 changed files with 1,081 additions and 837 deletions.
11 changes: 8 additions & 3 deletions docs/user_guide/design.md
Expand Up @@ -77,9 +77,14 @@ import ibis.expr.rules as rlz
from ibis.expr.operations import ValueOp

class Log(ValueOp):
arg = rlz.double # A double scalar or column
base = rlz.optional(rlz.double) # Optional argument, defaults to None
output_type = rlz.typeof('arg')
# A double scalar or column
arg = rlz.double
# Optional argument, defaults to None
base = rlz.optional(rlz.double)
# Output expression's datatype will correspond to arg's datatype
output_dtype = rlz.dtype_like('arg')
# Output expression will be scalar if arg is scalar, column otherwise
output_shape = rlz.shape_like('arg')
```

This class describes an operation called `Log` that takes one required
Expand Down
12 changes: 11 additions & 1 deletion ibis/backends/base/sql/alchemy/registry.py
Expand Up @@ -207,6 +207,13 @@ def _group_concat(t, expr):
return sa.func.group_concat(arg, sep)


def _alias(t, expr):
# just compile the underlying argument because the naming is handled
# by the translator for the top level expression
op = expr.op()
return t.translate(op.arg)


def _literal(t, expr):
dtype = expr.type()
value = expr.op().value
Expand Down Expand Up @@ -327,7 +334,9 @@ def _cumulative_to_window(translator, expr, window):

klass = _cumulative_to_reduction[type(op)]
new_op = klass(*op.args)
new_expr = expr._factory(new_op, name=expr._name)
new_expr = new_op.to_expr()
if expr.has_name():
new_expr = new_expr.name(expr.get_name())

if type(new_op) in translator._rewrites:
new_expr = translator._rewrites[type(new_op)](new_expr)
Expand Down Expand Up @@ -442,6 +451,7 @@ def _string_join(t, expr):


sqlalchemy_operation_registry: Dict[Any, Any] = {
ops.Alias: _alias,
ops.And: fixed_arity(sql.and_, 2),
ops.Or: fixed_arity(sql.or_, 2),
ops.Not: unary(sa.not_),
Expand Down
4 changes: 1 addition & 3 deletions ibis/backends/base/sql/alchemy/translator.py
Expand Up @@ -36,9 +36,7 @@ class AlchemyExprTranslator(ExprTranslator):
context_class = AlchemyContext

def name(self, translated, name, force=True):
if hasattr(translated, 'label'):
return translated.label(name)
return translated
return translated.label(name)

def get_sqla_type(self, data_type):
return to_sqla_type(data_type, type_map=self._type_map)
Expand Down
11 changes: 9 additions & 2 deletions ibis/backends/base/sql/compiler/query_builder.py
Expand Up @@ -525,8 +525,15 @@ class Compiler:
@classmethod
def make_context(cls, params=None):
params = params or {}
params = {expr.op(): value for expr, value in params.items()}
return cls.context_class(compiler=cls, params=params)

unaliased_params = {}
for expr, value in params.items():
op = expr.op()
if isinstance(op, ops.Alias):
op = op.arg.op()
unaliased_params[op] = value

return cls.context_class(compiler=cls, params=unaliased_params)

@classmethod
def to_ast(cls, expr, context=None):
Expand Down
22 changes: 14 additions & 8 deletions ibis/backends/base/sql/compiler/select_builder.py
Expand Up @@ -34,7 +34,7 @@ def get_result(self):
else:
op = ops.NotExistsSubquery(self.foreign_table, self.predicates)

expr_type = dt.boolean.column_type()
expr_type = dt.boolean.column
return expr_type(op)

def _visit(self, expr):
Expand Down Expand Up @@ -467,7 +467,8 @@ def _visit_select_expr(self, expr):
new_args.append(arg)

if not unchanged:
return expr._factory(type(op)(*new_args))
new_op = type(op)(*new_args)
return new_op.to_expr()
else:
return expr
else:
Expand Down Expand Up @@ -500,8 +501,11 @@ def _visit_select_Histogram(self, expr):
binwidth = op.binwidth
base = op.base

bucket = (op.arg - base) / binwidth
return bucket.floor().name(expr._name)
bucket = ((op.arg - base) / binwidth).floor()
if expr.has_name():
bucket = bucket.name(expr.get_name())

return bucket

def _analyze_filter_exprs(self):
# What's semantically contained in the filter predicates may need to be
Expand Down Expand Up @@ -543,10 +547,11 @@ def _visit_filter(self, expr):
left = self._visit_filter(op.left)
right = self._visit_filter(op.right)
unchanged = left is op.left and right is op.right
if not unchanged:
return expr._factory(type(op)(left, right))
else:
if unchanged:
return expr
else:
new_op = type(op)(left, right)
return new_op.to_expr()
elif isinstance(op, (ops.Any, ops.TableColumn, ops.Literal)):
return expr
elif isinstance(op, ops.ValueOp):
Expand All @@ -559,7 +564,8 @@ def _visit_filter(self, expr):
if new is not old:
unchanged = False
if not unchanged:
return expr._factory(type(op)(*visited))
new_op = type(op)(*visited)
return new_op.to_expr()
else:
return expr
else:
Expand Down
54 changes: 29 additions & 25 deletions ibis/backends/base/sql/compiler/translator.py
Expand Up @@ -13,6 +13,7 @@
operation_registry,
quote_identifier,
)
from ibis.expr.types.core import unnamed


class QueryContext:
Expand Down Expand Up @@ -200,6 +201,22 @@ def __init__(self, expr, context, named=False, permit_subquery=False):
# For now, governing whether the result will have a name
self.named = named

def _needs_name(self, expr):
    """Return whether the translated SQL should carry an AS alias."""
    # Only name anything when this translator was created for a named
    # top-level expression.
    if not self.named:
        return False

    node = expr.op()
    if isinstance(node, ops.TableColumn):
        # A plain column reference only needs an alias when it was
        # explicitly renamed relative to the underlying table column.
        return node.name != expr.get_name()

    # Everything else is named unless it carries the unnamed sentinel.
    return expr.get_name() is not unnamed

def name(self, translated, name, force=True):
    """Wrap *translated* SQL text in an ``AS`` clause using *name*."""
    quoted = quote_identifier(name, force=force)
    return '{} AS {}'.format(translated, quoted)

def get_result(self):
"""Compile SQL expression into a string."""
translated = self.translate(self.expr)
Expand All @@ -221,27 +238,6 @@ def add_operation(cls, operation, translate_function):
"""
cls._registry[operation] = translate_function

def _needs_name(self, expr):
if not self.named:
return False

op = expr.op()
if isinstance(op, ops.TableColumn):
# This column has been given an explicitly different name
if expr.get_name() != op.name:
return True
return False

if expr.get_name() is ir.core.unnamed:
return False

return True

def name(self, translated: str, name: str, force: bool = True) -> str:
return '{} AS {}'.format(
translated, quote_identifier(name, force=force)
)

def translate(self, expr):
# The operation node type the typed expression wraps
op = expr.op()
Expand Down Expand Up @@ -331,7 +327,11 @@ def _bucket(expr):
stmt = stmt.when(cmp(op.buckets[-1], op.arg), bucket_id)
bucket_id += 1

return stmt.end().name(expr._name)
result = stmt.end()
if expr.has_name():
result = result.name(expr.get_name())

return result


@rewrites(ops.CategoryLabel)
Expand All @@ -345,7 +345,11 @@ def _category_label(expr):
if op.nulls is not None:
stmt = stmt.else_(op.nulls)

return stmt.end().name(expr._name)
result = stmt.end()
if expr.has_name():
result = result.name(expr.get_name())

return result


@rewrites(ops.Any)
Expand All @@ -357,7 +361,7 @@ def _any_expand(expr):
@rewrites(ops.NotAny)
def _notany_expand(expr):
arg = expr.op().args[0]
return arg.max() == 0
return arg.max() == ibis.literal(0, type=arg.type())


@rewrites(ops.All)
Expand All @@ -369,7 +373,7 @@ def _all_expand(expr):
@rewrites(ops.NotAll)
def _notall_expand(expr):
arg = expr.op().args[0]
return arg.min() == 0
return arg.min() == ibis.literal(0, type=arg.type())


@rewrites(ops.Cast)
Expand Down
3 changes: 2 additions & 1 deletion ibis/backends/base/sql/ddl.py
Expand Up @@ -220,7 +220,8 @@ def _partitioned_by(self):
if self.partition is not None:
return 'PARTITIONED BY ({})'.format(
', '.join(
quote_identifier(expr._name) for expr in self.partition
quote_identifier(expr.get_name())
for expr in self.partition
)
)
return None
Expand Down
8 changes: 8 additions & 0 deletions ibis/backends/base/sql/registry/main.py
Expand Up @@ -8,6 +8,13 @@
from .literal import literal, null_literal


def alias(translator, expr):
    # The top-level translator emits the AS clause itself, so compiling an
    # aliased expression reduces to compiling the wrapped argument.
    underlying = expr.op().arg
    return translator.translate(underlying)


def fixed_arity(func_name, arity):
def formatter(translator, expr):
op = expr.op()
Expand Down Expand Up @@ -237,6 +244,7 @@ def hash(translator, expr):


operation_registry = {
ops.Alias: alias,
# Unary operations
ops.NotNull: not_null,
ops.IsNull: is_null,
Expand Down
4 changes: 3 additions & 1 deletion ibis/backends/base/sql/registry/window.py
Expand Up @@ -95,7 +95,9 @@ def cumulative_to_window(translator, expr, window):

klass = _cumulative_to_reduction[type(op)]
new_op = klass(*op.args)
new_expr = expr._factory(new_op, name=expr._name)
new_expr = new_op.to_expr()
if expr.has_name():
new_expr = new_expr.name(expr.get_name())

if type(new_op) in translator._rewrites:
new_expr = translator._rewrites[type(new_op)](new_expr)
Expand Down
10 changes: 10 additions & 0 deletions ibis/backends/clickhouse/registry.py
Expand Up @@ -9,6 +9,15 @@

from .identifiers import quote_identifier

# TODO(kszucs): should inherit operation registry from the base compiler


def _alias(translator, expr):
# just compile the underlying argument because the naming is handled
# by the translator for the top level expression
op = expr.op()
return translator.translate(op.arg)


def _cast(translator, expr):
from .client import ClickhouseDataType
Expand Down Expand Up @@ -624,6 +633,7 @@ def _string_right(translator, expr):


operation_registry = {
ops.Alias: _alias,
# Unary operations
ops.TypeOf: _unary('toTypeName'),
ops.IsNan: _unary('isNaN'),
Expand Down
7 changes: 7 additions & 0 deletions ibis/backends/dask/execution/generic.py
Expand Up @@ -157,6 +157,13 @@ def execute_node_value_list(op, _, **kwargs):
return [execute(arg, **kwargs) for arg in op.values]


@execute_node.register(ops.Alias, object)
def execute_alias_series(op, _, **kwargs):
    # An Alias node only affects the expression's name, which the caller is
    # responsible for applying, so execution reduces to executing the
    # wrapped argument.
    return execute(op.arg, **kwargs)


@execute_node.register(ops.Arbitrary, dd.Series, (dd.Series, type(None)))
def execute_arbitrary_series_mask(op, data, mask, aggcontext=None, **kwargs):
"""
Expand Down
6 changes: 2 additions & 4 deletions ibis/backends/dask/execution/selection.py
Expand Up @@ -42,7 +42,7 @@ def compute_projection_scalar_expr(
timecontext: Optional[TimeContext] = None,
**kwargs,
):
name = expr._name
name = expr.get_name()
assert name is not None, 'Scalar selection name is None'

op = expr.op()
Expand Down Expand Up @@ -76,7 +76,7 @@ def compute_projection_column_expr(
timecontext: Optional[TimeContext],
**kwargs,
):
result_name = getattr(expr, '_name', None)
result_name = expr._safe_name
op = expr.op()
parent_table_op = parent.table.op()

Expand Down Expand Up @@ -111,8 +111,6 @@ def compute_projection_column_expr(

result = execute(expr, scope=scope, timecontext=timecontext, **kwargs)
result = coerce_to_output(result, expr, data.index)
assert result_name is not None, 'Column selection name is None'

return result


Expand Down
9 changes: 4 additions & 5 deletions ibis/backends/dask/execution/util.py
Expand Up @@ -92,9 +92,8 @@ def coerce_to_output(
Examples
--------
For dataframe outputs, see ``_coerce_to_dataframe``. Examples below use
pandas objects for legibility, but functionality is the same on dask
objects.
Examples below use pandas objects for legibility, but functionality is the
same on dask objects.
>>> coerce_to_output(pd.Series(1), expr)
0 1
Expand All @@ -111,7 +110,7 @@ def coerce_to_output(
0 [1, 2, 3]
Name: result, dtype: object
"""
result_name = expr.get_name()
result_name = expr._safe_name
dataframe_exprs = (
ir.DestructColumn,
ir.StructColumn,
Expand All @@ -125,7 +124,7 @@ def coerce_to_output(
elif isinstance(result, (pd.Series, dd.Series)):
# Series from https://github.com/ibis-project/ibis/issues/2711
return result.rename(result_name)
elif isinstance(expr.op(), ops.Reduction):
elif isinstance(expr, ir.ScalarExpr):
if isinstance(result, dd.core.Scalar):
# wrap the scalar in a series
out_dtype = _pandas_dtype_from_dd_scalar(result)
Expand Down

1 comment on commit e929f85

@ibis-squawk-bot
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Performance Alert ⚠️

Possible performance regression was detected for benchmark.
Benchmark result of this commit is worse than the previous benchmark result exceeding threshold 3.

Benchmark suite Current: e929f85 Previous: 54cc3c3 Ratio
ibis/tests/benchmarks/test_benchmarks.py::test_execute[high_card_grouped_rolling] 0.5403445649272071 iter/sec (stddev: 0.02177593805725441) 1.621099149077072 iter/sec (stddev: 0.0020812897540296196) 3.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.