170 changes: 86 additions & 84 deletions ibis/backends/pandas/tests/test_udf.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,52 +54,46 @@ def t2(con):
return con.table("df2")


@udf.elementwise(input_type=["string"], output_type="int64")
def my_string_length(series, **kwargs):
return series.str.len() * 2
with pytest.warns(FutureWarning, match="v9.0"):

@udf.elementwise(input_type=["string"], output_type="int64")
def my_string_length(series, **kwargs):
return series.str.len() * 2

@udf.elementwise(input_type=[dt.double, dt.double], output_type=dt.double)
def my_add(series1, series2, **kwargs):
return series1 + series2
@udf.elementwise(input_type=[dt.double, dt.double], output_type=dt.double)
def my_add(series1, series2, **kwargs):
return series1 + series2

@udf.reduction(["double"], "double")
def my_mean(series):
return series.mean()

@udf.reduction(["double"], "double")
def my_mean(series):
return series.mean()
@udf.reduction(input_type=[dt.string], output_type=dt.int64)
def my_string_length_sum(series, **kwargs):
return (series.str.len() * 2).sum()

@udf.reduction(input_type=[dt.double, dt.double], output_type=dt.double)
def my_corr(lhs, rhs, **kwargs):
return lhs.corr(rhs)

@udf.reduction(input_type=[dt.string], output_type=dt.int64)
def my_string_length_sum(series, **kwargs):
return (series.str.len() * 2).sum()
@udf.elementwise([dt.double], dt.double)
def add_one(x):
return x + 1.0

@udf.elementwise([dt.double], dt.double)
def times_two(x):
return x * 2.0

@udf.reduction(input_type=[dt.double, dt.double], output_type=dt.double)
def my_corr(lhs, rhs, **kwargs):
return lhs.corr(rhs)


@udf.elementwise([dt.double], dt.double)
def add_one(x):
return x + 1.0


@udf.elementwise([dt.double], dt.double)
def times_two(x):
return x * 2.0


@udf.analytic(input_type=["double"], output_type="double")
def zscore(series):
return (series - series.mean()) / series.std()

@udf.analytic(input_type=["double"], output_type="double")
def zscore(series):
return (series - series.mean()) / series.std()

@udf.reduction(
input_type=[dt.double],
output_type=dt.Array(dt.double),
)
def quantiles(series, *, quantiles):
return np.array(series.quantile(quantiles))
@udf.reduction(
input_type=[dt.double],
output_type=dt.Array(dt.double),
)
def quantiles(series, *, quantiles):
return np.array(series.quantile(quantiles))


def test_udf(t, df):
Expand Down Expand Up @@ -211,37 +205,42 @@ def test_udaf_groupby():

def test_udaf_parameter_mismatch():
with pytest.raises(TypeError):
with pytest.warns(FutureWarning, match="v9.0"):

@udf.reduction(input_type=[dt.double], output_type=dt.double)
def my_corr(lhs, rhs, **kwargs):
pass
@udf.reduction(input_type=[dt.double], output_type=dt.double)
def my_corr(lhs, rhs, **kwargs):
pass


def test_udf_parameter_mismatch():
with pytest.raises(TypeError):
with pytest.warns(FutureWarning, match="v9.0"):

@udf.reduction(input_type=[], output_type=dt.double)
def my_corr2(lhs, **kwargs):
pass
@udf.reduction(input_type=[], output_type=dt.double)
def my_corr2(lhs, **kwargs):
pass


def test_udf_error(t):
@udf.elementwise(input_type=[dt.double], output_type=dt.double)
def error_udf(s):
raise ValueError("xxx")
with pytest.warns(FutureWarning, match="v9.0"):

@udf.elementwise(input_type=[dt.double], output_type=dt.double)
def error_udf(s):
raise ValueError("xxx")

with pytest.raises(ValueError):
error_udf(t.c).execute()


def test_udf_no_reexecution(t2):
execution_count = 0
with pytest.warns(FutureWarning, match="v9.0"):

@udf.elementwise(input_type=[dt.double], output_type=dt.double)
def times_two_count_executions(x):
nonlocal execution_count
execution_count += 1
return x * 2.0
@udf.elementwise(input_type=[dt.double], output_type=dt.double)
def times_two_count_executions(x):
nonlocal execution_count
execution_count += 1
return x * 2.0

expr = t2.mutate(doubled=times_two_count_executions(t2.a))
expr.execute()
Expand Down Expand Up @@ -313,10 +312,11 @@ def test_udaf_window_interval():

def test_multiple_argument_udaf_window():
# PR 2035
with pytest.warns(FutureWarning, match="v9.0"):

@udf.reduction(["double", "double"], "double")
def my_wm(v, w):
return np.average(v, weights=w)
@udf.reduction(["double", "double"], "double")
def my_wm(v, w):
return np.average(v, weights=w)

df = pd.DataFrame(
{
Expand Down Expand Up @@ -427,36 +427,38 @@ def test_array_return_type_reduction_group_by(con, t, df, qs):


def test_elementwise_udf_with_many_args(t2):
@udf.elementwise(
input_type=[dt.double] * 16 + [dt.int32] * 8, output_type=dt.double
)
def my_udf(
c1,
c2,
c3,
c4,
c5,
c6,
c7,
c8,
c9,
c10,
c11,
c12,
c13,
c14,
c15,
c16,
c17,
c18,
c19,
c20,
c21,
c22,
c23,
c24,
):
return c1
with pytest.warns(FutureWarning, match="v9.0"):

@udf.elementwise(
input_type=[dt.double] * 16 + [dt.int32] * 8, output_type=dt.double
)
def my_udf(
c1,
c2,
c3,
c4,
c5,
c6,
c7,
c8,
c9,
c10,
c11,
c12,
c13,
c14,
c15,
c16,
c17,
c18,
c19,
c20,
c21,
c22,
c23,
c24,
):
return c1

expr = my_udf(*([t2.a] * 8 + [t2.b] * 8 + [t2.c] * 8))
result = expr.execute()
Expand Down
35 changes: 20 additions & 15 deletions ibis/backends/pandas/tests/test_window.py
Original file line number Diff line number Diff line change
Expand Up @@ -498,14 +498,15 @@ def test_window_on_and_by_key_as_window_input(t, df):
)

# Test UDF
with pytest.warns(FutureWarning, match="v9.0"):

@reduction(input_type=[dt.int64], output_type=dt.int64)
def count(v):
return len(v)
@reduction(input_type=[dt.int64], output_type=dt.int64)
def count(v):
return len(v)

@reduction(input_type=[dt.int64, dt.int64], output_type=dt.int64)
def count_both(v1, v2):
return len(v1)
@reduction(input_type=[dt.int64, dt.int64], output_type=dt.int64)
def count_both(v1, v2):
return len(v1)

tm.assert_series_equal(
count(t[order_by]).over(row_window).execute(),
Expand Down Expand Up @@ -545,17 +546,21 @@ def test_rolling_window_udf_nan_and_non_numeric(t, group_by, order_by):
t = t.mutate(nan_int64=t["plain_int64"])
t = t.mutate(nan_int64=None)

@reduction(input_type=[dt.int64], output_type=dt.int64)
def count_int64(v):
return len(v)
with pytest.warns(FutureWarning, match="v9.0"):

@reduction(input_type=[dt.timestamp], output_type=dt.int64)
def count_timestamp(v):
return len(v)
@reduction(input_type=[dt.int64], output_type=dt.int64)
def count_int64(v):
return len(v)

@reduction(input_type=[t["map_of_strings_integers"].type()], output_type=dt.int64)
def count_complex(v):
return len(v)
@reduction(input_type=[dt.timestamp], output_type=dt.int64)
def count_timestamp(v):
return len(v)

@reduction(
input_type=[t["map_of_strings_integers"].type()], output_type=dt.int64
)
def count_complex(v):
return len(v)

window = ibis.trailing_window(preceding=1, order_by=order_by, group_by=group_by)

Expand Down
25 changes: 12 additions & 13 deletions ibis/backends/polars/tests/test_udf.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,21 @@
pytest.importorskip("polars")
pc = pytest.importorskip("pyarrow.compute")

with pytest.warns(FutureWarning, match="v9.0"):

@elementwise(input_type=["string"], output_type="int64")
def my_string_length(arr, **kwargs):
return pl.from_arrow(
pc.cast(pc.multiply(pc.utf8_length(arr.to_arrow()), 2), target_type="int64")
)
@elementwise(input_type=["string"], output_type="int64")
def my_string_length(arr, **kwargs):
return pl.from_arrow(
pc.cast(pc.multiply(pc.utf8_length(arr.to_arrow()), 2), target_type="int64")
)

@elementwise(input_type=[dt.int64, dt.int64], output_type=dt.int64)
def my_add(arr1, arr2, **kwargs):
return pl.from_arrow(pc.add(arr1.to_arrow(), arr2.to_arrow()))

@elementwise(input_type=[dt.int64, dt.int64], output_type=dt.int64)
def my_add(arr1, arr2, **kwargs):
return pl.from_arrow(pc.add(arr1.to_arrow(), arr2.to_arrow()))


@reduction(input_type=[dt.float64], output_type=dt.float64)
def my_mean(arr):
return pc.mean(arr)
@reduction(input_type=[dt.float64], output_type=dt.float64)
def my_mean(arr):
return pc.mean(arr)


def test_udf(alltypes):
Expand Down
37 changes: 20 additions & 17 deletions ibis/backends/tests/test_aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,11 @@
)
from ibis.legacy.udf.vectorized import reduction

with pytest.warns(FutureWarning, match="v9.0"):

@reduction(input_type=[dt.double], output_type=dt.double)
def mean_udf(s):
return s.mean()
@reduction(input_type=[dt.double], output_type=dt.double)
def mean_udf(s):
return s.mean()


aggregate_test_params = [
Expand Down Expand Up @@ -232,13 +233,14 @@ def test_aggregate_grouped(backend, alltypes, df, result_fn, expected_fn):
def test_aggregate_multikey_group_reduction_udf(backend, alltypes, df):
"""Tests .aggregate() on a multi-key group_by with a reduction
operation."""
with pytest.warns(FutureWarning, match="v9.0"):

@reduction(
input_type=[dt.double],
output_type=dt.Struct({"mean": dt.double, "std": dt.double}),
)
def mean_and_std(v):
return v.mean(), v.std()
@reduction(
input_type=[dt.double],
output_type=dt.Struct({"mean": dt.double, "std": dt.double}),
)
def mean_and_std(v):
return v.mean(), v.std()

grouping_key_cols = ["bigint_col", "int_col"]

Expand Down Expand Up @@ -1427,8 +1429,8 @@ def test_aggregate_list_like(backend, alltypes, df, agg_fn):
words, the resulting table expression should have one element, which
is the list / np.array).
"""

udf = reduction(input_type=[dt.double], output_type=dt.Array(dt.double))(agg_fn)
with pytest.warns(FutureWarning, match="v9.0"):
udf = reduction(input_type=[dt.double], output_type=dt.Array(dt.double))(agg_fn)

expr = alltypes.aggregate(result_col=udf(alltypes.double_col))
result = expr.execute()
Expand Down Expand Up @@ -1468,14 +1470,15 @@ def test_aggregate_mixed_udf(backend, alltypes, df):
(In particular, one aggregation that results in an array, and other
aggregation(s) that result in a non-array)
"""
with pytest.warns(FutureWarning, match="v9.0"):

@reduction(input_type=[dt.double], output_type=dt.double)
def sum_udf(v):
return np.sum(v)
@reduction(input_type=[dt.double], output_type=dt.double)
def sum_udf(v):
return np.sum(v)

@reduction(input_type=[dt.double], output_type=dt.Array(dt.double))
def collect_udf(v):
return np.array(v)
@reduction(input_type=[dt.double], output_type=dt.Array(dt.double))
def collect_udf(v):
return np.array(v)

expr = alltypes.aggregate(
sum_col=sum_udf(alltypes.double_col),
Expand Down
240 changes: 128 additions & 112 deletions ibis/backends/tests/test_vectorized_udf.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,11 @@ def add_one(s):


def create_add_one_udf(result_formatter, id):
@elementwise(input_type=[dt.double], output_type=dt.double)
def add_one_legacy(s):
return result_formatter(add_one(s))
with pytest.warns(FutureWarning, match="v9.0"):

@elementwise(input_type=[dt.double], output_type=dt.double)
def add_one_legacy(s):
return result_formatter(add_one(s))

@ibis.udf.scalar.pandas
def add_one_udf(s: float) -> float:
Expand Down Expand Up @@ -73,9 +75,10 @@ def calc_zscore(s):


def create_calc_zscore_udf(result_formatter):
return analytic(input_type=[dt.double], output_type=dt.double)(
_format_udf_return_type(calc_zscore, result_formatter)
)
with pytest.warns(FutureWarning, match="v9.0"):
return analytic(input_type=[dt.double], output_type=dt.double)(
_format_udf_return_type(calc_zscore, result_formatter)
)


calc_zscore_udfs = [
Expand All @@ -84,11 +87,12 @@ def create_calc_zscore_udf(result_formatter):
create_calc_zscore_udf(result_formatter=lambda v: list(v)), # list,
]

with pytest.warns(FutureWarning, match="v9.0"):

@reduction(input_type=[dt.double], output_type=dt.double)
def calc_mean(s):
assert isinstance(s, (np.ndarray, pd.Series))
return s.mean()
@reduction(input_type=[dt.double], output_type=dt.double)
def calc_mean(s):
assert isinstance(s, (np.ndarray, pd.Series))
return s.mean()


# elementwise multi-column UDF
Expand All @@ -98,10 +102,11 @@ def add_one_struct(v):


def create_add_one_struct_udf(result_formatter):
return elementwise(
input_type=[dt.double],
output_type=dt.Struct({"col1": dt.double, "col2": dt.double}),
)(_format_struct_udf_return_type(add_one_struct, result_formatter))
with pytest.warns(FutureWarning, match="v9.0"):
return elementwise(
input_type=[dt.double],
output_type=dt.Struct({"col1": dt.double, "col2": dt.double}),
)(_format_struct_udf_return_type(add_one_struct, result_formatter))


add_one_struct_udfs = [
Expand Down Expand Up @@ -139,35 +144,37 @@ def create_add_one_struct_udf(result_formatter):
),
]

with pytest.warns(FutureWarning, match="v9.0"):

@elementwise(
input_type=[dt.double],
output_type=dt.Struct({"double_col": dt.double, "col2": dt.double}),
)
def overwrite_struct_elementwise(v):
assert isinstance(v, pd.Series)
return v + 1, v + 2
@elementwise(
input_type=[dt.double],
output_type=dt.Struct({"double_col": dt.double, "col2": dt.double}),
)
def overwrite_struct_elementwise(v):
assert isinstance(v, pd.Series)
return v + 1, v + 2

@elementwise(
input_type=[dt.double],
output_type=dt.Struct(
{"double_col": dt.double, "col2": dt.double, "float_col": dt.double}
),
)
def multiple_overwrite_struct_elementwise(v):
assert isinstance(v, pd.Series)
return v + 1, v + 2, v + 3

@elementwise(
input_type=[dt.double],
output_type=dt.Struct(
{"double_col": dt.double, "col2": dt.double, "float_col": dt.double}
),
)
def multiple_overwrite_struct_elementwise(v):
assert isinstance(v, pd.Series)
return v + 1, v + 2, v + 3

with pytest.warns(FutureWarning, match="v9.0"):

@analytic(
input_type=[dt.double, dt.double],
output_type=dt.Struct({"double_col": dt.double, "demean_weight": dt.double}),
)
def overwrite_struct_analytic(v, w):
assert isinstance(v, pd.Series)
assert isinstance(w, pd.Series)
return v - v.mean(), w - w.mean()
@analytic(
input_type=[dt.double, dt.double],
output_type=dt.Struct({"double_col": dt.double, "demean_weight": dt.double}),
)
def overwrite_struct_analytic(v, w):
assert isinstance(v, pd.Series)
assert isinstance(w, pd.Series)
return v - v.mean(), w - w.mean()


# analytic multi-column UDF
Expand All @@ -178,10 +185,11 @@ def demean_struct(v, w):


def create_demean_struct_udf(result_formatter):
return analytic(
input_type=[dt.double, dt.double],
output_type=dt.Struct({"demean": dt.double, "demean_weight": dt.double}),
)(_format_struct_udf_return_type(demean_struct, result_formatter))
with pytest.warns(FutureWarning, match="v9.0"):
return analytic(
input_type=[dt.double, dt.double],
output_type=dt.Struct({"demean": dt.double, "demean_weight": dt.double}),
)(_format_struct_udf_return_type(demean_struct, result_formatter))


demean_struct_udfs = [
Expand Down Expand Up @@ -216,10 +224,11 @@ def mean_struct(v, w):


def create_mean_struct_udf(result_formatter):
return reduction(
input_type=[dt.double, dt.int64],
output_type=dt.Struct({"mean": dt.double, "mean_weight": dt.double}),
)(_format_struct_udf_return_type(mean_struct, result_formatter))
with pytest.warns(FutureWarning, match="v9.0"):
return reduction(
input_type=[dt.double, dt.int64],
output_type=dt.Struct({"mean": dt.double, "mean_weight": dt.double}),
)(_format_struct_udf_return_type(mean_struct, result_formatter))


mean_struct_udfs = [
Expand All @@ -232,23 +241,23 @@ def create_mean_struct_udf(result_formatter):
), # np.array of scalar
]

with pytest.warns(FutureWarning, match="v9.0"):

@reduction(
input_type=[dt.double, dt.int64],
output_type=dt.Struct({"double_col": dt.double, "mean_weight": dt.double}),
)
def overwrite_struct_reduction(v, w):
assert isinstance(v, (np.ndarray, pd.Series))
assert isinstance(w, (np.ndarray, pd.Series))
return v.mean(), w.mean()

@reduction(
input_type=[dt.double, dt.int64],
output_type=dt.Struct({"double_col": dt.double, "mean_weight": dt.double}),
)
def overwrite_struct_reduction(v, w):
assert isinstance(v, (np.ndarray, pd.Series))
assert isinstance(w, (np.ndarray, pd.Series))
return v.mean(), w.mean()

@reduction(
input_type=[dt.double],
output_type=dt.Array(dt.double),
)
def quantiles(series, *, quantiles):
return series.quantile(quantiles)
@reduction(
input_type=[dt.double],
output_type=dt.Array(dt.double),
)
def quantiles(series, *, quantiles):
return series.quantile(quantiles)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -344,29 +353,31 @@ def test_output_type_in_list_invalid():
com.IbisTypeError,
match="The output type of a UDF must be a single datatype.",
):
with pytest.warns(FutureWarning, match="v9.0"):

@elementwise(input_type=[dt.double], output_type=[dt.double])
def _(s):
return s + 1
@elementwise(input_type=[dt.double], output_type=[dt.double])
def _(s):
return s + 1


def test_valid_kwargs(udf_backend, udf_alltypes, udf_df):
# Test different forms of UDF definition with keyword arguments
with pytest.warns(FutureWarning, match="v9.0"):

@elementwise(input_type=[dt.double], output_type=dt.double)
def foo1(v):
# Basic UDF with kwargs
return v + 1
@elementwise(input_type=[dt.double], output_type=dt.double)
def foo1(v):
# Basic UDF with kwargs
return v + 1

@elementwise(input_type=[dt.double], output_type=dt.double)
def foo2(v, *, amount):
# UDF with keyword only arguments
return v + amount
@elementwise(input_type=[dt.double], output_type=dt.double)
def foo2(v, *, amount):
# UDF with keyword only arguments
return v + amount

@elementwise(input_type=[dt.double], output_type=dt.double)
def foo3(v, **kwargs):
# UDF with kwargs
return v + kwargs.get("amount", 1)
@elementwise(input_type=[dt.double], output_type=dt.double)
def foo3(v, **kwargs):
# UDF with kwargs
return v + kwargs.get("amount", 1)

expr = udf_alltypes.mutate(
v1=foo1(udf_alltypes["double_col"]),
Expand All @@ -392,14 +403,15 @@ def foo3(v, **kwargs):

def test_valid_args(udf_backend, udf_alltypes, udf_df):
# Test different forms of UDF definition with *args
with pytest.warns(FutureWarning, match="v9.0"):

@elementwise(input_type=[dt.double, dt.int32], output_type=dt.double)
def foo1(*args):
return args[0] + args[1]
@elementwise(input_type=[dt.double, dt.int32], output_type=dt.double)
def foo1(*args):
return args[0] + args[1]

@elementwise(input_type=[dt.double, dt.int32], output_type=dt.double)
def foo2(v, *args):
return v + args[0]
@elementwise(input_type=[dt.double, dt.int32], output_type=dt.double)
def foo2(v, *args):
return v + args[0]

result = udf_alltypes.mutate(
v1=foo1(udf_alltypes["double_col"], udf_alltypes["int_col"]),
Expand All @@ -416,27 +428,28 @@ def foo2(v, *args):

def test_valid_args_and_kwargs(udf_backend, udf_alltypes, udf_df):
# Test UDFs with both *args and keyword arguments
with pytest.warns(FutureWarning, match="v9.0"):

@elementwise(input_type=[dt.double, dt.int32], output_type=dt.double)
def foo1(*args, amount):
# UDF with *args and a keyword-only argument
return args[0] + args[1] + amount
@elementwise(input_type=[dt.double, dt.int32], output_type=dt.double)
def foo1(*args, amount):
# UDF with *args and a keyword-only argument
return args[0] + args[1] + amount

@elementwise(input_type=[dt.double, dt.int32], output_type=dt.double)
def foo2(*args, **kwargs):
# UDF with *args and **kwargs
return args[0] + args[1] + kwargs.get("amount", 1)
@elementwise(input_type=[dt.double, dt.int32], output_type=dt.double)
def foo2(*args, **kwargs):
# UDF with *args and **kwargs
return args[0] + args[1] + kwargs.get("amount", 1)

@elementwise(input_type=[dt.double, dt.int32], output_type=dt.double)
def foo3(v, *args, amount):
# UDF with an explicit positional argument, *args, and a keyword-only
# argument
return v + args[0] + amount
@elementwise(input_type=[dt.double, dt.int32], output_type=dt.double)
def foo3(v, *args, amount):
# UDF with an explicit positional argument, *args, and a keyword-only
# argument
return v + args[0] + amount

@elementwise(input_type=[dt.double, dt.int32], output_type=dt.double)
def foo4(v, *args, **kwargs):
# UDF with an explicit positional argument, *args, and **kwargs
return v + args[0] + kwargs.get("amount", 1)
@elementwise(input_type=[dt.double, dt.int32], output_type=dt.double)
def foo4(v, *args, **kwargs):
# UDF with an explicit positional argument, *args, and **kwargs
return v + args[0] + kwargs.get("amount", 1)

result = udf_alltypes.mutate(
v1=foo1(udf_alltypes["double_col"], udf_alltypes["int_col"], amount=2),
Expand All @@ -460,10 +473,11 @@ def test_invalid_kwargs():
# keyword argument raises an error

with pytest.raises(TypeError, match=".*must be defined as keyword only.*"):
with pytest.warns(FutureWarning, match="v9.0"):

@elementwise(input_type=[dt.double], output_type=dt.double)
def _(v, _):
return v + 1
@elementwise(input_type=[dt.double], output_type=dt.double)
def _(v, _):
return v + 1


@pytest.mark.parametrize("udf", add_one_struct_udfs)
Expand Down Expand Up @@ -526,16 +540,18 @@ def test_elementwise_udf_overwrite_destruct_and_assign(udf_backend, udf_alltypes
@pytest.mark.xfail_version(pyspark=["pyspark<3.1"])
@pytest.mark.parametrize("method", ["destructure", "unpack"])
def test_elementwise_udf_destructure_exact_once(udf_alltypes, method, tmp_path):
@elementwise(
input_type=[dt.double],
output_type=dt.Struct({"col1": dt.double, "col2": dt.double}),
)
def add_one_struct_exact_once(v):
key = v.iloc[0]
path = tmp_path / str(key)
assert not path.exists()
path.touch()
return v + 1, v + 2
with pytest.warns(FutureWarning, match="v9.0"):

@elementwise(
input_type=[dt.double],
output_type=dt.Struct({"col1": dt.double, "col2": dt.double}),
)
def add_one_struct_exact_once(v):
key = v.iloc[0]
path = tmp_path / str(key)
assert not path.exists()
path.touch()
return v + 1, v + 2

struct = add_one_struct_exact_once(udf_alltypes["id"])

Expand Down
13 changes: 7 additions & 6 deletions ibis/backends/tests/test_window.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,15 @@ def _ntile(x: pd.Series, bucket: int) -> pd.Series:
)


@reduction(input_type=[dt.double], output_type=dt.double)
def mean_udf(s):
return s.mean()
with pytest.warns(FutureWarning, match="v9.0"):

@reduction(input_type=[dt.double], output_type=dt.double)
def mean_udf(s):
return s.mean()

@analytic(input_type=[dt.double], output_type=dt.double)
def calc_zscore(s):
return (s - s.mean()) / s.std()
@analytic(input_type=[dt.double], output_type=dt.double)
def calc_zscore(s):
return (s - s.mean()) / s.std()


@pytest.mark.parametrize(
Expand Down
14 changes: 8 additions & 6 deletions ibis/expr/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,12 +337,14 @@ def test_two_inner_joins(snapshot):
def test_destruct_selection(snapshot):
table = ibis.table([("col", "int64")], name="t")

@udf.reduction(
input_type=["int64"],
output_type=dt.Struct({"sum": "int64", "mean": "float64"}),
)
def multi_output_udf(v):
return v.sum(), v.mean()
with pytest.warns(FutureWarning, match="v9.0"):

@udf.reduction(
input_type=["int64"],
output_type=dt.Struct({"sum": "int64", "mean": "float64"}),
)
def multi_output_udf(v):
return v.sum(), v.mean()

expr = table.aggregate(multi_output_udf(table["col"]).destructure())
result = fmt(expr)
Expand Down
83 changes: 4 additions & 79 deletions ibis/legacy/udf/vectorized.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
ElementWiseVectorizedUDF,
ReductionVectorizedUDF,
)
from ibis.util import deprecated

if TYPE_CHECKING:
import pandas as pd
Expand Down Expand Up @@ -263,6 +264,7 @@ def wrapper(func):
return wrapper


@deprecated(as_of="9.0", instead="")
def analytic(input_type, output_type):
"""Define an analytic UDF that produces the same number of rows as the input.
Expand All @@ -275,36 +277,11 @@ def analytic(input_type, output_type):
output_type : ibis.expr.datatypes.DataType
The return type of the function.
Examples
--------
>>> import ibis
>>> import ibis.expr.datatypes as dt
>>> from ibis.legacy.udf.vectorized import analytic
>>> @analytic(input_type=[dt.double], output_type=dt.double)
... def zscore(series): # note the use of aggregate functions
... return (series - series.mean()) / series.std()
Define and use an UDF with multiple return columns:
>>> @analytic(
... input_type=[dt.double],
... output_type=dt.Struct(dict(demean="double", zscore="double")),
... )
... def demean_and_zscore(v):
... mean = v.mean()
... std = v.std()
... return v - mean, (v - mean) / std
>>>
>>> win = ibis.window(preceding=None, following=None, group_by="key")
>>> # add two columns "demean" and "zscore"
>>> table = table.mutate( # quartodoc: +SKIP # doctest: +SKIP
... demean_and_zscore(table["v"]).over(win).destructure()
... )
"""
return _udf_decorator(AnalyticVectorizedUDF, input_type, output_type)


@deprecated(as_of="9.0", instead="use the ibis.udf.* api")
def elementwise(input_type, output_type):
"""Define a UDF that operates element-wise on a Pandas Series.
Expand All @@ -317,39 +294,11 @@ def elementwise(input_type, output_type):
output_type : ibis.expr.datatypes.DataType
The return type of the function.
Examples
--------
>>> import ibis
>>> import ibis.expr.datatypes as dt
>>> from ibis.legacy.udf.vectorized import elementwise
>>> @elementwise(input_type=[dt.string], output_type=dt.int64)
... def my_string_length(series):
... return series.str.len() * 2
Define an UDF with non-column parameters:
>>> @elementwise(input_type=[dt.string], output_type=dt.int64)
... def my_string_length(series, *, times):
... return series.str.len() * times
Define and use an UDF with multiple return columns:
>>> @elementwise(
... input_type=[dt.string],
... output_type=dt.Struct(dict(year=dt.string, monthday=dt.string)),
... )
... def year_monthday(date):
... return date.str.slice(0, 4), date.str.slice(4, 8)
>>>
>>> # add two columns "year" and "monthday"
>>> table = table.mutate(
... year_monthday(table["date"]).destructure()
... ) # quartodoc: +SKIP # doctest: +SKIP
"""
return _udf_decorator(ElementWiseVectorizedUDF, input_type, output_type)


@deprecated(as_of="9.0", instead="use the @ibis.udf.agg.builtin decorator")
def reduction(input_type, output_type):
"""Define a UDF reduction function that produces 1 row of output for N rows of input.
Expand All @@ -361,29 +310,5 @@ def reduction(input_type, output_type):
function. Variadic arguments are not yet supported.
output_type : ibis.expr.datatypes.DataType
The return type of the function.
Examples
--------
>>> import ibis
>>> import ibis.expr.datatypes as dt
>>> from ibis.legacy.udf.vectorized import reduction
>>> @reduction(input_type=[dt.string], output_type=dt.int64)
... def my_string_length_agg(series, **kwargs):
... return (series.str.len() * 2).sum()
Define and use an UDF with multiple return columns:
>>> @reduction(
... input_type=[dt.double],
... output_type=dt.Struct(dict(mean="double", std="double")),
... )
... def mean_and_std(v):
... return v.mean(), v.std()
>>>
>>> # create aggregation columns "mean" and "std"
>>> table = table.group_by("key").aggregate( # quartodoc: +SKIP # doctest: +SKIP
... mean_and_std(table["v"]).destructure()
... )
"""
return _udf_decorator(ReductionVectorizedUDF, input_type, output_type)