Skip to content

Commit

Permalink
refactor: move coercion functions from schema.py to udf
Browse files Browse the repository at this point in the history
fixes #3033

BREAKING CHANGE: coercion functions previously in expr/schema.py are now in udf/vectorized.py
  • Loading branch information
EdAbati authored and cpcloud committed Feb 7, 2022
1 parent 77859c2 commit 58eea56
Show file tree
Hide file tree
Showing 3 changed files with 155 additions and 160 deletions.
2 changes: 1 addition & 1 deletion ibis/backends/pandas/execution/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
import ibis.util
from ibis.expr import operations as ops
from ibis.expr import types as ir
from ibis.expr.schema import coerce_to_dataframe
from ibis.expr.scope import Scope
from ibis.udf.vectorized import coerce_to_dataframe

from ..core import execute
from ..execution import constants
Expand Down
153 changes: 0 additions & 153 deletions ibis/expr/schema.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
import collections
from typing import Any, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from multipledispatch import Dispatcher

import ibis.common.exceptions as com
Expand Down Expand Up @@ -240,153 +237,3 @@ def schema_from_pairs(lst):
@schema.register(collections.abc.Iterable, collections.abc.Iterable)
def schema_from_names_types(names, types):
return Schema(names, types)


def coerce_to_tuple(
data: Union[List, np.ndarray, pd.Series],
output_type: dt.Struct,
index: Optional[pd.Index] = None,
) -> Tuple:
"""Coerce the following shapes to a tuple.
(1) A list
(2) An np.ndarray
(3) A Series
"""
return tuple(data)


def coerce_to_np_array(
data: Union[List, np.ndarray, pd.Series],
output_type: dt.Struct,
index: Optional[pd.Index] = None,
) -> np.ndarray:
"""Coerce the following shapes to an np.ndarray.
(1) A list
(2) An np.ndarray
(3) A Series
"""
return np.array(data)


def coerce_to_series(
data: Union[List, np.ndarray, pd.Series],
output_type: dt.DataType,
original_index: Optional[pd.Index] = None,
) -> pd.Series:
"""Coerce the following shapes to a Series.
(1) A list
(2) An np.ndarray
(3) A Series
Note:
This method does NOT always return a new Series. If a Series is
passed in, this method will return the original object.
Parameters
----------
data : pd.Series, a list, or an np.array
output_type : the type of the output
original_index : Optional parameter containing the index of the output
Returns
-------
pd.Series
"""
if isinstance(data, (list, np.ndarray)):
result = pd.Series(data)
elif isinstance(data, pd.Series):
result = data
else:
# This case is a non-vector elementwise or analytic UDF that should
# not be coerced to a Series.
return data
if original_index is not None:
result.index = original_index
return result


def coerce_to_dataframe(
data: Any,
output_type: dt.Struct,
original_index: Optional[pd.Index] = None,
) -> pd.DataFrame:
"""Coerce the following shapes to a DataFrame.
The following shapes are allowed:
(1) A list/tuple of Series
(2) A list/tuple np.ndarray
(3) A list/tuple of scalars
(4) A Series of list/tuple
(5) pd.DataFrame
Note:
This method does NOT always return a new DataFrame. If a DataFrame is
passed in, this method will return the original object.
Parameters
----------
data : pd.DataFrame, a tuple/list of pd.Series, a tuple/list of np.array
or a pd.Series of tuple/list
output_type : a Struct containing the names and types of the output
original_index : Optional parameter containing the index of the output
Returns
-------
pd.DataFrame
Examples
--------
>>> coerce_to_dataframe(pd.DataFrame({'a': [1, 2, 3]}), dt.Struct([('b', 'int32')])) # noqa: E501
b
0 1
1 2
2 3
dtype: int32
>>> coerce_to_dataframe(pd.Series([[1, 2, 3]]), dt.Struct([('a', 'int32'), ('b', 'int32'), ('c', 'int32')])) # noqa: E501
a b c
0 1 2 3
dtypes: [int32, int32, int32]
>>> coerce_to_dataframe(pd.Series([range(3), range(3)]), dt.Struct([('a', 'int32'), ('b', 'int32'), ('c', 'int32')])) # noqa: E501
a b c
0 0 1 2
1 0 1 2
dtypes: [int32, int32, int32]
>>> coerce_to_dataframe([pd.Series(x) for x in [1, 2, 3]], dt.Struct([('a', 'int32'), ('b', 'int32'), ('c', 'int32')])) # noqa: E501
a b c
0 1 2 3
>>> coerce_to_dataframe([1, 2, 3], dt.Struct([('a', 'int32'), ('b', 'int32'), ('c', 'int32')])) # noqa: E501
a b c
0 1 2 3
dtypes: [int32, int32, int32]
"""
if isinstance(data, pd.DataFrame):
result = data
elif isinstance(data, pd.Series):
if not len(data):
result = data.to_frame()
else:
num_cols = len(data.iloc[0])
series = [data.apply(lambda t: t[i]) for i in range(num_cols)]
result = pd.concat(series, axis=1)
elif isinstance(data, (tuple, list, np.ndarray)):
if isinstance(data[0], pd.Series):
result = pd.concat(data, axis=1)
elif isinstance(data[0], np.ndarray):
result = pd.concat([pd.Series(v) for v in data], axis=1)
else:
# Promote scalar to Series
result = pd.concat([pd.Series([v]) for v in data], axis=1)
else:
raise ValueError(f"Cannot coerce to DataFrame: {data}")

result.columns = output_type.names
if original_index is not None:
result.index = original_index
return result
160 changes: 154 additions & 6 deletions ibis/udf/vectorized.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
"""

import functools
from typing import Any, List, Optional, Tuple, Union

import numpy as np
import pandas as pd

import ibis.expr.datatypes as dt
import ibis.udf.validate as v
Expand All @@ -14,12 +18,156 @@
ElementWiseVectorizedUDF,
ReductionVectorizedUDF,
)
from ibis.expr.schema import (
coerce_to_dataframe,
coerce_to_np_array,
coerce_to_series,
coerce_to_tuple,
)


def coerce_to_tuple(
data: Union[List, np.ndarray, pd.Series],
output_type: dt.Struct,
index: Optional[pd.Index] = None,
) -> Tuple:
"""Coerce the following shapes to a tuple.
(1) A list
(2) An np.ndarray
(3) A Series
"""
return tuple(data)


def coerce_to_np_array(
data: Union[List, np.ndarray, pd.Series],
output_type: dt.Struct,
index: Optional[pd.Index] = None,
) -> np.ndarray:
"""Coerce the following shapes to an np.ndarray.
(1) A list
(2) An np.ndarray
(3) A Series
"""
return np.array(data)


def coerce_to_series(
data: Union[List, np.ndarray, pd.Series],
output_type: dt.DataType,
original_index: Optional[pd.Index] = None,
) -> pd.Series:
"""Coerce the following shapes to a Series.
(1) A list
(2) An np.ndarray
(3) A Series
Note:
This method does NOT always return a new Series. If a Series is
passed in, this method will return the original object.
Parameters
----------
data : pd.Series, a list, or an np.array
output_type : the type of the output
original_index : Optional parameter containing the index of the output
Returns
-------
pd.Series
"""
if isinstance(data, (list, np.ndarray)):
result = pd.Series(data)
elif isinstance(data, pd.Series):
result = data
else:
# This case is a non-vector elementwise or analytic UDF that should
# not be coerced to a Series.
return data
if original_index is not None:
result.index = original_index
return result


def coerce_to_dataframe(
data: Any,
output_type: dt.Struct,
original_index: Optional[pd.Index] = None,
) -> pd.DataFrame:
"""Coerce the following shapes to a DataFrame.
The following shapes are allowed:
(1) A list/tuple of Series
(2) A list/tuple np.ndarray
(3) A list/tuple of scalars
(4) A Series of list/tuple
(5) pd.DataFrame
Note:
This method does NOT always return a new DataFrame. If a DataFrame is
passed in, this method will return the original object.
Parameters
----------
data : pd.DataFrame, a tuple/list of pd.Series, a tuple/list of np.array
or a pd.Series of tuple/list
output_type : a Struct containing the names and types of the output
original_index : Optional parameter containing the index of the output
Returns
-------
pd.DataFrame
Examples
--------
>>> coerce_to_dataframe(pd.DataFrame({'a': [1, 2, 3]}), dt.Struct([('b', 'int32')])) # noqa: E501
b
0 1
1 2
2 3
dtype: int32
>>> coerce_to_dataframe(pd.Series([[1, 2, 3]]), dt.Struct([('a', 'int32'), ('b', 'int32'), ('c', 'int32')])) # noqa: E501
a b c
0 1 2 3
dtypes: [int32, int32, int32]
>>> coerce_to_dataframe(pd.Series([range(3), range(3)]), dt.Struct([('a', 'int32'), ('b', 'int32'), ('c', 'int32')])) # noqa: E501
a b c
0 0 1 2
1 0 1 2
dtypes: [int32, int32, int32]
>>> coerce_to_dataframe([pd.Series(x) for x in [1, 2, 3]], dt.Struct([('a', 'int32'), ('b', 'int32'), ('c', 'int32')])) # noqa: E501
a b c
0 1 2 3
>>> coerce_to_dataframe([1, 2, 3], dt.Struct([('a', 'int32'), ('b', 'int32'), ('c', 'int32')])) # noqa: E501
a b c
0 1 2 3
dtypes: [int32, int32, int32]
"""
if isinstance(data, pd.DataFrame):
result = data
elif isinstance(data, pd.Series):
if not len(data):
result = data.to_frame()
else:
num_cols = len(data.iloc[0])
series = [data.apply(lambda t: t[i]) for i in range(num_cols)]
result = pd.concat(series, axis=1)
elif isinstance(data, (tuple, list, np.ndarray)):
if isinstance(data[0], pd.Series):
result = pd.concat(data, axis=1)
elif isinstance(data[0], np.ndarray):
result = pd.concat([pd.Series(v) for v in data], axis=1)
else:
# Promote scalar to Series
result = pd.concat([pd.Series([v]) for v in data], axis=1)
else:
raise ValueError(f"Cannot coerce to DataFrame: {data}")

result.columns = output_type.names
if original_index is not None:
result.index = original_index
return result


class UserDefinedFunction:
Expand Down

0 comments on commit 58eea56

Please sign in to comment.