Skip to content

Commit

Permalink
refactor: use lazy dispatch for dt.infer
Browse files Browse the repository at this point in the history
  • Loading branch information
jcrist committed Nov 29, 2022
1 parent 180ecff commit 2e56540
Show file tree
Hide file tree
Showing 11 changed files with 183 additions and 334 deletions.
22 changes: 5 additions & 17 deletions ibis/backends/dask/client.py
Expand Up @@ -2,7 +2,6 @@

import dask.dataframe as dd
import numpy as np
import pandas as pd
from pandas.api.types import DatetimeTZDtype

import ibis.expr.datatypes as dt
Expand All @@ -12,16 +11,10 @@
from ibis.backends.pandas.client import (
PANDAS_DATE_TYPES,
PANDAS_STRING_TYPES,
_inferable_pandas_dtypes,
ibis_dtype_to_pandas,
ibis_schema_to_pandas,
)

infer_dask_dtype = pd.api.types.infer_dtype


_inferable_dask_dtypes = _inferable_pandas_dtypes


@sch.schema.register(dd.Series)
def schema_from_series(s):
Expand All @@ -40,16 +33,11 @@ def infer_dask_schema(df, schema=None):
if column_name in schema:
ibis_dtype = dt.dtype(schema[column_name])
elif dask_dtype == np.object_:
inferred_dtype = infer_dask_dtype(df[column_name].compute(), skipna=True)
if inferred_dtype in {'mixed', 'decimal'}:
# TODO: in principle we can handle decimal (added in pandas
# 0.23)
raise TypeError(
'Unable to infer type of column {0!r}. Try instantiating '
'your table from the client with client.table('
"'my_table', schema={{{0!r}: <explicit type>}})".format(column_name)
)
ibis_dtype = _inferable_dask_dtypes[inferred_dtype]
# TODO: don't call compute here. ibis should just assume that
# object dtypes are strings, which is what dask does. The user
# can always explicitly pass in `schema=...` when creating a
# table if they want to use a different dtype.
ibis_dtype = dt.infer(df[column_name].compute()).value_type
else:
ibis_dtype = dt.dtype(dask_dtype)

Expand Down
5 changes: 0 additions & 5 deletions ibis/backends/dask/tests/execution/test_operations.py
Expand Up @@ -28,11 +28,6 @@ def test_literal(client):
assert client.execute(ibis.literal(1)) == 1


def test_read_with_undiscoverable_type(client):
    """Reading a table with a column whose type cannot be inferred raises.

    The ``df`` fixture table presumably contains an object column with
    contents the backend cannot map to an ibis type — TODO confirm against
    the fixture definition.
    """
    with pytest.raises(TypeError):
        client.table('df')


def test_selection(t, df):
expr = t[((t.plain_strings == 'a') | (t.plain_int64 == 3)) & (t.dup_strings == 'd')]
result = expr.compile()
Expand Down
5 changes: 0 additions & 5 deletions ibis/backends/dask/tests/test_client.py
Expand Up @@ -72,11 +72,6 @@ def test_list_tables(client):
assert client.list_tables()


def test_read_with_undiscoverable_type(client):
    """Reading a table with an uninferrable column type raises TypeError.

    NOTE(review): the ``df_unknown`` fixture is assumed to hold a column
    whose dtype inference fails — verify against the fixture definition.
    """
    with pytest.raises(TypeError):
        client.table('df_unknown')


def test_drop(table):
table = table.mutate(c=table.a)
expr = table.drop('a')
Expand Down
4 changes: 0 additions & 4 deletions ibis/backends/dask/tests/test_core.py
Expand Up @@ -161,10 +161,6 @@ def is_computable_input_my_object(_):
execute_node.reorder()
execute_node._cache.clear()

del dt.infer.funcs[(MyObject,)]
dt.infer.reorder()
dt.infer._cache.clear()


def test_scope_look_up():
# test if scope could lookup items properly
Expand Down
5 changes: 0 additions & 5 deletions ibis/backends/dask/tests/test_datatypes.py
@@ -1,7 +1,6 @@
import numpy as np
import pandas as pd
import pytest
from multipledispatch.conflict import ambiguities
from pandas.api.types import CategoricalDtype, DatetimeTZDtype

import ibis
Expand All @@ -11,10 +10,6 @@
dd = pytest.importorskip("dask.dataframe")


def test_no_infer_ambiguities():
    """The ``dt.infer`` multipledispatch registry has no ambiguous signatures."""
    assert not ambiguities(dt.infer.funcs)


@pytest.mark.parametrize(
('value', 'expected_dtype'),
[
Expand Down
143 changes: 1 addition & 142 deletions ibis/backends/pandas/client.py
@@ -1,7 +1,6 @@
"""The pandas client implementation."""

import json
from collections.abc import Mapping, Sequence

import numpy as np
import pandas as pd
Expand All @@ -16,9 +15,6 @@
from ibis.backends.base import Database
from ibis.common.grounds import Immutable

infer_pandas_dtype = pd.api.types.infer_dtype


_ibis_dtypes = toolz.valmap(
np.dtype,
{
Expand Down Expand Up @@ -46,64 +42,6 @@
)


# Lookup table translating numpy dtypes into ibis datatypes. Keys are given
# as dtype name strings and normalized through ``np.dtype`` so that lookups
# with actual dtype objects succeed; aliases that normalize to the same
# dtype (e.g. 'double'/'float64', 'str'/'unicode') collapse to one entry
# with the same value.
_numpy_dtypes = {
    np.dtype(name): ibis_type
    for name, ibis_type in {
        'bool': dt.boolean,
        'int8': dt.int8,
        'int16': dt.int16,
        'int32': dt.int32,
        'int64': dt.int64,
        'uint8': dt.uint8,
        'uint16': dt.uint16,
        'uint32': dt.uint32,
        'uint64': dt.uint64,
        'float16': dt.float16,
        'float32': dt.float32,
        'float64': dt.float64,
        'double': dt.double,
        'unicode': dt.string,
        'str': dt.string,
        'datetime64': dt.timestamp,
        'datetime64[ns]': dt.timestamp,
        'timedelta64': dt.interval,
        'timedelta64[ns]': dt.Interval('ns'),
    }.items()
}


# Maps the strings returned by ``pandas.api.types.infer_dtype`` to ibis
# datatypes. Entries with no precise ibis equivalent (e.g. 'mixed',
# 'period', 'empty', 'complex') fall back to ``dt.binary``; 'decimal'
# presumably falls back to float64 pending real decimal support — TODO
# confirm intent.
_inferable_pandas_dtypes = {
    'string': dt.string,
    'bytes': dt.string,
    'floating': dt.float64,
    'integer': dt.int64,
    'mixed-integer': dt.binary,
    'mixed-integer-float': dt.float64,
    'decimal': dt.float64,
    'complex': dt.binary,
    'categorical': dt.category,
    'boolean': dt.boolean,
    'datetime64': dt.timestamp,
    'datetime': dt.timestamp,
    'date': dt.date,
    'timedelta64': dt.interval,
    'timedelta': dt.interval,
    'time': dt.time,
    'period': dt.binary,
    'mixed': dt.binary,
    'empty': dt.binary,
    'unicode': dt.string,
}


@dt.dtype.register(np.dtype)
def from_numpy_dtype(value):
    """Translate a numpy dtype into the matching ibis datatype.

    Raises
    ------
    TypeError
        If the numpy dtype has no ibis equivalent in the pandas backend.
    """
    ibis_type = _numpy_dtypes.get(value)
    if ibis_type is None:
        raise TypeError(f'numpy dtype {value!r} is not supported in the pandas backend')
    return ibis_type


@dt.dtype.register(DatetimeTZDtype)
def from_pandas_tzdtype(value):
    """Map a pandas timezone-aware datetime dtype to an ibis Timestamp.

    The timezone is carried over as its string representation.
    """
    tz_name = str(value.tz)
    return dt.Timestamp(timezone=tz_name)
Expand All @@ -119,85 +57,6 @@ def from_pandas_string(_):
return dt.String()


@dt.infer.register(np.generic)
def infer_numpy_scalar(value):
    """Infer the ibis type of a numpy scalar from its dtype."""
    np_dtype = value.dtype
    return dt.dtype(np_dtype)


def _infer_pandas_series_contents(s: pd.Series) -> dt.DataType:
    """Infer the type of the **contents** of a pandas Series.

    There is no class representing "the contents of a Series", so this is
    a plain function rather than a ``dt.infer`` registration; it is used
    internally, mainly by ``infer_pandas_series``.

    Parameters
    ----------
    s : pd.Series
        The Series whose element type is wanted.

    Returns
    -------
    dt.DataType
        The ibis type of the Series' elements.
    """
    # Non-object dtypes map directly to ibis types.
    if s.dtype != np.object_:
        return dt.dtype(s.dtype)

    # dtype('O') is ambiguous: ask pandas to inspect the values.
    inferred = infer_pandas_dtype(s, skipna=True)
    if inferred != 'mixed':
        return _inferable_pandas_dtypes[inferred]

    # 'mixed' contents: inspect the first element to decide.
    first = s.iloc[0]
    if isinstance(first, (np.ndarray, pd.Series, Sequence, Mapping)):
        # Defer to the individual `infer` registrations for these types.
        return dt.infer(first)
    return dt.dtype('binary')


@dt.infer.register(pd.Series)
def infer_pandas_series(s):
    """Infer the ibis type of a pandas Series.

    A Series is a collection of elements, so the result is an Array type
    wrapping the element type. Use ``_infer_pandas_series_contents`` when
    only the element type is wanted.
    """
    element_type = _infer_pandas_series_contents(s)
    return dt.Array(element_type)


@dt.infer.register(pd.Timestamp)
def infer_pandas_timestamp(value):
    """Infer the ibis type of a pandas Timestamp, preserving its timezone."""
    tz = value.tz
    return dt.timestamp if tz is None else dt.Timestamp(timezone=str(tz))


@dt.infer.register(np.ndarray)
def infer_array(value):
    """Infer the ibis Array type of a numpy array.

    By default the array's own dtype maps directly to an ibis type; two
    ambiguous dtypes need special handling first.
    """
    np_dtype = value.dtype
    if np_dtype.type == np.object_:
        # dtype('O') is ambiguous: ask pandas to inspect the elements.
        inferred = infer_pandas_dtype(value, skipna=True)
        element_type = _inferable_pandas_dtypes[inferred]
    elif np_dtype.type == np.str_:
        # dtype('<U...') (string arrays) is ambiguous: always ibis string.
        element_type = dt.string
    else:
        # Unambiguous dtype: map it directly.
        element_type = dt.dtype(np_dtype)
    return dt.Array(element_type)


@sch.schema.register(pd.Series)
def schema_from_series(s):
    """Build an ibis Schema from a Series of name/type pairs.

    NOTE(review): assumes the Series index holds column names and the
    values hold types — confirm against callers.
    """
    pairs = tuple(s.items())
    return sch.schema(pairs)
Expand All @@ -215,7 +74,7 @@ def infer_pandas_schema(df, schema=None):
if column_name in schema:
ibis_dtype = dt.dtype(schema[column_name])
else:
ibis_dtype = _infer_pandas_series_contents(df[column_name])
ibis_dtype = dt.infer(df[column_name]).value_type

pairs.append((column_name, ibis_dtype))

Expand Down
4 changes: 0 additions & 4 deletions ibis/backends/pandas/tests/test_core.py
Expand Up @@ -175,10 +175,6 @@ def is_computable_input_my_object(_):
execute_node.reorder()
execute_node._cache.clear()

del dt.infer.funcs[(MyObject,)]
dt.infer.reorder()
dt.infer._cache.clear()


def test_scope_look_up():
# test if scope could lookup items properly
Expand Down
5 changes: 0 additions & 5 deletions ibis/backends/pandas/tests/test_datatypes.py
Expand Up @@ -4,18 +4,13 @@
import numpy as np
import pandas as pd
import pytest
from multipledispatch.conflict import ambiguities
from pandas.api.types import CategoricalDtype, DatetimeTZDtype

import ibis
import ibis.expr.datatypes as dt
import ibis.expr.schema as sch


def test_no_infer_ambiguities():
    """The ``dt.infer`` multipledispatch registry has no ambiguous signatures."""
    assert not ambiguities(dt.infer.funcs)


@pytest.mark.parametrize(
('value', 'expected_dtype'),
[
Expand Down
32 changes: 32 additions & 0 deletions ibis/expr/datatypes/core.py
Expand Up @@ -3,6 +3,7 @@
import numbers
from typing import Any, Iterable, Mapping, NamedTuple

import numpy as np
from multipledispatch import Dispatcher
from public import public

Expand Down Expand Up @@ -950,6 +951,37 @@ class INET(String):

Enum = String

# Lookup table translating numpy dtypes into ibis datatypes. Keys are
# written as dtype name strings and normalized through ``np.dtype``;
# aliases that normalize to the same dtype (e.g. "double"/"float64",
# "str"/"unicode") collapse to a single entry with the same value.
_numpy_dtypes = {
    np.dtype(name): ibis_type
    for name, ibis_type in [
        ("bool", boolean),
        ("int8", int8),
        ("int16", int16),
        ("int32", int32),
        ("int64", int64),
        ("uint8", uint8),
        ("uint16", uint16),
        ("uint32", uint32),
        ("uint64", uint64),
        ("float16", float16),
        ("float32", float32),
        ("float64", float64),
        ("double", float64),
        ("unicode", string),
        ("str", string),
        ("datetime64", timestamp),
        ("datetime64[ns]", timestamp),
        ("timedelta64", interval),
        ("timedelta64[ns]", Interval("ns")),
    ]
}


@dtype.register(np.dtype)
def _(value):
    """Convert a numpy dtype to the corresponding ibis datatype.

    Parameters
    ----------
    value : np.dtype
        The numpy dtype to translate.

    Raises
    ------
    TypeError
        If ``value`` has no ibis equivalent.
    """
    try:
        return _numpy_dtypes[value]
    except KeyError:
        # `from None` suppresses the internal KeyError so callers see a
        # clean TypeError instead of a chained implementation detail.
        raise TypeError(f"numpy dtype {value!r} is not supported") from None


public(
null=null,
boolean=boolean,
Expand Down

0 comments on commit 2e56540

Please sign in to comment.