Skip to content

Commit

Permalink
refactor: use lazy dispatch for dt.infer
Browse files Browse the repository at this point in the history
  • Loading branch information
jcrist committed Nov 29, 2022
1 parent 180ecff commit 2e56540
Show file tree
Hide file tree
Showing 11 changed files with 183 additions and 334 deletions.
22 changes: 5 additions & 17 deletions ibis/backends/dask/client.py
Expand Up @@ -2,7 +2,6 @@

import dask.dataframe as dd
import numpy as np
import pandas as pd
from pandas.api.types import DatetimeTZDtype

import ibis.expr.datatypes as dt
Expand All @@ -12,16 +11,10 @@
from ibis.backends.pandas.client import (
PANDAS_DATE_TYPES,
PANDAS_STRING_TYPES,
_inferable_pandas_dtypes,
ibis_dtype_to_pandas,
ibis_schema_to_pandas,
)

infer_dask_dtype = pd.api.types.infer_dtype


_inferable_dask_dtypes = _inferable_pandas_dtypes


@sch.schema.register(dd.Series)
def schema_from_series(s):
Expand All @@ -40,16 +33,11 @@ def infer_dask_schema(df, schema=None):
if column_name in schema:
ibis_dtype = dt.dtype(schema[column_name])
elif dask_dtype == np.object_:
inferred_dtype = infer_dask_dtype(df[column_name].compute(), skipna=True)
if inferred_dtype in {'mixed', 'decimal'}:
# TODO: in principle we can handle decimal (added in pandas
# 0.23)
raise TypeError(
'Unable to infer type of column {0!r}. Try instantiating '
'your table from the client with client.table('
"'my_table', schema={{{0!r}: <explicit type>}})".format(column_name)
)
ibis_dtype = _inferable_dask_dtypes[inferred_dtype]
# TODO: don't call compute here. ibis should just assume that
# object dtypes are strings, which is what dask does. The user
# can always explicitly pass in `schema=...` when creating a
# table if they want to use a different dtype.
ibis_dtype = dt.infer(df[column_name].compute()).value_type
else:
ibis_dtype = dt.dtype(dask_dtype)

Expand Down
5 changes: 0 additions & 5 deletions ibis/backends/dask/tests/execution/test_operations.py
Expand Up @@ -28,11 +28,6 @@ def test_literal(client):
assert client.execute(ibis.literal(1)) == 1


def test_read_with_undiscoverable_type(client):
    """Reading a table with a column whose type cannot be inferred raises.

    The ``df`` fixture table presumably contains an object column with
    contents the backend cannot map to an ibis type — TODO confirm against
    the fixture definition.
    """
    with pytest.raises(TypeError):
        client.table('df')


def test_selection(t, df):
expr = t[((t.plain_strings == 'a') | (t.plain_int64 == 3)) & (t.dup_strings == 'd')]
result = expr.compile()
Expand Down
5 changes: 0 additions & 5 deletions ibis/backends/dask/tests/test_client.py
Expand Up @@ -72,11 +72,6 @@ def test_list_tables(client):
assert client.list_tables()


def test_read_with_undiscoverable_type(client):
    """Reading a table with an uninferrable column type raises TypeError.

    NOTE(review): the ``df_unknown`` fixture is assumed to hold a column
    whose dtype inference fails — verify against the fixture definition.
    """
    with pytest.raises(TypeError):
        client.table('df_unknown')


def test_drop(table):
table = table.mutate(c=table.a)
expr = table.drop('a')
Expand Down
4 changes: 0 additions & 4 deletions ibis/backends/dask/tests/test_core.py
Expand Up @@ -161,10 +161,6 @@ def is_computable_input_my_object(_):
execute_node.reorder()
execute_node._cache.clear()

del dt.infer.funcs[(MyObject,)]
dt.infer.reorder()
dt.infer._cache.clear()


def test_scope_look_up():
# test if scope could lookup items properly
Expand Down
5 changes: 0 additions & 5 deletions ibis/backends/dask/tests/test_datatypes.py
@@ -1,7 +1,6 @@
import numpy as np
import pandas as pd
import pytest
from multipledispatch.conflict import ambiguities
from pandas.api.types import CategoricalDtype, DatetimeTZDtype

import ibis
Expand All @@ -11,10 +10,6 @@
dd = pytest.importorskip("dask.dataframe")


def test_no_infer_ambiguities():
    """The ``dt.infer`` multipledispatch registry has no ambiguous signatures."""
    assert not ambiguities(dt.infer.funcs)


@pytest.mark.parametrize(
('value', 'expected_dtype'),
[
Expand Down
143 changes: 1 addition & 142 deletions ibis/backends/pandas/client.py
@@ -1,7 +1,6 @@
"""The pandas client implementation."""

import json
from collections.abc import Mapping, Sequence

import numpy as np
import pandas as pd
Expand All @@ -16,9 +15,6 @@
from ibis.backends.base import Database
from ibis.common.grounds import Immutable

infer_pandas_dtype = pd.api.types.infer_dtype


_ibis_dtypes = toolz.valmap(
np.dtype,
{
Expand Down Expand Up @@ -46,64 +42,6 @@
)


# Lookup table translating numpy dtypes into ibis datatypes. Keys are given
# as dtype name strings and normalized through ``np.dtype`` so that lookups
# with actual dtype objects succeed; aliases that normalize to the same
# dtype (e.g. 'double'/'float64', 'str'/'unicode') collapse to one entry
# with the same value.
_numpy_dtypes = {
    np.dtype(name): ibis_type
    for name, ibis_type in {
        'bool': dt.boolean,
        'int8': dt.int8,
        'int16': dt.int16,
        'int32': dt.int32,
        'int64': dt.int64,
        'uint8': dt.uint8,
        'uint16': dt.uint16,
        'uint32': dt.uint32,
        'uint64': dt.uint64,
        'float16': dt.float16,
        'float32': dt.float32,
        'float64': dt.float64,
        'double': dt.double,
        'unicode': dt.string,
        'str': dt.string,
        'datetime64': dt.timestamp,
        'datetime64[ns]': dt.timestamp,
        'timedelta64': dt.interval,
        'timedelta64[ns]': dt.Interval('ns'),
    }.items()
}


# Maps the strings returned by ``pandas.api.types.infer_dtype`` to ibis
# datatypes. Entries with no precise ibis equivalent (e.g. 'mixed',
# 'period', 'empty', 'complex') fall back to ``dt.binary``; 'decimal'
# presumably falls back to float64 pending real decimal support — TODO
# confirm intent.
_inferable_pandas_dtypes = {
    'string': dt.string,
    'bytes': dt.string,
    'floating': dt.float64,
    'integer': dt.int64,
    'mixed-integer': dt.binary,
    'mixed-integer-float': dt.float64,
    'decimal': dt.float64,
    'complex': dt.binary,
    'categorical': dt.category,
    'boolean': dt.boolean,
    'datetime64': dt.timestamp,
    'datetime': dt.timestamp,
    'date': dt.date,
    'timedelta64': dt.interval,
    'timedelta': dt.interval,
    'time': dt.time,
    'period': dt.binary,
    'mixed': dt.binary,
    'empty': dt.binary,
    'unicode': dt.string,
}


@dt.dtype.register(np.dtype)
def from_numpy_dtype(value):
    """Translate a numpy dtype into the matching ibis datatype.

    Raises
    ------
    TypeError
        If the numpy dtype has no ibis equivalent in the pandas backend.
    """
    ibis_type = _numpy_dtypes.get(value)
    if ibis_type is None:
        raise TypeError(f'numpy dtype {value!r} is not supported in the pandas backend')
    return ibis_type


@dt.dtype.register(DatetimeTZDtype)
def from_pandas_tzdtype(value):
    """Map a pandas timezone-aware datetime dtype to an ibis Timestamp.

    The timezone is carried over as its string representation.
    """
    tz_name = str(value.tz)
    return dt.Timestamp(timezone=tz_name)
Expand All @@ -119,85 +57,6 @@ def from_pandas_string(_):
return dt.String()


@dt.infer.register(np.generic)
def infer_numpy_scalar(value):
    """Infer the ibis type of a numpy scalar from its dtype."""
    np_dtype = value.dtype
    return dt.dtype(np_dtype)


def _infer_pandas_series_contents(s: pd.Series) -> dt.DataType:
    """Infer the type of the **contents** of a pandas Series.

    There is no class representing "the contents of a Series", so this is
    a plain function rather than a ``dt.infer`` registration; it is used
    internally, mainly by ``infer_pandas_series``.

    Parameters
    ----------
    s : pd.Series
        The Series whose element type is wanted.

    Returns
    -------
    dt.DataType
        The ibis type of the Series' elements.
    """
    # Non-object dtypes map directly to ibis types.
    if s.dtype != np.object_:
        return dt.dtype(s.dtype)

    # dtype('O') is ambiguous: ask pandas to inspect the values.
    inferred = infer_pandas_dtype(s, skipna=True)
    if inferred != 'mixed':
        return _inferable_pandas_dtypes[inferred]

    # 'mixed' contents: inspect the first element to decide.
    first = s.iloc[0]
    if isinstance(first, (np.ndarray, pd.Series, Sequence, Mapping)):
        # Defer to the individual `infer` registrations for these types.
        return dt.infer(first)
    return dt.dtype('binary')


@dt.infer.register(pd.Series)
def infer_pandas_series(s):
    """Infer the ibis type of a pandas Series.

    A Series is a collection of elements, so the result is an Array type
    wrapping the element type. Use ``_infer_pandas_series_contents`` when
    only the element type is wanted.
    """
    element_type = _infer_pandas_series_contents(s)
    return dt.Array(element_type)


@dt.infer.register(pd.Timestamp)
def infer_pandas_timestamp(value):
    """Infer the ibis type of a pandas Timestamp, preserving its timezone."""
    tz = value.tz
    return dt.timestamp if tz is None else dt.Timestamp(timezone=str(tz))


@dt.infer.register(np.ndarray)
def infer_array(value):
    """Infer the ibis Array type of a numpy array.

    By default the array's own dtype maps directly to an ibis type; two
    ambiguous dtypes need special handling first.
    """
    np_dtype = value.dtype
    if np_dtype.type == np.object_:
        # dtype('O') is ambiguous: ask pandas to inspect the elements.
        inferred = infer_pandas_dtype(value, skipna=True)
        element_type = _inferable_pandas_dtypes[inferred]
    elif np_dtype.type == np.str_:
        # dtype('<U...') (string arrays) is ambiguous: always ibis string.
        element_type = dt.string
    else:
        # Unambiguous dtype: map it directly.
        element_type = dt.dtype(np_dtype)
    return dt.Array(element_type)


@sch.schema.register(pd.Series)
def schema_from_series(s):
    """Build an ibis Schema from a Series of name/type pairs.

    NOTE(review): assumes the Series index holds column names and the
    values hold types — confirm against callers.
    """
    pairs = tuple(s.items())
    return sch.schema(pairs)
Expand All @@ -215,7 +74,7 @@ def infer_pandas_schema(df, schema=None):
if column_name in schema:
ibis_dtype = dt.dtype(schema[column_name])
else:
ibis_dtype = _infer_pandas_series_contents(df[column_name])
ibis_dtype = dt.infer(df[column_name]).value_type

pairs.append((column_name, ibis_dtype))

Expand Down
4 changes: 0 additions & 4 deletions ibis/backends/pandas/tests/test_core.py
Expand Up @@ -175,10 +175,6 @@ def is_computable_input_my_object(_):
execute_node.reorder()
execute_node._cache.clear()

del dt.infer.funcs[(MyObject,)]
dt.infer.reorder()
dt.infer._cache.clear()


def test_scope_look_up():
# test if scope could lookup items properly
Expand Down
5 changes: 0 additions & 5 deletions ibis/backends/pandas/tests/test_datatypes.py
Expand Up @@ -4,18 +4,13 @@
import numpy as np
import pandas as pd
import pytest
from multipledispatch.conflict import ambiguities
from pandas.api.types import CategoricalDtype, DatetimeTZDtype

import ibis
import ibis.expr.datatypes as dt
import ibis.expr.schema as sch


def test_no_infer_ambiguities():
    """The ``dt.infer`` multipledispatch registry has no ambiguous signatures."""
    assert not ambiguities(dt.infer.funcs)


@pytest.mark.parametrize(
('value', 'expected_dtype'),
[
Expand Down
32 changes: 32 additions & 0 deletions ibis/expr/datatypes/core.py
Expand Up @@ -3,6 +3,7 @@
import numbers
from typing import Any, Iterable, Mapping, NamedTuple

import numpy as np
from multipledispatch import Dispatcher
from public import public

Expand Down Expand Up @@ -950,6 +951,37 @@ class INET(String):

Enum = String

# Lookup table translating numpy dtypes into ibis datatypes. Keys are
# written as dtype name strings and normalized through ``np.dtype``;
# aliases that normalize to the same dtype (e.g. "double"/"float64",
# "str"/"unicode") collapse to a single entry with the same value.
_numpy_dtypes = {
    np.dtype(name): ibis_type
    for name, ibis_type in [
        ("bool", boolean),
        ("int8", int8),
        ("int16", int16),
        ("int32", int32),
        ("int64", int64),
        ("uint8", uint8),
        ("uint16", uint16),
        ("uint32", uint32),
        ("uint64", uint64),
        ("float16", float16),
        ("float32", float32),
        ("float64", float64),
        ("double", float64),
        ("unicode", string),
        ("str", string),
        ("datetime64", timestamp),
        ("datetime64[ns]", timestamp),
        ("timedelta64", interval),
        ("timedelta64[ns]", Interval("ns")),
    ]
}


@dtype.register(np.dtype)
def _(value):
    """Convert a numpy dtype to the corresponding ibis datatype.

    Parameters
    ----------
    value : np.dtype
        The numpy dtype to translate.

    Raises
    ------
    TypeError
        If ``value`` has no ibis equivalent.
    """
    try:
        return _numpy_dtypes[value]
    except KeyError:
        # `from None` suppresses the internal KeyError so callers see a
        # clean TypeError instead of a chained implementation detail.
        raise TypeError(f"numpy dtype {value!r} is not supported") from None


public(
null=null,
boolean=boolean,
Expand Down

0 comments on commit 2e56540

Please sign in to comment.