diff --git a/ibis/backends/duckdb/__init__.py b/ibis/backends/duckdb/__init__.py index d45e25524bbb..b27c49f33aed 100644 --- a/ibis/backends/duckdb/__init__.py +++ b/ibis/backends/duckdb/__init__.py @@ -22,6 +22,7 @@ from ibis.backends.base.sql.alchemy import BaseAlchemyBackend from ibis.backends.duckdb.compiler import DuckDBSQLCompiler from ibis.backends.duckdb.datatypes import parse +from ibis.backends.pandas.client import DataFrameProxy if TYPE_CHECKING: import pandas as pd @@ -418,7 +419,7 @@ def read_in_memory( ir.Table The just-registered table """ - table_name = table_name or f"ibis_read_in_memory_{next(pd_n)}" + table_name = table_name or util.gen_name("read_in_memory") with self.begin() as con: con.connection.register(table_name, dataframe) @@ -710,6 +711,8 @@ def _metadata(self, query: str) -> Iterator[tuple[str, dt.DataType]]: def _register_in_memory_table(self, op: ops.InMemoryTable) -> None: # in theory we could use pandas dataframes, but when using dataframes # with pyarrow datatypes later reads of this data segfault + import pandas as pd + schema = op.schema if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]: raise exc.IbisTypeError( @@ -717,10 +720,34 @@ def _register_in_memory_table(self, op: ops.InMemoryTable) -> None: f"got null typed columns: {null_columns}" ) - name = op.name - table = op.data.to_pyarrow(schema) - with self.begin() as con: - con.connection.register(name, table) + # only register if we haven't already done so + if (name := op.name) not in self.list_tables(): + if isinstance(data := op.data, DataFrameProxy): + table = data.to_frame() + + # convert to object string dtypes because duckdb is either + # 1. extremely slow to register DataFrames with not-pyarrow + # string dtypes + # 2. broken for string[pyarrow] dtypes (segfault) + if conversions := { + colname: "str" + for colname, col in table.items() + if isinstance(col.dtype, pd.StringDtype) + }: + table = table.astype(conversions) + else: + table = data.to_pyarrow(schema) + + # register creates a transaction, and we can't nest transactions so + # we create a function to encapsulate the whole shebang + def _register(name, table): + with self.begin() as con: + con.connection.register(name, table) + + try: + _register(name, table) + except duckdb.NotImplementedException: + _register(name, data.to_pyarrow(schema)) def _get_sqla_table( self, name: str, schema: str | None = None, **kwargs: Any diff --git a/ibis/backends/duckdb/tests/test_register.py b/ibis/backends/duckdb/tests/test_register.py index d601d87c2bad..34c3fb7ce678 100644 --- a/ibis/backends/duckdb/tests/test_register.py +++ b/ibis/backends/duckdb/tests/test_register.py @@ -3,7 +3,9 @@ import tempfile from pathlib import Path +import numpy as np import pandas as pd +import pandas.testing as tm import pyarrow as pa import pytest @@ -211,3 +213,16 @@ def test_s3_403_fallback(con, httpserver, monkeypatch): # pyarrow, which indicates the fallback worked with pytest.raises(pa.lib.ArrowInvalid): con.read_parquet(httpserver.url_for("/myfile")) + + +@pytest.mark.broken( + ["duckdb"], + reason=""" +the fix for this (issue #5879) caused a serious performance regression in the repr. +added this xfail in #5959, which also reverted the bugfix that caused the regression. + """, +) +def test_register_numpy_str(con): + data = pd.DataFrame({"a": [np.str_("xyz"), None]}) + result = con.read_in_memory(data) + tm.assert_frame_equal(result.execute(), data)