Skip to content

Commit

Permalink
fix(duckdb): branch to avoid unnecessary dataframe construction
Browse files Browse the repository at this point in the history
This and the associated revert take repr performance back to a
reasonable place.
  • Loading branch information
gforsyth authored and cpcloud committed Apr 11, 2023
1 parent 4544dab commit 9d5d943
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 5 deletions.
37 changes: 32 additions & 5 deletions ibis/backends/duckdb/__init__.py
Expand Up @@ -22,6 +22,7 @@
from ibis.backends.base.sql.alchemy import BaseAlchemyBackend
from ibis.backends.duckdb.compiler import DuckDBSQLCompiler
from ibis.backends.duckdb.datatypes import parse
from ibis.backends.pandas.client import DataFrameProxy

if TYPE_CHECKING:
import pandas as pd
Expand Down Expand Up @@ -418,7 +419,7 @@ def read_in_memory(
ir.Table
The just-registered table
"""
table_name = table_name or f"ibis_read_in_memory_{next(pd_n)}"
table_name = table_name or util.gen_name("read_in_memory")
with self.begin() as con:
con.connection.register(table_name, dataframe)

Expand Down Expand Up @@ -710,17 +711,43 @@ def _metadata(self, query: str) -> Iterator[tuple[str, dt.DataType]]:
def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
    """Register the data backing an in-memory table with DuckDB.

    Parameters
    ----------
    op
        The in-memory table operation whose data should be made visible
        to the DuckDB connection under ``op.name``.

    Raises
    ------
    exc.IbisTypeError
        If the table's schema contains any ``null``-typed columns, which
        DuckDB cannot yet reliably handle.
    """
    import pandas as pd

    schema = op.schema
    # Fail fast on all-null columns: DuckDB cannot infer a usable type.
    if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]:
        raise exc.IbisTypeError(
            "DuckDB cannot yet reliably handle `null` typed columns; "
            f"got null typed columns: {null_columns}"
        )

    # only register if we haven't already done so
    if (name := op.name) not in self.list_tables():
        if isinstance(data := op.data, DataFrameProxy):
            # Registering the pandas frame directly avoids an unnecessary
            # (and slow) conversion to pyarrow for the common repr path.
            table = data.to_frame()

            # convert to object string dtypes because duckdb is either
            # 1. extremely slow to register DataFrames with not-pyarrow
            #    string dtypes
            # 2. broken for string[pyarrow] dtypes (segfault)
            if conversions := {
                colname: "str"
                for colname, col in table.items()
                if isinstance(col.dtype, pd.StringDtype)
            }:
                table = table.astype(conversions)
        else:
            table = data.to_pyarrow(schema)

        # register creates a transaction, and we can't nest transactions so
        # we create a function to encapsulate the whole shebang
        def _register(name, table):
            with self.begin() as con:
                con.connection.register(name, table)

        try:
            _register(name, table)
        except duckdb.NotImplementedException:
            # DuckDB couldn't handle the pandas object directly (e.g. odd
            # object-dtype contents); fall back to a pyarrow table.
            _register(name, data.to_pyarrow(schema))

def _get_sqla_table(
self, name: str, schema: str | None = None, **kwargs: Any
Expand Down
15 changes: 15 additions & 0 deletions ibis/backends/duckdb/tests/test_register.py
Expand Up @@ -3,7 +3,9 @@
import tempfile
from pathlib import Path

import numpy as np
import pandas as pd
import pandas.testing as tm
import pyarrow as pa
import pytest

Expand Down Expand Up @@ -211,3 +213,16 @@ def test_s3_403_fallback(con, httpserver, monkeypatch):
# pyarrow, which indicates the fallback worked
with pytest.raises(pa.lib.ArrowInvalid):
con.read_parquet(httpserver.url_for("/myfile"))


@pytest.mark.broken(
    ["duckdb"],
    reason="""
the fix for this (issue #5879) caused a serious performance regression in the repr.
added this xfail in #5959, which also reverted the bugfix that caused the regression.
""",
)
def test_register_numpy_str(con):
    """`read_in_memory` should round-trip a frame holding ``numpy.str_`` values."""
    expected = pd.DataFrame({"a": [np.str_("xyz"), None]})
    table = con.read_in_memory(expected)
    tm.assert_frame_equal(table.execute(), expected)

0 comments on commit 9d5d943

Please sign in to comment.