19 changes: 19 additions & 0 deletions ibis/backends/pyarrow/datatypes.py
@@ -1,6 +1,7 @@
from __future__ import annotations

import functools
from typing import Iterable

import pyarrow as pa

@@ -24,6 +25,7 @@
dt.Binary: pa.binary(),
dt.Boolean: pa.bool_(),
dt.Timestamp: pa.timestamp('ns'),
dt.Date: pa.date64(),
}


@@ -129,3 +131,20 @@ def infer_pyarrow_schema(schema: pa.Schema) -> sch.Schema:
return sch.schema(
[(f.name, dt.dtype(f.type, nullable=f.nullable)) for f in schema]
)


def _schema_to_pyarrow_schema_fields(schema: sch.Schema) -> Iterable[pa.Field]:
for name, dtype in schema.items():
yield pa.field(name, dtype.to_pyarrow(), nullable=dtype.nullable)


def ibis_to_pyarrow_struct(schema: sch.Schema) -> pa.StructType:
return pa.struct(_schema_to_pyarrow_schema_fields(schema))


def ibis_to_pyarrow_schema(schema: sch.Schema) -> pa.Schema:
return pa.schema(_schema_to_pyarrow_schema_fields(schema))


dt.DataType.to_pyarrow = to_pyarrow_type # type: ignore
sch.Schema.to_pyarrow = ibis_to_pyarrow_schema # type: ignore
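
As a usage sketch, assuming an ibis build with this patch applied (the schema contents below are illustrative, not part of the diff):

import ibis
import ibis.expr.datatypes as dt
import ibis.backends.pyarrow.datatypes  # noqa: F401  # importing registers to_pyarrow

s = ibis.schema([('name', 'string'), ('yearID', 'int64')])
s.to_pyarrow()                 # pa.schema with string and int64 fields
dt.dtype('date').to_pyarrow()  # pa.date64(), per the mapping added above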
195 changes: 195 additions & 0 deletions ibis/backends/tests/test_export.py
@@ -0,0 +1,195 @@
import sys

import pyarrow as pa
import pytest
from pytest import param

# Importing this module adds `to_pyarrow` to ibis schema objects
from ibis.backends.pyarrow.datatypes import sch # noqa: F401


class PackageDiscarder:
"""Meta path finder that blocks imports of registered package names."""

def __init__(self):
self.pkgnames = []

def find_spec(self, fullname, path, target=None):
if fullname in self.pkgnames:
raise ImportError()


@pytest.fixture
def no_pyarrow(backend):
# pop any cached module so that importing pyarrow consults the finder
_pyarrow = sys.modules.pop('pyarrow', None)
d = PackageDiscarder()
d.pkgnames.append('pyarrow')
sys.meta_path.insert(0, d)
yield
sys.meta_path.remove(d)
if _pyarrow is not None:
sys.modules['pyarrow'] = _pyarrow
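

# The fixture relies on Python's meta path protocol; a minimal standalone
# sketch of the same trick (illustrative, outside pytest):
#
#     import sys
#
#     d = PackageDiscarder()
#     d.pkgnames.append('pyarrow')
#     sys.modules.pop('pyarrow', None)  # imports only consult finders on a cache miss
#     sys.meta_path.insert(0, d)
#     try:
#         import pyarrow  # raises ImportError while the finder is installed
#     except ImportError:
#         print('pyarrow import blocked')
#     finally:
#         sys.meta_path.remove(d)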


limit = [
param(
42,
id='limit',
marks=[
pytest.mark.notimpl(
[
# limit not implemented for execution in these backends
"clickhouse",
"dask",
"datafusion",
"impala",
"pandas",
"pyspark",
"snowflake",
]
),
],
),
]

no_limit = [
param(
None,
id='nolimit',
marks=[
pytest.mark.notimpl(
[
"clickhouse",
"dask",
"impala",
"pyspark",
"snowflake",
]
),
],
),
]

limit_no_limit = limit + no_limit


@pytest.mark.notyet(
["pandas"], reason="DataFrames have no option for outputting in batches"
)
@pytest.mark.parametrize("limit", limit_no_limit)
def test_table_to_pyarrow_batches(limit, backend, awards_players):
batch_reader = awards_players.to_pyarrow_batches(limit=limit)
assert isinstance(batch_reader, pa.RecordBatchReader)
batch = batch_reader.read_next_batch()
assert isinstance(batch, pa.RecordBatch)
if limit is not None:
assert len(batch) == limit


@pytest.mark.notyet(
["pandas"], reason="DataFrames have no option for outputting in batches"
)
@pytest.mark.parametrize("limit", limit_no_limit)
def test_column_to_pyarrow_batches(limit, backend, awards_players):
batch_reader = awards_players.awardID.to_pyarrow_batches(limit=limit)
assert isinstance(batch_reader, pa.RecordBatchReader)
batch = batch_reader.read_next_batch()
assert isinstance(batch, pa.RecordBatch)
if limit is not None:
assert len(batch) == limit


@pytest.mark.parametrize("limit", limit_no_limit)
def test_table_to_pyarrow_table(limit, backend, awards_players):
table = awards_players.to_pyarrow(limit=limit)
assert isinstance(table, pa.Table)
if limit is not None:
assert len(table) == limit


@pytest.mark.parametrize("limit", limit_no_limit)
def test_column_to_pyarrow_array(limit, backend, awards_players):
array = awards_players.awardID.to_pyarrow(limit=limit)
assert isinstance(array, pa.Array)
if limit is not None:
assert len(array) == limit


@pytest.mark.notyet(
["datafusion"], reason="DataFusion backend doesn't support sum"
)
@pytest.mark.parametrize("limit", no_limit)
def test_scalar_to_pyarrow_scalar(limit, backend, awards_players):
scalar = awards_players.yearID.sum().to_pyarrow(limit=limit)
assert isinstance(scalar, pa.Scalar)


@pytest.mark.notimpl(["dask", "clickhouse", "impala", "pyspark"])
@pytest.mark.notyet(
["datafusion"],
reason="""
fields' nullability from frame.schema() is not always consistent with
the first record batch's schema
""",
)
def test_table_to_pyarrow_table_schema(backend, awards_players):
table = awards_players.to_pyarrow()
assert isinstance(table, pa.Table)
assert table.schema == awards_players.schema().to_pyarrow()


@pytest.mark.notimpl(["dask", "clickhouse", "impala", "pyspark"])
def test_column_to_pyarrow_table_schema(backend, awards_players):
expr = awards_players.awardID
array = expr.to_pyarrow()
assert isinstance(array, pa.Array)
assert array.type == expr.type().to_pyarrow()


@pytest.mark.notimpl(
["pandas", "dask", "clickhouse", "impala", "pyspark", "datafusion"]
)
def test_table_pyarrow_batch_chunk_size(backend, awards_players):
batch_reader = awards_players.to_pyarrow_batches(
limit=2050, chunk_size=2048
)
assert isinstance(batch_reader, pa.RecordBatchReader)
batch = batch_reader.read_next_batch()
assert isinstance(batch, pa.RecordBatch)
assert len(batch) == 2048


@pytest.mark.notimpl(
["pandas", "dask", "clickhouse", "impala", "pyspark", "datafusion"]
)
def test_column_pyarrow_batch_chunk_size(backend, awards_players):
batch_reader = awards_players.awardID.to_pyarrow_batches(
limit=2050, chunk_size=2048
)
assert isinstance(batch_reader, pa.RecordBatchReader)
batch = batch_reader.read_next_batch()
assert isinstance(batch, pa.RecordBatch)
assert len(batch) == 2048


@pytest.mark.notimpl(
["pandas", "dask", "clickhouse", "impala", "pyspark", "datafusion"]
)
@pytest.mark.broken(
["sqlite"],
raises=pa.ArrowException,
reason="Test data has empty strings in columns typed as int64",
)
def test_to_pyarrow_batches_borked_types(backend, batting):
"""This is a temporary test to expose an(other) issue with sqlite typing
shenanigans."""
batch_reader = batting.to_pyarrow_batches(limit=42)
assert isinstance(batch_reader, pa.RecordBatchReader)
batch = batch_reader.read_next_batch()
assert isinstance(batch, pa.RecordBatch)
assert len(batch) == 42


def test_no_pyarrow_message(backend, awards_players, no_pyarrow):
with pytest.raises(ModuleNotFoundError) as excinfo:
awards_players.to_pyarrow()

assert "requires `pyarrow` but" in str(excinfo.value)
71 changes: 70 additions & 1 deletion ibis/expr/types/core.py
@@ -2,7 +2,7 @@

import os
import webbrowser
from typing import TYPE_CHECKING, Any, Mapping
from typing import TYPE_CHECKING, Any, Iterable, Mapping

import toolz
from public import public
@@ -16,6 +16,8 @@
from ibis.util import UnnamedMarker

if TYPE_CHECKING:
import pyarrow as pa

import ibis.expr.types as ir
from ibis.backends.base import BaseBackend

@@ -306,6 +308,73 @@ def compile(
self, limit=limit, timecontext=timecontext, params=params
)

def to_pyarrow_batches(
self,
*,
limit: int | str | None = None,
params: Mapping[ir.Value, Any] | None = None,
chunk_size: int = 1_000_000,
**kwargs: Any,
) -> Iterable[pa.RecordBatch]:
"""Execute expression and return results in an iterator of pyarrow
record batches.
**Warning**: This method is eager and will execute the associated
expression immediately. This API is experimental and subject to change.
Parameters
----------
limit
An integer to effect a specific row limit. A value of `None` means
"no limit". The default is in `ibis/config.py`.
params
Mapping of scalar parameter expressions to value.
chunk_size
Number of rows in each returned record batch.
Returns
-------
record_batches
An iterator of pyarrow record batches.
"""
return self._find_backend().to_pyarrow_batches(
self,
params=params,
limit=limit,
chunk_size=chunk_size,
**kwargs,
)

def to_pyarrow(
self,
*,
params: Mapping[ir.Scalar, Any] | None = None,
limit: int | str | None = None,
**kwargs: Any,
) -> pa.Table:
"""Execute expression and return results in as a pyarrow table.
**Warning**: This method is eager and will execute the associated
expression immediately. This API is experimental and subject to change.
Parameters
----------
limit
An integer to effect a specific row limit. A value of `None` means
"no limit". The default is in `ibis/config.py`.
params
Mapping of scalar parameter expressions to value.
Returns
-------
Table
A pyarrow table holding the results of the executed expression.
"""
return self._find_backend().to_pyarrow(
self, params=params, limit=limit, **kwargs
)


unnamed = UnnamedMarker()

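A hedged usage sketch of the new eager APIs (the SQLite database path and table name here are hypothetical):

import ibis

con = ibis.sqlite.connect('lahman.db')  # hypothetical database
t = con.table('awards_players')

tbl = t.to_pyarrow(limit=10)            # pa.Table with at most 10 rows
arr = t.awardID.to_pyarrow(limit=10)    # pa.Array
reader = t.to_pyarrow_batches(chunk_size=2048)
for batch in reader:                    # each element is a pa.RecordBatch
    ...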
8 changes: 4 additions & 4 deletions ibis/expr/types/generic.py
@@ -2,10 +2,6 @@

from typing import TYPE_CHECKING, Any, Iterable, Literal, Sequence

if TYPE_CHECKING:
import ibis.expr.types as ir
import ibis.expr.window as win

from public import public
from rich.jupyter import JupyterMixin

@@ -15,6 +11,10 @@
import ibis.expr.operations as ops
from ibis.expr.types.core import Expr, _binop

if TYPE_CHECKING:
import ibis.expr.types as ir
import ibis.expr.window as win


@public
class Value(Expr):