Skip to content

Commit

Permalink
apacheGH-39651: [Python] Basic pyarrow bindings for Binary/StringView…
Browse files Browse the repository at this point in the history
… classes (apache#39652)

### Rationale for this change

First step for apache#39633: exposing the Array, DataType and Scalar classes for BinaryView and StringView, such that those can already be represented in pyarrow.

(I exposed a variant of StringBuilder as well, just for now to be able to create test data)

* Closes: apache#39651

Authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Signed-off-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
  • Loading branch information
jorisvandenbossche committed Jan 30, 2024
1 parent c6ab286 commit 787afa1
Show file tree
Hide file tree
Showing 16 changed files with 223 additions and 6 deletions.
4 changes: 4 additions & 0 deletions docs/source/python/api/arrays.rst
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ may expose data type-specific methods or properties.
FixedSizeBinaryArray
LargeBinaryArray
LargeStringArray
BinaryViewArray
StringViewArray
Time32Array
Time64Array
Date32Array
Expand Down Expand Up @@ -119,6 +121,8 @@ classes may expose data type-specific methods or properties.
FixedSizeBinaryScalar
LargeBinaryScalar
LargeStringScalar
BinaryViewScalar
StringViewScalar
Time32Scalar
Time64Scalar
Date32Scalar
Expand Down
4 changes: 4 additions & 0 deletions docs/source/python/api/datatypes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ These should be used to create Arrow data types and schemas.
large_binary
large_string
large_utf8
binary_view
string_view
decimal128
list_
large_list
Expand Down Expand Up @@ -168,6 +170,8 @@ represents a given data type (such as ``int32``) or general category
is_large_binary
is_large_unicode
is_large_string
is_binary_view
is_string_view
is_fixed_size_binary
is_map
is_dictionary
Expand Down
7 changes: 4 additions & 3 deletions python/pyarrow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def print_entry(label, value):
time32, time64, timestamp, date32, date64, duration,
month_day_nano_interval,
float16, float32, float64,
binary, string, utf8,
binary, string, utf8, binary_view, string_view,
large_binary, large_string, large_utf8,
decimal128, decimal256,
list_, large_list, map_, struct,
Expand Down Expand Up @@ -205,6 +205,7 @@ def print_entry(label, value):
FixedSizeListArray, UnionArray,
BinaryArray, StringArray,
LargeBinaryArray, LargeStringArray,
BinaryViewArray, StringViewArray,
FixedSizeBinaryArray,
DictionaryArray,
Date32Array, Date64Array, TimestampArray,
Expand All @@ -223,8 +224,8 @@ def print_entry(label, value):
Time32Scalar, Time64Scalar,
TimestampScalar, DurationScalar,
MonthDayNanoIntervalScalar,
BinaryScalar, LargeBinaryScalar,
StringScalar, LargeStringScalar,
BinaryScalar, LargeBinaryScalar, BinaryViewScalar,
StringScalar, LargeStringScalar, StringViewScalar,
FixedSizeBinaryScalar, DictionaryScalar,
MapScalar, StructScalar, UnionScalar,
RunEndEncodedScalar, ExtensionScalar)
Expand Down
14 changes: 14 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -2942,6 +2942,12 @@ cdef class LargeStringArray(Array):
null_count, offset)


cdef class StringViewArray(Array):
"""
Concrete class for Arrow arrays of string (or utf8) view data type.
"""


cdef class BinaryArray(Array):
"""
Concrete class for Arrow arrays of variable-sized binary data type.
Expand All @@ -2968,6 +2974,12 @@ cdef class LargeBinaryArray(Array):
return (<CLargeBinaryArray*> self.ap).total_values_length()


cdef class BinaryViewArray(Array):
"""
Concrete class for Arrow arrays of variable-sized binary view data type.
"""


cdef class DictionaryArray(Array):
"""
Concrete class for dictionary-encoded Arrow arrays.
Expand Down Expand Up @@ -3669,6 +3681,8 @@ cdef dict _array_classes = {
_Type_STRING: StringArray,
_Type_LARGE_BINARY: LargeBinaryArray,
_Type_LARGE_STRING: LargeStringArray,
_Type_BINARY_VIEW: BinaryViewArray,
_Type_STRING_VIEW: StringViewArray,
_Type_DICTIONARY: DictionaryArray,
_Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray,
_Type_DECIMAL128: Decimal128Array,
Expand Down
66 changes: 66 additions & 0 deletions python/pyarrow/builder.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -80,3 +80,69 @@ cdef class StringBuilder(_Weakrefable):

def __len__(self):
return self.builder.get().length()


cdef class StringViewBuilder(_Weakrefable):
    """
    Builder class for UTF8 string views.

    This class exposes facilities for incrementally adding string values and
    building the null bitmap for a pyarrow.Array (type='string_view').
    """
    cdef:
        unique_ptr[CStringViewBuilder] builder

    def __cinit__(self, MemoryPool memory_pool=None):
        # The C++ builder is owned by this object through the unique_ptr.
        cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
        self.builder.reset(new CStringViewBuilder(pool))

    def append(self, value):
        """
        Append a single value to the builder.

        The value can either be a string/bytes object or a null value
        (NaN or None).

        Parameters
        ----------
        value : string/bytes or np.nan/None
            The value to append to the string array builder.

        Raises
        ------
        TypeError
            If `value` is not a str, bytes, None or floating-point NaN.
        """
        # NaN is detected by value (NaN != NaN) instead of by identity with
        # the ``np.nan`` singleton, so that NaNs which are not that exact
        # object (e.g. a float64 element read from a NumPy array, or
        # ``float('nan')``) are treated as null too. ``np.nan`` itself is a
        # Python float and is therefore still covered.
        if value is None or (
                isinstance(value, (float, np.floating)) and value != value):
            # NOTE(review): the result of AppendNull() is not inspected here;
            # if it returns a Status, a failure would go unnoticed - confirm.
            self.builder.get().AppendNull()
        elif isinstance(value, (bytes, str)):
            # NOTE(review): Append is declared as returning CStatus in
            # libarrow.pxd; that status is discarded here.
            self.builder.get().Append(tobytes(value))
        else:
            raise TypeError('StringViewBuilder only accepts string objects')

    def append_values(self, values):
        """
        Append all the values from an iterable.

        Parameters
        ----------
        values : iterable of string/bytes or np.nan/None values
            The values to append to the string array builder.
        """
        # Each element goes through append(), so the same null handling and
        # type checking apply to every value.
        for value in values:
            self.append(value)

    def finish(self):
        """
        Return result of builder as an Array object; also resets the builder.

        Returns
        -------
        array : pyarrow.Array
        """
        cdef shared_ptr[CArray] out
        # The C++ call does not touch Python objects, so the GIL is released.
        # NOTE(review): the value returned by Finish() is not checked -
        # presumably a Status; confirm whether it should be propagated.
        with nogil:
            self.builder.get().Finish(&out)
        return pyarrow_wrap_array(out)

    @property
    def null_count(self):
        """Number of null entries reported by the underlying builder."""
        return self.builder.get().null_count()

    def __len__(self):
        """Number of entries (values and nulls) reported by the builder."""
        return self.builder.get().length()
9 changes: 9 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
_Type_LARGE_BINARY" arrow::Type::LARGE_BINARY"
_Type_LARGE_STRING" arrow::Type::LARGE_STRING"
_Type_FIXED_SIZE_BINARY" arrow::Type::FIXED_SIZE_BINARY"
_Type_BINARY_VIEW" arrow::Type::BINARY_VIEW"
_Type_STRING_VIEW" arrow::Type::STRING_VIEW"

_Type_LIST" arrow::Type::LIST"
_Type_LARGE_LIST" arrow::Type::LARGE_LIST"
Expand Down Expand Up @@ -1295,7 +1297,14 @@ cdef extern from "arrow/builder.h" namespace "arrow" nogil:

cdef cppclass CStringBuilder" arrow::StringBuilder"(CBinaryBuilder):
CStringBuilder(CMemoryPool* pool)
CStatus Append(const c_string& value)

cdef cppclass CBinaryViewBuilder" arrow::BinaryViewBuilder"(CArrayBuilder):
CBinaryViewBuilder(shared_ptr[CDataType], CMemoryPool* pool)
CStatus Append(const char* value, int32_t length)

cdef cppclass CStringViewBuilder" arrow::StringViewBuilder"(CBinaryViewBuilder):
CStringViewBuilder(CMemoryPool* pool)
CStatus Append(const c_string& value)

cdef cppclass CTimestampBuilder "arrow::TimestampBuilder"(CArrayBuilder):
Expand Down
8 changes: 8 additions & 0 deletions python/pyarrow/lib.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,14 @@ cdef class BinaryArray(Array):
pass


cdef class StringViewArray(Array):
pass


cdef class BinaryViewArray(Array):
pass


cdef class DictionaryArray(Array):
cdef:
object _indices, _dictionary
Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ Type_STRING = _Type_STRING
Type_LARGE_BINARY = _Type_LARGE_BINARY
Type_LARGE_STRING = _Type_LARGE_STRING
Type_FIXED_SIZE_BINARY = _Type_FIXED_SIZE_BINARY
Type_BINARY_VIEW = _Type_BINARY_VIEW
Type_STRING_VIEW = _Type_STRING_VIEW
Type_LIST = _Type_LIST
Type_LARGE_LIST = _Type_LARGE_LIST
Type_MAP = _Type_MAP
Expand Down
10 changes: 10 additions & 0 deletions python/pyarrow/scalar.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -665,6 +665,14 @@ cdef class LargeStringScalar(StringScalar):
pass


cdef class BinaryViewScalar(BinaryScalar):
pass


cdef class StringViewScalar(StringScalar):
pass


cdef class ListScalar(Scalar):
"""
Concrete class for list-like scalars.
Expand Down Expand Up @@ -1051,8 +1059,10 @@ cdef dict _scalar_classes = {
_Type_BINARY: BinaryScalar,
_Type_LARGE_BINARY: LargeBinaryScalar,
_Type_FIXED_SIZE_BINARY: FixedSizeBinaryScalar,
_Type_BINARY_VIEW: BinaryViewScalar,
_Type_STRING: StringScalar,
_Type_LARGE_STRING: LargeStringScalar,
_Type_STRING_VIEW: StringViewScalar,
_Type_LIST: ListScalar,
_Type_LARGE_LIST: LargeListScalar,
_Type_FIXED_SIZE_LIST: FixedSizeListScalar,
Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/src/arrow/python/helpers.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
GET_PRIMITIVE_TYPE(STRING, utf8);
GET_PRIMITIVE_TYPE(LARGE_BINARY, large_binary);
GET_PRIMITIVE_TYPE(LARGE_STRING, large_utf8);
GET_PRIMITIVE_TYPE(BINARY_VIEW, binary_view);
GET_PRIMITIVE_TYPE(STRING_VIEW, utf8_view);
GET_PRIMITIVE_TYPE(INTERVAL_MONTH_DAY_NANO, month_day_nano_interval);
default:
return nullptr;
Expand Down
21 changes: 20 additions & 1 deletion python/pyarrow/tests/test_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import numpy as np

import pyarrow as pa
from pyarrow.lib import StringBuilder
from pyarrow.lib import StringBuilder, StringViewBuilder


def test_weakref():
Expand Down Expand Up @@ -65,3 +65,22 @@ def test_string_builder_append_after_finish():
sbuilder.append("No effect")
expected = [None, None, "text", None, "other text"]
assert arr.to_pylist() == expected


def test_string_view_builder():
    """Check that StringViewBuilder collects values and nulls into an array."""
    sv_builder = StringViewBuilder()

    # Single appends: a bytes value, two str values and NaN (counted as null).
    single_values = (
        b"a byte string",
        "a string",
        "a longer not-inlined string",
        np.nan,
    )
    for item in single_values:
        sv_builder.append(item)

    # Bulk append: None is counted as null as well.
    sv_builder.append_values([None, "text"])

    # State of the builder before finishing.
    assert len(sv_builder) == 6
    assert sv_builder.null_count == 2

    result = sv_builder.finish()
    assert isinstance(result, pa.Array)
    assert result.null_count == 2
    assert result.type == 'string_view'
    assert result.to_pylist() == [
        "a byte string",
        "a string",
        "a longer not-inlined string",
        None,
        None,
        "text",
    ]
4 changes: 4 additions & 0 deletions python/pyarrow/tests/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,8 @@ def test_set_timezone_db_path_non_windows():
pa.UnionArray,
pa.BinaryArray,
pa.StringArray,
pa.BinaryViewArray,
pa.StringViewArray,
pa.FixedSizeBinaryArray,
pa.DictionaryArray,
pa.Date32Array,
Expand Down Expand Up @@ -221,6 +223,8 @@ def test_set_timezone_db_path_non_windows():
pa.StringScalar,
pa.BinaryScalar,
pa.FixedSizeBinaryScalar,
pa.BinaryViewScalar,
pa.StringViewScalar,
pa.ListScalar,
pa.LargeListScalar,
pa.MapScalar,
Expand Down
28 changes: 26 additions & 2 deletions python/pyarrow/tests/test_scalars.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@
(b"bytes", None, pa.BinaryScalar),
("largestring", pa.large_string(), pa.LargeStringScalar),
(b"largebytes", pa.large_binary(), pa.LargeBinaryScalar),
# TODO(GH-39633) pa.scalar(..) requires python->arrow conversion to be implemented
# ("string_view", pa.string_view(), pa.StringViewScalar),
# (b"bytes_view", pa.binary_view(), pa.BinaryViewScalar),
(b"abc", pa.binary(3), pa.FixedSizeBinaryScalar),
([1, 2, 3], None, pa.ListScalar),
([1, 2, 3, 4], pa.large_list(pa.int8()), pa.LargeListScalar),
Expand Down Expand Up @@ -488,7 +491,8 @@ def test_month_day_nano_interval():
@pytest.mark.parametrize('value', ['foo', 'mañana'])
@pytest.mark.parametrize(('ty', 'scalar_typ'), [
(pa.string(), pa.StringScalar),
(pa.large_string(), pa.LargeStringScalar)
(pa.large_string(), pa.LargeStringScalar),
# (pa.string_view(), pa.StringViewScalar),
])
def test_string(value, ty, scalar_typ):
s = pa.scalar(value, type=ty)
Expand All @@ -503,10 +507,30 @@ def test_string(value, ty, scalar_typ):
assert buf.to_pybytes() == value.encode()


@pytest.mark.parametrize('value', ['foo', 'mañana'])
def test_string_view(value):
    """Check a scalar taken from a string_view array against its source str."""
    # TODO: replace with normal scalar construction
    sv_builder = pa.lib.StringViewBuilder()
    sv_builder.append(value)
    column = sv_builder.finish()

    scalar = column[0]
    assert isinstance(scalar, pa.StringViewScalar)

    # Conversion back to Python and the textual representations.
    assert scalar.as_py() == value
    assert scalar.as_py() != 'something'
    assert repr(value) in repr(scalar)
    assert str(scalar) == str(value)

    # The buffer view must hold the encoded bytes of the original string.
    raw = scalar.as_buffer()
    assert isinstance(raw, pa.Buffer)
    assert raw.to_pybytes() == value.encode()


@pytest.mark.parametrize('value', [b'foo', b'bar'])
@pytest.mark.parametrize(('ty', 'scalar_typ'), [
(pa.binary(), pa.BinaryScalar),
(pa.large_binary(), pa.LargeBinaryScalar)
(pa.large_binary(), pa.LargeBinaryScalar),
# (pa.binary_view(), pa.BinaryViewScalar),
])
def test_binary(value, ty, scalar_typ):
s = pa.scalar(value, type=ty)
Expand Down
8 changes: 8 additions & 0 deletions python/pyarrow/tests/test_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ def get_many_types():
pa.binary(10),
pa.large_string(),
pa.large_binary(),
pa.string_view(),
pa.binary_view(),
pa.list_(pa.int32()),
pa.list_(pa.int32(), 2),
pa.large_list(pa.uint16()),
Expand Down Expand Up @@ -244,6 +246,12 @@ def test_is_binary_string():
assert types.is_fixed_size_binary(pa.binary(5))
assert not types.is_fixed_size_binary(pa.binary())

assert types.is_string_view(pa.string_view())
assert not types.is_string_view(pa.string())
assert types.is_binary_view(pa.binary_view())
assert not types.is_binary_view(pa.binary())
assert not types.is_binary_view(pa.string_view())


def test_is_temporal_date_time_timestamp():
date_types = [pa.date32(), pa.date64()]
Expand Down
Loading

0 comments on commit 787afa1

Please sign in to comment.