GH-34787: [Python] Accept zero_copy_only=False for ChunkedArray.to_numpy

Signed-off-by: Julien Jerphanion <git@jjerphan.xyz>
jjerphan committed May 13, 2023
1 parent cd6e2a4 commit 44087a1
Showing 2 changed files with 62 additions and 28 deletions.
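In short, ChunkedArray.to_numpy gains a zero_copy_only keyword so it matches the signature of pyarrow.Array.to_numpy and the two can be called interchangeably; because a chunked array's data is split across separate buffers, the conversion always copies, and passing True raises. A minimal sketch of the new behavior (illustrative only, not part of the commit):

import pyarrow as pa

chunked = pa.chunked_array([[1, 2, 3], [4, 5]])

# Default mode copies: no single contiguous buffer backs all chunks.
np_arr = chunked.to_numpy()

# Requesting zero-copy now raises a clear ValueError instead of a
# TypeError about an unexpected keyword argument.
chunked.to_numpy(zero_copy_only=True)  # ValueError
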
16 changes: 15 additions & 1 deletion python/pyarrow/table.pxi
@@ -471,10 +471,16 @@ cdef class ChunkedArray(_PandasConvertible):

return _array_like_to_pandas(self, options, types_mapper=types_mapper)

-    def to_numpy(self):
+    def to_numpy(self, zero_copy_only=False):
        """
        Return a NumPy copy of this array (experimental).

        Parameters
        ----------
        zero_copy_only : bool, default False
            Introduced for signature consistency with pyarrow.Array.to_numpy.
            This must be False here, since a NumPy array's buffer must be
            contiguous.

        Returns
        -------
        array : numpy.ndarray
@@ -491,6 +497,14 @@ cdef class ChunkedArray(_PandasConvertible):
PandasOptions c_options
object values

if zero_copy_only:
raise ValueError(
"zero_copy_only must be False for pyarrow.ChunkedArray.to_numpy"
)

# c_options.decode_dictionaries = False
# c_options.zero_copy_only = False

with nogil:
check_status(
ConvertChunkedArrayToPandas(
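For contrast, pyarrow.Array.to_numpy does support zero-copy: a primitive array without nulls is backed by one contiguous buffer, and its zero_copy_only parameter defaults to True. A short sketch of the asymmetry the docstring above describes (assuming the documented Array API):

import pyarrow as pa

arr = pa.array([1, 2, 3], type=pa.int64())
view = arr.to_numpy(zero_copy_only=True)  # OK: one contiguous buffer

chunked = pa.chunked_array([[1, 2], [3]])
copied = chunked.to_numpy()               # always a copy across chunks
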
74 changes: 47 additions & 27 deletions python/pyarrow/tests/test_array.py
@@ -28,6 +28,7 @@
import weakref

import numpy as np

try:
import pickle5
except ImportError:
@@ -168,6 +169,28 @@ def test_to_numpy_zero_copy():
np.testing.assert_array_equal(np_arr, expected)


def test_chunked_array_to_numpy_zero_copy():
    elements = [[2, 2, 4], [4, 5, 100]]

    chunked_arr = pa.chunked_array(elements)

    msg = "zero_copy_only must be False for pyarrow.ChunkedArray.to_numpy"

    with pytest.raises(ValueError, match=msg):
        chunked_arr.to_numpy(zero_copy_only=True)

    np_arr = chunked_arr.to_numpy()

    # Drop the only reference to the ChunkedArray and force a collection;
    # the copy returned by to_numpy() must remain valid on its own.
    chunked_arr = None
    import gc
    gc.collect()

    # Ensure base is still valid
    assert np_arr.base is not None
    expected = [2, 2, 4, 4, 5, 100]
    np.testing.assert_array_equal(np_arr, expected)
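
Note that the guard in table.pxi runs before any chunk inspection, so even a single-chunk ChunkedArray raises; a quick check (a sketch, not part of the commit) would be:

import pyarrow as pa

single = pa.chunked_array([[1, 2, 3]])  # just one chunk
try:
    single.to_numpy(zero_copy_only=True)
except ValueError as exc:
    print(exc)  # zero_copy_only must be False for pyarrow.ChunkedArray.to_numpy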


def test_to_numpy_unsupported_types():
# ARROW-2871: Some primitive types are not yet supported in to_numpy
bool_arr = pa.array([True, False, True])
@@ -517,7 +540,6 @@ def test_struct_array_slice():


def test_array_factory_invalid_type():

class MyObject:
pass

@@ -733,7 +755,7 @@ def test_struct_array_from_chunked():
@pytest.mark.parametrize("offset", (0, 1))
def test_dictionary_from_buffers(offset):
a = pa.array(["one", "two", "three", "two", "one"]).dictionary_encode()
-    b = pa.DictionaryArray.from_buffers(a.type, len(a)-offset,
+    b = pa.DictionaryArray.from_buffers(a.type, len(a) - offset,
a.indices.buffers(), a.dictionary,
offset=offset)
assert a[offset:] == b
@@ -934,16 +956,16 @@ def test_list_from_arrays(list_array_type, list_type_factory):


@pytest.mark.parametrize(('list_array_type', 'list_type_factory'), (
    (pa.ListArray, pa.list_),
    (pa.LargeListArray, pa.large_list)
))
@pytest.mark.parametrize("arr", (
    [None, [0]],
    [None, [0, None], [0]],
    [[0], [1]],
))
def test_list_array_types_from_arrays(
    list_array_type, list_type_factory, arr
):
arr = pa.array(arr, list_type_factory(pa.int8()))
reconstructed_arr = list_array_type.from_arrays(
@@ -952,8 +974,8 @@


@pytest.mark.parametrize(('list_array_type', 'list_type_factory'), (
    (pa.ListArray, pa.list_),
    (pa.LargeListArray, pa.large_list)
))
def test_list_array_types_from_arrays_fail(list_array_type, list_type_factory):
# Fail when manual offsets include nulls and mask passed
@@ -1400,7 +1422,6 @@ def test_cast_chunked_array_empty():
# ARROW-8142
for typ1, typ2 in [(pa.dictionary(pa.int8(), pa.string()), pa.string()),
(pa.int64(), pa.int32())]:

arr = pa.chunked_array([], type=typ1)
result = arr.cast(typ2)
expected = pa.chunked_array([], type=typ2)
@@ -1971,7 +1992,7 @@ def test_cast_identities(ty, values):
([[4, 5], [6]], pa.large_list(pa.int16())),
([['a'], None, ['b', 'c']], pa.list_(pa.string())),
    ([(1, 'a'), (2, 'c'), None],
     pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))
]
)

@@ -2107,6 +2128,7 @@ def _check_case(ty):
arr = pa.array(['string', np.nan], type=ty, from_pandas=True)
expected = pa.array(['string', None], type=ty)
assert arr.equals(expected)

_check_case('binary')
_check_case('utf8')

@@ -2350,9 +2372,9 @@ def test_interval_array_from_relativedelta():
    assert arr.to_pandas().tolist() == [
        None, DateOffset(months=13, days=8,
                         microseconds=(
                             datetime.timedelta(seconds=1, microseconds=1,
                                                minutes=1, hours=1) //
                             datetime.timedelta(microseconds=1)),
                         nanoseconds=0)]
with pytest.raises(ValueError):
pa.array([DateOffset(years=((1 << 32) // 12), months=100)])
@@ -2402,9 +2424,9 @@ def test_interval_array_from_dateoffset():
    expected_from_pandas = [
        None, DateOffset(months=13, days=8,
                         microseconds=(
                             datetime.timedelta(seconds=1, microseconds=1,
                                                minutes=1, hours=1) //
                             datetime.timedelta(microseconds=1)),
                         nanoseconds=1),
        DateOffset(months=0, days=0, microseconds=0, nanoseconds=0)]

@@ -2513,7 +2535,7 @@ def test_array_from_strided():
([b"ab", b"cd", b"ef"], (pa.binary(), pa.binary(2))),
([1, 2, 3], (pa.int8(), pa.int16(), pa.int32(), pa.int64())),
([1.0, 2.0, 3.0], (pa.float32(), pa.float64())),
(["ab", "cd", "ef"], (pa.utf8(), ))
(["ab", "cd", "ef"], (pa.utf8(),))
]

for values, dtypes in pydata:
@@ -2610,8 +2632,8 @@ def test_total_buffer_size():
assert a.get_total_buffer_size() == 8 * 3
assert sys.getsizeof(a) >= object.__sizeof__(a) + a.nbytes
a = pa.array([1, None, 3], type='int64')
-    assert a.nbytes == 8*3 + 1
-    assert a.get_total_buffer_size() == 8*3 + 1
+    assert a.nbytes == 8 * 3 + 1
+    assert a.get_total_buffer_size() == 8 * 3 + 1
assert sys.getsizeof(a) >= object.__sizeof__(a) + a.nbytes
a = pa.array([[1, 2], None, [3, None, 4, 5]], type=pa.list_(pa.int64()))
assert a.nbytes == 62
@@ -2721,7 +2743,6 @@ def test_list_value_parent_indices(list_type):
(pa.int32(), pa.list_(pa.int32(), list_size=2)),
(pa.int64(), pa.large_list(pa.int32()))])
def test_list_value_lengths(offset_type, list_type):

# FixedSizeListArray needs fixed list sizes
if getattr(list_type, "list_size", None):
arr = pa.array(
@@ -3066,7 +3087,7 @@ def test_numpy_binary_overflow_to_chunked():
@pytest.mark.large_memory
def test_list_child_overflow_to_chunked():
kilobyte_string = 'x' * 1024
-    two_mega = 2**21
+    two_mega = 2 ** 21

vals = [[kilobyte_string]] * (two_mega - 1)
arr = pa.array(vals)
@@ -3191,11 +3212,11 @@ def test_binary_array_masked():
assert [None] == masked_nulls.to_pylist()

# Fixed Length Binary, copy
-    npa = np.array([b'aaa', b'bbb', b'ccc']*10)
+    npa = np.array([b'aaa', b'bbb', b'ccc'] * 10)
    arrow_array = pa.array(npa, type=pa.binary(3),
-                           mask=np.array([False, False, False]*10))
+                           mask=np.array([False, False, False] * 10))
    npa[npa == b"bbb"] = b"XXX"
-    assert ([b'aaa', b'bbb', b'ccc']*10) == arrow_array.to_pylist()
+    assert ([b'aaa', b'bbb', b'ccc'] * 10) == arrow_array.to_pylist()


def test_binary_array_strided():
@@ -3245,7 +3266,6 @@ def test_array_from_large_pyints():


def test_array_protocol():

class MyArray:
def __init__(self, data):
self.data = data
