GH-34787: [Python] Accept zero_copy_only=False for ChunkedArray.to_numpy

Signed-off-by: Julien Jerphanion <git@jjerphan.xyz>
jjerphan committed May 13, 2023
1 parent cd6e2a4 commit 44087a1
Showing 2 changed files with 62 additions and 28 deletions.
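In short, ChunkedArray.to_numpy gains a zero_copy_only keyword so it matches the signature of pyarrow.Array.to_numpy and the two can be called interchangeably; because a chunked array's data is split across separate buffers, the conversion always copies, and passing True raises. A minimal sketch of the new behavior (illustrative only, not part of the commit):

import pyarrow as pa

chunked = pa.chunked_array([[1, 2, 3], [4, 5]])

# Default mode copies: no single contiguous buffer backs all chunks.
np_arr = chunked.to_numpy()

# Requesting zero-copy now raises a clear ValueError instead of a
# TypeError about an unexpected keyword argument.
chunked.to_numpy(zero_copy_only=True)  # ValueError
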
16 changes: 15 additions & 1 deletion python/pyarrow/table.pxi
@@ -471,10 +471,16 @@ cdef class ChunkedArray(_PandasConvertible):

return _array_like_to_pandas(self, options, types_mapper=types_mapper)

-    def to_numpy(self):
+    def to_numpy(self, zero_copy_only=False):
        """
        Return a NumPy copy of this array (experimental).

        Parameters
        ----------
        zero_copy_only : bool, default False
            Introduced for signature consistency with pyarrow.Array.to_numpy.
            This must be False here, since a NumPy array's buffer must be
            contiguous.

        Returns
        -------
        array : numpy.ndarray
@@ -491,6 +497,14 @@ cdef class ChunkedArray(_PandasConvertible):
PandasOptions c_options
object values

if zero_copy_only:
raise ValueError(
"zero_copy_only must be False for pyarrow.ChunkedArray.to_numpy"
)

# c_options.decode_dictionaries = False
# c_options.zero_copy_only = False

with nogil:
check_status(
ConvertChunkedArrayToPandas(
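For contrast, pyarrow.Array.to_numpy does support zero-copy: a primitive array without nulls is backed by one contiguous buffer, and its zero_copy_only parameter defaults to True. A short sketch of the asymmetry the docstring above describes (assuming the documented Array API):

import pyarrow as pa

arr = pa.array([1, 2, 3], type=pa.int64())
view = arr.to_numpy(zero_copy_only=True)  # OK: one contiguous buffer

chunked = pa.chunked_array([[1, 2], [3]])
copied = chunked.to_numpy()               # always a copy across chunks
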
74 changes: 47 additions & 27 deletions python/pyarrow/tests/test_array.py
@@ -28,6 +28,7 @@
import weakref

import numpy as np

try:
import pickle5
except ImportError:
@@ -168,6 +169,28 @@ def test_to_numpy_zero_copy():
np.testing.assert_array_equal(np_arr, expected)


def test_chunked_array_to_numpy_zero_copy():
    elements = [[2, 2, 4], [4, 5, 100]]

    chunked_arr = pa.chunked_array(elements)

    msg = "zero_copy_only must be False for pyarrow.ChunkedArray.to_numpy"

    with pytest.raises(ValueError, match=msg):
        chunked_arr.to_numpy(zero_copy_only=True)

    np_arr = chunked_arr.to_numpy()

    # Drop the only reference to the ChunkedArray and force a collection;
    # the copy returned by to_numpy() must remain valid on its own.
    chunked_arr = None
    import gc
    gc.collect()

    # Ensure base is still valid
    assert np_arr.base is not None
    expected = [2, 2, 4, 4, 5, 100]
    np.testing.assert_array_equal(np_arr, expected)
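
Note that the guard in table.pxi runs before any chunk inspection, so even a single-chunk ChunkedArray raises; a quick check (a sketch, not part of the commit) would be:

import pyarrow as pa

single = pa.chunked_array([[1, 2, 3]])  # just one chunk
try:
    single.to_numpy(zero_copy_only=True)
except ValueError as exc:
    print(exc)  # zero_copy_only must be False for pyarrow.ChunkedArray.to_numpy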


def test_to_numpy_unsupported_types():
# ARROW-2871: Some primitive types are not yet supported in to_numpy
bool_arr = pa.array([True, False, True])
@@ -517,7 +540,6 @@ def test_struct_array_slice():


def test_array_factory_invalid_type():

class MyObject:
pass

@@ -733,7 +755,7 @@ def test_struct_array_from_chunked():
@pytest.mark.parametrize("offset", (0, 1))
def test_dictionary_from_buffers(offset):
a = pa.array(["one", "two", "three", "two", "one"]).dictionary_encode()
-    b = pa.DictionaryArray.from_buffers(a.type, len(a)-offset,
+    b = pa.DictionaryArray.from_buffers(a.type, len(a) - offset,
a.indices.buffers(), a.dictionary,
offset=offset)
assert a[offset:] == b
@@ -934,16 +956,16 @@ def test_list_from_arrays(list_array_type, list_type_factory):


@pytest.mark.parametrize(('list_array_type', 'list_type_factory'), (
    (pa.ListArray, pa.list_),
    (pa.LargeListArray, pa.large_list)
))
@pytest.mark.parametrize("arr", (
    [None, [0]],
    [None, [0, None], [0]],
    [[0], [1]],
))
def test_list_array_types_from_arrays(
    list_array_type, list_type_factory, arr
):
arr = pa.array(arr, list_type_factory(pa.int8()))
reconstructed_arr = list_array_type.from_arrays(
@@ -952,8 +974,8 @@


@pytest.mark.parametrize(('list_array_type', 'list_type_factory'), (
    (pa.ListArray, pa.list_),
    (pa.LargeListArray, pa.large_list)
))
def test_list_array_types_from_arrays_fail(list_array_type, list_type_factory):
# Fail when manual offsets include nulls and mask passed
@@ -1400,7 +1422,6 @@ def test_cast_chunked_array_empty():
# ARROW-8142
for typ1, typ2 in [(pa.dictionary(pa.int8(), pa.string()), pa.string()),
(pa.int64(), pa.int32())]:

arr = pa.chunked_array([], type=typ1)
result = arr.cast(typ2)
expected = pa.chunked_array([], type=typ2)
@@ -1971,7 +1992,7 @@ def test_cast_identities(ty, values):
([[4, 5], [6]], pa.large_list(pa.int16())),
([['a'], None, ['b', 'c']], pa.list_(pa.string())),
    ([(1, 'a'), (2, 'c'), None],
     pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))
]
)

@@ -2107,6 +2128,7 @@ def _check_case(ty):
arr = pa.array(['string', np.nan], type=ty, from_pandas=True)
expected = pa.array(['string', None], type=ty)
assert arr.equals(expected)

_check_case('binary')
_check_case('utf8')

@@ -2350,9 +2372,9 @@ def test_interval_array_from_relativedelta():
    assert arr.to_pandas().tolist() == [
        None, DateOffset(months=13, days=8,
                         microseconds=(
                             datetime.timedelta(seconds=1, microseconds=1,
                                                minutes=1, hours=1) //
                             datetime.timedelta(microseconds=1)),
                         nanoseconds=0)]
with pytest.raises(ValueError):
pa.array([DateOffset(years=((1 << 32) // 12), months=100)])
@@ -2402,9 +2424,9 @@ def test_interval_array_from_dateoffset():
    expected_from_pandas = [
        None, DateOffset(months=13, days=8,
                         microseconds=(
                             datetime.timedelta(seconds=1, microseconds=1,
                                                minutes=1, hours=1) //
                             datetime.timedelta(microseconds=1)),
                         nanoseconds=1),
        DateOffset(months=0, days=0, microseconds=0, nanoseconds=0)]

@@ -2513,7 +2535,7 @@ def test_array_from_strided():
([b"ab", b"cd", b"ef"], (pa.binary(), pa.binary(2))),
([1, 2, 3], (pa.int8(), pa.int16(), pa.int32(), pa.int64())),
([1.0, 2.0, 3.0], (pa.float32(), pa.float64())),
(["ab", "cd", "ef"], (pa.utf8(), ))
(["ab", "cd", "ef"], (pa.utf8(),))
]

for values, dtypes in pydata:
@@ -2610,8 +2632,8 @@ def test_total_buffer_size():
assert a.get_total_buffer_size() == 8 * 3
assert sys.getsizeof(a) >= object.__sizeof__(a) + a.nbytes
a = pa.array([1, None, 3], type='int64')
-    assert a.nbytes == 8*3 + 1
-    assert a.get_total_buffer_size() == 8*3 + 1
+    assert a.nbytes == 8 * 3 + 1
+    assert a.get_total_buffer_size() == 8 * 3 + 1
assert sys.getsizeof(a) >= object.__sizeof__(a) + a.nbytes
a = pa.array([[1, 2], None, [3, None, 4, 5]], type=pa.list_(pa.int64()))
assert a.nbytes == 62
@@ -2721,7 +2743,6 @@ def test_list_value_parent_indices(list_type):
(pa.int32(), pa.list_(pa.int32(), list_size=2)),
(pa.int64(), pa.large_list(pa.int32()))])
def test_list_value_lengths(offset_type, list_type):

# FixedSizeListArray needs fixed list sizes
if getattr(list_type, "list_size", None):
arr = pa.array(
@@ -3066,7 +3087,7 @@ def test_numpy_binary_overflow_to_chunked():
@pytest.mark.large_memory
def test_list_child_overflow_to_chunked():
kilobyte_string = 'x' * 1024
-    two_mega = 2**21
+    two_mega = 2 ** 21

vals = [[kilobyte_string]] * (two_mega - 1)
arr = pa.array(vals)
@@ -3191,11 +3212,11 @@ def test_binary_array_masked():
assert [None] == masked_nulls.to_pylist()

# Fixed Length Binary, copy
-    npa = np.array([b'aaa', b'bbb', b'ccc']*10)
+    npa = np.array([b'aaa', b'bbb', b'ccc'] * 10)
    arrow_array = pa.array(npa, type=pa.binary(3),
-                           mask=np.array([False, False, False]*10))
+                           mask=np.array([False, False, False] * 10))
    npa[npa == b"bbb"] = b"XXX"
-    assert ([b'aaa', b'bbb', b'ccc']*10) == arrow_array.to_pylist()
+    assert ([b'aaa', b'bbb', b'ccc'] * 10) == arrow_array.to_pylist()


def test_binary_array_strided():
@@ -3245,7 +3266,6 @@ def test_array_from_large_pyints():


def test_array_protocol():

class MyArray:
def __init__(self, data):
self.data = data
