From 6c313752fdf835d27383cc403111e06ed504e210 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 19 Nov 2018 13:36:03 -0500 Subject: [PATCH 01/13] Include config.pxi with some needed constants Defines some things as preprocessor constants to ensure Cython/C take these into account as part of compilation. The intent being to avoid unneeded branches of code. --- .gitignore | 1 + setup.py | 5 +++++ src/cybuffer.pyx | 1 + 3 files changed, 7 insertions(+) diff --git a/.gitignore b/.gitignore index 50b51ab..fed9036 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ __pycache__/ # C extensions *.so src/cybuffer.c +src/config.pxi src/version.pxi # Distribution / packaging diff --git a/setup.py b/setup.py index 3887ab4..963c837 100644 --- a/setup.py +++ b/setup.py @@ -62,6 +62,11 @@ def run_tests(self): any([v.startswith("install") for v in sys.argv])): setup_requirements = [] else: + with open("src/config.pxi", "w") as f: + f.writelines([ + "DEF PY2K = " + str(sys.version_info.major == 2) + "\n", + "DEF PY3K = " + str(sys.version_info.major == 3) + "\n" + ]) with open("src/version.pxi", "w") as f: f.writelines([ "__version__ = " + "\"" + str(version) + "\"" diff --git a/src/cybuffer.pyx b/src/cybuffer.pyx index 6fa8306..17f65fa 100644 --- a/src/cybuffer.pyx +++ b/src/cybuffer.pyx @@ -1,3 +1,4 @@ +include "config.pxi" cimport cybuffer From 9083f75bc8d53901c6771f49968f42c4ae1884af Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 19 Nov 2018 13:36:04 -0500 Subject: [PATCH 02/13] Create `cybuffer` extension type Implements the `cybuffer` extension type. This supports Python's old buffer protocol (Python 2 only) and the (new) buffer protocol (Python 2+). The value being that this can be comfortably passed in anywhere either buffer protocol is needed. As some Python 2 functions (e.g. compressors, hashes, etc.) only support the old buffer protocol, this should work nicely with them. 
In other cases a Python 2 function may work well with either protocol or have a slight preference for one or the other. As this supports both buffer protocols, it will coerce easily to whichever one works best for the algorithm. In Python 3 only the (new) buffer protocol is supported, which this works fine with as well. This works by trying to coerce the input data provided into the (new) buffer protocol falling back to the old one when nothing else works. In either case, it takes a `memoryview` of the resulting coercion. This standardizes the view of the data between the two interfaces. Also provides some information that is needed for supporting the (new) buffer protocol. Of course this information can easily be repurposed to support the old buffer protocol. As the `memoryview` manages all underlying allocated buffers, there is no need to do any special handling when our object scopes out as the `memoryview` will clean that up on the way out. Generally when exposing the data to the (new) buffer protocol, we just copy over the `Py_buffer` object held by the `memoryview` for the consumer. Though we do replace `obj` to point to our `self` instead to ensure our object doesn't scope out. Otherwise our object and the `memoryview` could get deallocated resulting in segfaults and other unpleasant outcomes. However sometimes we have to do a little more to make the buffer more user friendly. For instance if a builtin `array` object is provided as data, we have to handle some peculiarities of it. In particular the builtin `array` supports a `unicode` type (i.e. `Py_UNICODE`), which has been a bit broken and doesn't necessarily work well with other (new) buffer protocol consumers (e.g. NumPy). Specifically the `unicode` type may be `Py_UCS4` or `Py_UCS2` depending on how Python was built. The former is supported by NumPy, but the latter is not. While this type has been deprecated by Python, it still exists in Python 2 and 3 (though will likely be pulled out in Python 4). 
So we try to support it as best we can by using the same strategy `memoryview`s already use with `bytes`. Namely casting it to an unsigned integer type. Thus we cast `Py_UCS2` to `uint16_t` and `Py_UCS4` to `uint32_t`. These can easily be handled by NumPy or any other consumer of `memoryview`s. Also it's consistent with `cybuffer` being a data container as opposed to some way to manipulate text encodings. On Python 2 we have the added problem that builtin `array`s do not support the (new) buffer protocol. However creating the `memoryview` through the old buffer protocol results in a loss of type information as it represents everything as unsigned bytes. To fix this, we patch the copied buffer to include information about the type of `array` we were provided. Specifically this means we update the `format` string, `itemsize`, `shape`, and `strides` set in the `Py_buffer` for the consumer. This patching is relatively fast as most of the operations happen at the C level with very minimal usage of Python objects or the Python API. As a result, `array` types can be passed into `cybuffer` and be used on Python 2/3 as if they supported the (new) buffer protocol on both platforms. As all information is available from the `memoryview`, there really is no issue filling out that information into the user provided buffer. It's just a matter of making sure we don't accidentally write out components that were not requested. This would be particularly bad for some pointers that might not be allocated. So we check the flags to see which pieces should be written out and only write those parts. The rest are set to NULL. Note this has no effect on the non-pointer types, which we write out anyways. All of those are required regardless. Also we have no internal content that needs to be kept around. So that field is set to NULL as well. 
When it comes to the other requirements like in what ways the data needs to be contiguous and whether it is writable, we have no real control over that as we are not maintaining the buffer ourselves. So instead we simply check to see if the data we have meets the user's requests. If it does, then we proceed to filling out the `Py_buffer` object. However if any of these doesn't match their expectations, we raise a `BufferError` as the (new) buffer protocol specifies. This may trigger copies downstream depending on what exactly was requested and how downstream buffers want to handle it. Exposing the old buffer protocol involves using the legacy Cython dunder methods. These pretty much just expose the `memoryview`'s buffer to the consumer after some basic checks. All of them require that the array is contiguous. If that isn't upheld, they raise. While it may be possible to handle some of the more advanced slicing cases using multiple segments, this becomes rather complex and raises various questions that need to be answered. For what it is worth NumPy seems to agree with this sentiment as they did not implement them either. More accurately they have a comment that states it was removed, which appears to predate SciPy! In fact it is likely the weakness of the old buffer interface in this regard that gave way to the new. In order to not only provide a reasonably transportable container of data for the different buffer protocols, but also to provide some basic level of functionality, expose some of the internal state as `readonly` attributes for others to inspect the metadata. Where attributes do not suffice, provide small methods exposed as properties to coerce C types into things familiar to Python programmers (e.g. C pointer of shape to a Python tuple). To allow typical methods to work more nicely, provide a few dunder methods for use with Python to access things like length. These all line up with their names on the `memoryview` interface of Python 2 and Python 3. 
Though this should expose a few more than what Python 2's `memoryview` has out-of-the-box. As some cases need another simple Python native representation of the binary data, implement the `tobytes` method. As our type patching does not affect the underlying buffer, we simply rely on our `memoryview` of the data type unpatched so as to get access to its `Py_buffer` for use with the (new) buffer protocol. Then we allocate a `bytes` object of the right size that is otherwise empty and get access to its underlying C representation as a `char*` so that we can fill it. Using `PyBuffer_ToContiguous`, we perform a direct copy from the `memoryview`'s buffer into the `bytes` object, returning the filled `bytes` object to the user. The buffer protocol is able to handle any striding we throw at it without generating intermediates making this very fast. As the whole thing uses the C API, there is very little overhead from CPython other than the usual reference counting. This ends up happening at a comparable speed to NumPy's own `tobytes`, which may in fact use the same mechanism. 
--- src/cybuffer.pxd | 15 +++ src/cybuffer.pyx | 300 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 315 insertions(+) diff --git a/src/cybuffer.pxd b/src/cybuffer.pxd index 8b13789..970cec4 100644 --- a/src/cybuffer.pxd +++ b/src/cybuffer.pxd @@ -1 +1,16 @@ +cdef class cybuffer(object): + cdef readonly object obj + cdef Py_buffer _buf + + cdef char* _format + cdef readonly Py_ssize_t itemsize + + cdef Py_ssize_t* _shape + cdef Py_ssize_t* _strides + + cdef readonly bint c_contiguous + cdef readonly bint f_contiguous + cdef readonly bint contiguous + + cpdef bytes tobytes(self) diff --git a/src/cybuffer.pyx b/src/cybuffer.pyx index 17f65fa..b1d01e1 100644 --- a/src/cybuffer.pyx +++ b/src/cybuffer.pyx @@ -2,4 +2,304 @@ include "config.pxi" cimport cybuffer +cimport cython + +cimport cpython.buffer +cimport cpython.bytes +cimport cpython.mem +cimport cpython.oldbuffer +cimport cpython.tuple + +from cpython.array cimport array +from cpython.buffer cimport Py_buffer +from cpython.buffer cimport ( + PyBUF_FORMAT, PyBUF_WRITABLE, + PyBUF_ND, PyBUF_STRIDES, PyBUF_INDIRECT, + PyBUF_C_CONTIGUOUS, PyBUF_F_CONTIGUOUS, PyBUF_ANY_CONTIGUOUS, + PyBUF_FULL_RO +) + +from array import array + include "version.pxi" + + +cdef extern from "Python.h": + size_t Py_UNICODE_SIZE + + object PyMemoryView_FromObject(object obj) + + +cdef extern from *: + """ + #define UBYTE_TC "B" + #define UCS2_TC "H" + #define UCS4_TC "I" + + #define PyTuple_SET_ITEM_INC(l, i, o) \ + Py_INCREF(o); PyTuple_SET_ITEM(l, i, o) + """ + + char* UBYTE_TC + char* UCS2_TC + char* UCS4_TC + + void PyTuple_SET_ITEM_INC(object, Py_ssize_t, object) + + +cdef tuple pointer_to_tuple(int n, Py_ssize_t* p): + cdef int i + cdef object p_i + cdef tuple result + + result = cpython.tuple.PyTuple_New(n) + for i in range(n): + p_i = long(p[i]) + PyTuple_SET_ITEM_INC(result, i, p_i) + + return result + + +cdef class cybuffer(object): + """ + Constructs a ``memoryview`` from the buffer exposed by ``data`` + + 
Attempts to use the (new) buffer interface. Falls back to the + old buffer interface on Python 2 if that does not work. Smooths + over some type handling issues of builtin types as needed. + """ + + + @cython.cdivision(True) + def __cinit__(self, data): + """ + Take a memoryview of the data and hold onto it. + """ + + self.obj = data + + cdef object data_buf + if cpython.buffer.PyObject_CheckBuffer(data): + data_buf = data + elif PY2K: + try: + data_buf = cpython.oldbuffer.PyBuffer_FromReadWriteObject( + data, 0, -1 + ) + except TypeError: + data_buf = cpython.oldbuffer.PyBuffer_FromObject(data, 0, -1) + else: + raise TypeError("Unable to get buffer protocol API for `data`.") + + # Create a buffer based on memoryview + data_buf = PyMemoryView_FromObject(data_buf) + cpython.buffer.PyObject_GetBuffer(data_buf, &self._buf, PyBUF_FULL_RO) + + # Allocate and/or initialize metadata for casting + self._format = self._buf.format + self.itemsize = self._buf.itemsize + self._shape = self._buf.shape + self._strides = self._buf.strides + + # Figure out whether the memoryview is contiguous + self.c_contiguous = cpython.buffer.PyBuffer_IsContiguous( + &self._buf, b'C' + ) + self.f_contiguous = cpython.buffer.PyBuffer_IsContiguous( + &self._buf, b'F' + ) + self.contiguous = self.c_contiguous or self.f_contiguous + + # Workaround some special cases with the builtin array + cdef size_t len_nd_b + cdef int n_1 + if isinstance(data, array): + # Fix-up typecode + typecode = data.typecode + if typecode == "B": + return + elif PY2K and typecode == "c": + self._format = UBYTE_TC + return + elif (PY2K or PY3K) and typecode == "u": + if Py_UNICODE_SIZE == 2: + self._format = UCS2_TC + elif Py_UNICODE_SIZE == 4: + self._format = UCS4_TC + elif PY2K: + self._format = typecode + + # Adjust itemsize, shape, and strides based on casting + if PY2K: + self.itemsize = data.itemsize + + len_nd_b = self._buf.ndim * sizeof(Py_ssize_t) + self._shape = cpython.mem.PyMem_Malloc(len_nd_b) + self._strides 
= cpython.mem.PyMem_Malloc(len_nd_b) + + n_1 = self._buf.ndim - 1 + self._shape[n_1] = self._buf.shape[n_1] // self.itemsize + self._strides[n_1] = self._buf.strides[n_1] * self.itemsize + + + def __dealloc__(self): + if self._shape != self._buf.shape: + cpython.mem.PyMem_Free(self._shape) + if self._strides != self._buf.strides: + cpython.mem.PyMem_Free(self._strides) + + cpython.buffer.PyBuffer_Release(&self._buf) + + self._format = NULL + self._shape = NULL + self._strides = NULL + + + @property + def readonly(self): + return self._buf.readonly + + + @property + def format(self): + cdef bytes _format = self._format + + if PY2K: + return _format + else: + return _format.decode("ascii") + + + @property + def ndim(self): + return self._buf.ndim + + + @property + def nbytes(self): + return self._buf.len + + + @property + def shape(self): + return pointer_to_tuple(self._buf.ndim, self._shape) + + + @property + def strides(self): + return pointer_to_tuple(self._buf.ndim, self._strides) + + + @property + def suboffsets(self): + cdef tuple r + if self._buf.suboffsets is NULL: + r = tuple() + else: + r = pointer_to_tuple(self._buf.ndim, self._buf.suboffsets) + return r + + + def __len__(self): + return self._shape[0] + + + cpdef bytes tobytes(self): + cdef bytes r + cdef char* s + + r = cpython.bytes.PyBytes_FromStringAndSize(NULL, self._buf.len) + s = cpython.bytes.PyBytes_AS_STRING(r) + + cpython.buffer.PyBuffer_ToContiguous( + s, &self._buf, self._buf.len, b'C' + ) + + return r + + + def __getbuffer__(self, Py_buffer* buf, int flags): + if (flags & PyBUF_ANY_CONTIGUOUS) == PyBUF_ANY_CONTIGUOUS: + if not self.contiguous: + raise BufferError("data is not contiguous") + if (flags & PyBUF_C_CONTIGUOUS) == PyBUF_C_CONTIGUOUS: + if not self.c_contiguous: + raise BufferError("data is not C contiguous") + if (flags & PyBUF_F_CONTIGUOUS) == PyBUF_F_CONTIGUOUS: + if not self.f_contiguous: + raise BufferError("data is not F contiguous") + + if (flags & PyBUF_WRITABLE) == 
PyBUF_WRITABLE: + if self._buf.readonly: + raise BufferError("data is readonly") + + buf.buf = self._buf.buf + buf.obj = self + buf.len = self._buf.len + buf.readonly = self._buf.readonly + buf.itemsize = self.itemsize + buf.ndim = self._buf.ndim + buf.internal = NULL + + if (flags & PyBUF_FORMAT) == PyBUF_FORMAT: + buf.format = self._format + else: + buf.format = NULL + + if (flags & PyBUF_ND) == PyBUF_ND: + buf.shape = self._shape + else: + buf.shape = NULL + + if (flags & PyBUF_STRIDES) == PyBUF_STRIDES: + buf.strides = self._strides + else: + buf.strides = NULL + + if (flags & PyBUF_INDIRECT) == PyBUF_INDIRECT: + buf.suboffsets = self._buf.suboffsets + else: + buf.suboffsets = NULL + + + def __releasebuffer__(self, Py_buffer* buf): + pass + + + def __getreadbuffer__(self, Py_ssize_t i, void** p): + if i != 0: + raise ValueError("Accessing non-existent segment") + if not self.contiguous: + raise ValueError("Data is not contiguous") + + p[0] = self._buf.buf + + return self._buf.len + + + def __getwritebuffer__(self, Py_ssize_t i, void** p): + if i != 0: + raise ValueError("Accessing non-existent segment") + if not self.contiguous: + raise ValueError("Data is not contiguous") + if self._buf.readonly: + raise TypeError("Buffer is read-only") + + p[0] = self._buf.buf + + return self._buf.len + + + def __getsegcount__(self, Py_ssize_t* p): + return 1 + + + def __getcharbuffer__(self, Py_ssize_t i, char** p): + if i != 0: + raise ValueError("Accessing non-existent segment") + if not self.contiguous: + raise ValueError("Data is not contiguous") + + p[0] = self._buf.buf + + return self._buf.len From 77e04e2204297997f58e8a25318eb797bea6aa11 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 19 Nov 2018 13:36:05 -0500 Subject: [PATCH 03/13] Use Cython `memoryview` to implement get/set item Adds a little bit of code to leverage Cython's `memoryview` objects to enable indexing the `cybuffer` object to get a new one on a restricted view or to set values in it. 
In both cases, leverage the buffer protocol to get a Cython `memoryview` onto our data. Thus it applies our castings as well. Cython's `memoryview` is pretty capable as it allows us to make subselections onto the data. In get item, construct a new `cybuffer` around Cython's `memoryview` when returning these subselections. Thus users just see new `cybuffer`s and the Cython `memoryview` is an implementation detail. In set item, just create a Cython `memoryview` of our data and use their set item to assign the value. Should handle the casting we apply nicely. --- src/cybuffer.pyx | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/cybuffer.pyx b/src/cybuffer.pyx index b1d01e1..e04e1c9 100644 --- a/src/cybuffer.pyx +++ b/src/cybuffer.pyx @@ -3,6 +3,9 @@ include "config.pxi" cimport cybuffer cimport cython +cimport cython.view + +from cython.view cimport memoryview as cvmemoryview cimport cpython.buffer cimport cpython.bytes @@ -204,6 +207,22 @@ cdef class cybuffer(object): return self._shape[0] + def __getitem__(self, key): + cdef object r + cdef cvmemoryview mv = cvmemoryview(self, PyBUF_FULL_RO) + + r = mv[key] + if isinstance(r, cvmemoryview): + r = cybuffer(r) + + return r + + + def __setitem__(self, key, value): + cdef cvmemoryview mv = cvmemoryview(self, PyBUF_FULL_RO) + mv[key] = value + + cpdef bytes tobytes(self): cdef bytes r cdef char* s From 67ec14faa64b34d8a976749e10a7648886d8bcf6 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 19 Nov 2018 13:36:06 -0500 Subject: [PATCH 04/13] Implement tolist Adds an implementation of `tolist` for `cybuffer`. Uses a recursive helper function in basically straight C to build lists of values, which are then reinserted into other enclosing lists. Makes use of Cython's `memoryview` instance to handle the get item calls as we recurse down. Preallocates all lists with `NULL` values and sets them with actual objects using Python's C API to make the list construction faster. 
Requires use of increment reference calls to handle reference stealing in the context of Cython's automatic reference counting. This is slower than NumPy's implementation. That said, building large Python lists of data is probably the wrong way to go. Should be fine for getting a rough idea of what's in the memory shared with `cybuffer` without resorting to other libraries. --- src/cybuffer.pxd | 1 + src/cybuffer.pyx | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/src/cybuffer.pxd b/src/cybuffer.pxd index 970cec4..cf68ce5 100644 --- a/src/cybuffer.pxd +++ b/src/cybuffer.pxd @@ -14,3 +14,4 @@ cdef class cybuffer(object): cdef readonly bint contiguous cpdef bytes tobytes(self) + cpdef list tolist(self) diff --git a/src/cybuffer.pyx b/src/cybuffer.pyx index e04e1c9..7dbb48c 100644 --- a/src/cybuffer.pyx +++ b/src/cybuffer.pyx @@ -9,6 +9,7 @@ from cython.view cimport memoryview as cvmemoryview cimport cpython.buffer cimport cpython.bytes +cimport cpython.list cimport cpython.mem cimport cpython.oldbuffer cimport cpython.tuple @@ -39,6 +40,8 @@ cdef extern from *: #define UCS2_TC "H" #define UCS4_TC "I" + #define PyList_SET_ITEM_INC(l, i, o) \ + Py_INCREF(o); PyList_SET_ITEM(l, i, o) #define PyTuple_SET_ITEM_INC(l, i, o) \ Py_INCREF(o); PyTuple_SET_ITEM(l, i, o) """ @@ -47,6 +50,7 @@ cdef extern from *: char* UCS2_TC char* UCS4_TC + void PyList_SET_ITEM_INC(object, Py_ssize_t, object) void PyTuple_SET_ITEM_INC(object, Py_ssize_t, object) @@ -63,6 +67,32 @@ cdef tuple pointer_to_tuple(int n, Py_ssize_t* p): return result +@cython.boundscheck(False) +@cython.infer_types(True) +@cython.initializedcheck(False) +@cython.nonecheck(False) +@cython.wraparound(False) +cdef list cvmemoryview_to_list(cvmemoryview mv): + cdef list r + cdef cvmemoryview mv_i + cdef object r_i + cdef Py_ssize_t i, n + + n = mv.view.shape[0] + r = cpython.list.PyList_New(n) + if mv.view.ndim > 1: + for i in range(n): + mv_i = mv[i] + r_i = 
cvmemoryview_to_list(mv_i) + PyList_SET_ITEM_INC(r, i, r_i) + else: + for i in range(n): + r_i = mv[i] + PyList_SET_ITEM_INC(r, i, r_i) + + return r + + cdef class cybuffer(object): """ Constructs a ``memoryview`` from the buffer exposed by ``data`` @@ -237,6 +267,16 @@ cdef class cybuffer(object): return r + cpdef list tolist(self): + cdef list r + cdef cvmemoryview mv + + mv = cvmemoryview(self, PyBUF_FULL_RO) + r = cvmemoryview_to_list(mv) + + return r + + def __getbuffer__(self, Py_buffer* buf, int flags): if (flags & PyBUF_ANY_CONTIGUOUS) == PyBUF_ANY_CONTIGUOUS: if not self.contiguous: From 5620cc09061c85a38d91eb3447f32a902c411416 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 19 Nov 2018 13:36:07 -0500 Subject: [PATCH 05/13] Add some tests of cybuffer Makes sure that `cybuffer` is able to handle some common cases equivalently on Python 2/3. Compare against Python 2/3's `memoryview` to make sure that the various features work equivalently. In cases where Python 2 lacks the added features or deviates, check that it matches Python 3's expected behavior on Python 2. 
--- tests/test_cybuffer.py | 172 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 167 insertions(+), 5 deletions(-) diff --git a/tests/test_cybuffer.py b/tests/test_cybuffer.py index 13bd1f3..bdde4c6 100644 --- a/tests/test_cybuffer.py +++ b/tests/test_cybuffer.py @@ -3,11 +3,173 @@ from __future__ import absolute_import +import array +import contextlib +import mmap +import sys + import pytest +from cybuffer import cybuffer + + +try: + buffer +except NameError: + buffer = memoryview + + +def test_empty_constructor(): + with pytest.raises(TypeError): + b = cybuffer() + + +def validate_against_memoryview(v, b, m, suboffsets=tuple()): + # Test view properties' data relationships + assert b.obj is v + assert b.nbytes == len(m.tobytes()) + assert b.itemsize == (len(m.tobytes()) // len(v)) + assert b.ndim == m.ndim + assert b.suboffsets == suboffsets + assert b.shape == (len(v),) + assert b.strides == (len(m.tobytes()) // len(v),) + + # Test Python 3+ properties + if sys.version_info.major > 2: + assert b.obj is m.obj + assert b.c_contiguous == m.c_contiguous + assert b.f_contiguous == m.f_contiguous + assert b.contiguous == m.contiguous + assert b.nbytes == m.nbytes + + # Test methods + assert b.tobytes() == m.tobytes() + + +@pytest.mark.parametrize("v", [ + b"abcdefghi", + bytearray(b"abcdefghi"), +]) +def test_bytes(v): + # Initialize buffers + b = cybuffer(v) + m = memoryview(v) + + # Validate format + assert b.format == m.format + assert b.itemsize == m.itemsize + + # Validate contiguity + assert b.c_contiguous + assert b.f_contiguous + assert b.contiguous + + # Validate permissions + assert b.readonly == m.readonly + + # Test methods + assert b.tolist() == m.tolist() + + validate_against_memoryview(v, b, m) + + +@pytest.mark.parametrize("f", + ["b", "B", "h", "H", "i", "I", "l", "L", "q", "Q", "f", "d"] +) +def test_1d_arrays(f): + # Skip some newer types + if sys.version_info.major < 3 and f in "qQ": + pytest.skip("Format `%s` not available on Python 2" % 
f) + + # Initialize buffers + v = array.array(f, [0, 1, 2, 3, 4]) + b = cybuffer(v) + m = memoryview(buffer(v)) + + # Validate format + assert b.format == v.typecode + assert b.itemsize == v.itemsize + + # Validate contiguity + assert b.c_contiguous + assert b.f_contiguous + assert b.contiguous + + # Validate permissions + if isinstance(b, memoryview): + assert b.readonly + else: + assert not b.readonly + + # Test methods + assert b.tolist() == v.tolist() + + validate_against_memoryview(v, b, m) + + +@pytest.mark.parametrize("f, s", [ + ("c", b"Hello World!"), + ("u", u"Hello World!"), +]) +def test_1d_text_arrays(f, s): + # Skip some newer types + if sys.version_info.major > 2 and f is "c": + pytest.skip("Format `%s` not available on Python 3" % f) + + # Initialize buffers + v = array.array(f, s) + b = cybuffer(v) + m = memoryview(buffer(v)) + + # Validate format + assert b.itemsize == v.itemsize + if f is "u" and sys.maxunicode < 65536: + assert b.format == "H" + elif f is "u" and sys.maxunicode >= 65536: + assert b.format == "I" + elif f is "c": + assert b.format == "B" + + # Validate contiguity + assert b.c_contiguous + assert b.f_contiguous + assert b.contiguous + + # Validate permissions + if isinstance(b, memoryview): + assert b.readonly + else: + assert not b.readonly + + # Test methods + assert b.tolist() == list(map(ord, v)) + + validate_against_memoryview(v, b, m) + + +def test_mmap(): + with contextlib.closing(mmap.mmap(-1, 10, prot=mmap.PROT_WRITE)) as v: + # Initialize buffers + b = cybuffer(v) + m = memoryview(buffer(v)) + + # Validate format + assert b.format == m.format + assert b.itemsize == m.itemsize + + # Validate contiguity + assert b.c_contiguous + assert b.f_contiguous + assert b.contiguous + + # Validate permissions + assert not b.readonly + + # Test methods + assert b.tolist() == m.tolist() + + validate_against_memoryview(v, b, m) -def test_import_toplevel(): - try: - import cybuffer - except ImportError: - pytest.fail("Unable to import 
`cybuffer`.") + # Cleanup to close memory + del b + del m From a32cdbd1e7d8f06e0368267cb9c9666524eb14c5 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 19 Nov 2018 13:36:08 -0500 Subject: [PATCH 06/13] Make NumPy a test requirement To test some N-D cases, include NumPy as a test dependency. It is not needed at runtime however as this relies only on Python and Cython generated C code that is baked into the binary. --- environment_ci.yml | 1 + setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/environment_ci.yml b/environment_ci.yml index 33bed30..ff83064 100644 --- a/environment_ci.yml +++ b/environment_ci.yml @@ -9,3 +9,4 @@ dependencies: - coverage==4.5.1 - pytest==3.8.1 - cython==0.28.5 + - numpy==1.15.2 diff --git a/setup.py b/setup.py index 963c837..42592e4 100644 --- a/setup.py +++ b/setup.py @@ -48,6 +48,7 @@ def run_tests(self): ] test_requirements = [ + "numpy", "pytest", ] From 9ad76b125b90c8ebca1e4281fe2dde4ff86403f7 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 19 Nov 2018 13:36:09 -0500 Subject: [PATCH 07/13] Test a few NumPy arrays Try generating buffers for a few NumPy arrays of different shapes and different orders. Make sure that the buffer matches what NumPy finds in these cases generally. 
--- tests/test_cybuffer.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/test_cybuffer.py b/tests/test_cybuffer.py index bdde4c6..8ff3e12 100644 --- a/tests/test_cybuffer.py +++ b/tests/test_cybuffer.py @@ -8,6 +8,7 @@ import mmap import sys +import numpy import pytest from cybuffer import cybuffer @@ -173,3 +174,41 @@ def test_mmap(): # Cleanup to close memory del b del m + + +@pytest.mark.parametrize("s", + [(10,), (10, 11), (10, 11, 12)] +) +@pytest.mark.parametrize("o", + ["C", "F"] +) +def test_nd_numpy_arrays(s, o): + # Initialize buffers + numpy.random.seed(42) + a = numpy.random.random(s).astype(float, order=o) + b = cybuffer(a) + + # Validate identity + assert b.obj is a + + # Validate shape, size, etc. + assert b.nbytes == a.nbytes + assert b.ndim == a.ndim + assert b.suboffsets == tuple() + assert b.shape == a.shape + assert b.strides == a.strides + + # Validate format + assert b.format == a.dtype.char + assert b.itemsize == a.itemsize + + # Validate contiguity + assert b.c_contiguous == a.flags.c_contiguous + assert b.f_contiguous == a.flags.f_contiguous + assert b.contiguous == (a.flags.c_contiguous or a.flags.f_contiguous) + + # Validate permissions + assert b.readonly != a.flags.writeable + + # Test methods + assert b.tolist() == a.tolist() From 9095d6e29dc297d5b538810b82f166c21431144e Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 19 Nov 2018 13:36:09 -0500 Subject: [PATCH 08/13] Skip memory deallocation if not Python 2 This memory is only allocated on Python 2 in the special case where a builtin `array` is used and its type is not unsigned char or mappable to it (e.g. `"c"`). As such, we can avoid these checks entirely if we are not running on Python 2. So add a check for Python 2. This will ensure this code block is dropped on Python 3. 
--- src/cybuffer.pyx | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/cybuffer.pyx b/src/cybuffer.pyx index 7dbb48c..db1be0d 100644 --- a/src/cybuffer.pyx +++ b/src/cybuffer.pyx @@ -176,10 +176,11 @@ cdef class cybuffer(object): def __dealloc__(self): - if self._shape != self._buf.shape: - cpython.mem.PyMem_Free(self._shape) - if self._strides != self._buf.strides: - cpython.mem.PyMem_Free(self._strides) + if PY2K: + if self._shape != self._buf.shape: + cpython.mem.PyMem_Free(self._shape) + if self._strides != self._buf.strides: + cpython.mem.PyMem_Free(self._strides) cpython.buffer.PyBuffer_Release(&self._buf) From 75cf5084d5639611da30344492b072526b63a799 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 19 Nov 2018 13:36:11 -0500 Subject: [PATCH 09/13] Fix memory map test for Windows On Windows there is no `PROT_WRITE`, only `ACCESS_WRITE`. So update the test to use the latter. This also works on Unix platforms. --- tests/test_cybuffer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_cybuffer.py b/tests/test_cybuffer.py index 8ff3e12..b12a558 100644 --- a/tests/test_cybuffer.py +++ b/tests/test_cybuffer.py @@ -149,7 +149,7 @@ def test_1d_text_arrays(f, s): def test_mmap(): - with contextlib.closing(mmap.mmap(-1, 10, prot=mmap.PROT_WRITE)) as v: + with contextlib.closing(mmap.mmap(-1, 10, access=mmap.ACCESS_WRITE)) as v: # Initialize buffers b = cybuffer(v) m = memoryview(buffer(v)) From 92030e12859bde1adfc5de7be02e09c81be94a37 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 19 Nov 2018 14:37:05 -0500 Subject: [PATCH 10/13] Determine `Py_UNICODE_SIZE` from `array` Apparently as of Python 3.3, `sys.maxunicode` cannot be relied on to determine `Py_UNICODE_SIZE`. So instead determine this information using the `array` object directly. 
ref: https://www.python.org/dev/peps/pep-0393/ ref: https://bugs.python.org/issue13054 --- tests/test_cybuffer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_cybuffer.py b/tests/test_cybuffer.py index b12a558..5d1da29 100644 --- a/tests/test_cybuffer.py +++ b/tests/test_cybuffer.py @@ -20,6 +20,9 @@ buffer = memoryview +Py_UNICODE_SIZE = array.array('u').itemsize + + def test_empty_constructor(): with pytest.raises(TypeError): b = cybuffer() @@ -124,9 +127,9 @@ def test_1d_text_arrays(f, s): # Validate format assert b.itemsize == v.itemsize - if f is "u" and sys.maxunicode < 65536: + if f is "u" and Py_UNICODE_SIZE == 2: assert b.format == "H" - elif f is "u" and sys.maxunicode >= 65536: + elif f is "u" and Py_UNICODE_SIZE == 4: assert b.format == "I" elif f is "c": assert b.format == "B" From 4a3cc6215a3168312f00ff90b2d4c541f86005e6 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 19 Nov 2018 15:35:48 -0500 Subject: [PATCH 11/13] Add `hex` method to `cybuffer` Provides a Cython/Python method to get the hex representation of the data in a string (bytes or unicode on Python 2 or 3 respectively) much like the method of the Python 3 `memoryview`. Performance is very close to similar strategies with a `bytes` like object or `memoryview` respectively on either Python version using equivalent strategies. Also test this method against Python 3's `memoryview`'s `hex` method. Fallback to `binascii.hexlify` on Python 2. 
--- src/cybuffer.pxd | 1 + src/cybuffer.pyx | 13 +++++++++++++ tests/test_cybuffer.py | 5 +++++ 3 files changed, 19 insertions(+) diff --git a/src/cybuffer.pxd b/src/cybuffer.pxd index cf68ce5..acd4c6b 100644 --- a/src/cybuffer.pxd +++ b/src/cybuffer.pxd @@ -13,5 +13,6 @@ cdef class cybuffer(object): cdef readonly bint f_contiguous cdef readonly bint contiguous + cpdef str hex(self) cpdef bytes tobytes(self) cpdef list tolist(self) diff --git a/src/cybuffer.pyx b/src/cybuffer.pyx index db1be0d..d7bebd6 100644 --- a/src/cybuffer.pyx +++ b/src/cybuffer.pyx @@ -25,6 +25,9 @@ from cpython.buffer cimport ( from array import array +IF PY2K: + import binascii + include "version.pxi" @@ -254,6 +257,16 @@ cdef class cybuffer(object): mv[key] = value + cpdef str hex(self): + cdef str s + if PY2K: + s = binascii.hexlify(self.tobytes()) + else: + s = self.tobytes().hex() + + return s + + cpdef bytes tobytes(self): cdef bytes r cdef char* s diff --git a/tests/test_cybuffer.py b/tests/test_cybuffer.py index 5d1da29..7de5fdd 100644 --- a/tests/test_cybuffer.py +++ b/tests/test_cybuffer.py @@ -4,6 +4,7 @@ from __future__ import absolute_import import array +import binascii import contextlib import mmap import sys @@ -48,6 +49,10 @@ def validate_against_memoryview(v, b, m, suboffsets=tuple()): # Test methods assert b.tobytes() == m.tobytes() + if sys.version_info.major > 2: + assert b.hex() == m.hex() + else: + assert b.hex() == binascii.hexlify(m) @pytest.mark.parametrize("v", [ From d3cb3379c4bdac0b75fcb6d9b4ed4156ec1b5097 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 19 Nov 2018 22:34:31 -0500 Subject: [PATCH 12/13] Rename `Py_ssize_t` dimension length to `l` --- src/cybuffer.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/cybuffer.pyx b/src/cybuffer.pyx index d7bebd6..2fc8830 100644 --- a/src/cybuffer.pyx +++ b/src/cybuffer.pyx @@ -79,17 +79,17 @@ cdef list cvmemoryview_to_list(cvmemoryview mv): cdef list r cdef cvmemoryview mv_i 
cdef object r_i - cdef Py_ssize_t i, n + cdef Py_ssize_t i, l - n = mv.view.shape[0] - r = cpython.list.PyList_New(n) + l = mv.view.shape[0] + r = cpython.list.PyList_New(l) if mv.view.ndim > 1: - for i in range(n): + for i in range(l): mv_i = mv[i] r_i = cvmemoryview_to_list(mv_i) PyList_SET_ITEM_INC(r, i, r_i) else: - for i in range(n): + for i in range(l): r_i = mv[i] PyList_SET_ITEM_INC(r, i, r_i) From e8b2a031b1b1b35104697c877b4dcfa492925202 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 19 Nov 2018 22:34:32 -0500 Subject: [PATCH 13/13] Optimize `cybuffer`'s method `tolist` Makes some improvements to `tolist` by rewriting the underlying utility function to operate directly on the raw pointer to the data instead of using Cython's `memoryview` object. Overall this speeds up `tolist` 4-fold. It also removes the reliance on Cython's `memoryview` type for the `tolist` method, which saves a fair bit of overhead. --- src/cybuffer.pyx | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/cybuffer.pyx b/src/cybuffer.pyx index 2fc8830..6f9dbeb 100644 --- a/src/cybuffer.pyx +++ b/src/cybuffer.pyx @@ -24,6 +24,7 @@ from cpython.buffer cimport ( ) from array import array +from struct import unpack as struct_unpack IF PY2K: import binascii @@ -75,22 +76,24 @@ cdef tuple pointer_to_tuple(int n, Py_ssize_t* p): @cython.initializedcheck(False) @cython.nonecheck(False) @cython.wraparound(False) -cdef list cvmemoryview_to_list(cvmemoryview mv): +cdef list pointer_to_list(int n, Py_ssize_t* shape, Py_ssize_t* strides, + bytes fmt, Py_ssize_t itemsize, const char* d): cdef list r - cdef cvmemoryview mv_i cdef object r_i cdef Py_ssize_t i, l - l = mv.view.shape[0] + l = shape[0] r = cpython.list.PyList_New(l) - if mv.view.ndim > 1: + if n > 1: for i in range(l): - mv_i = mv[i] - r_i = cvmemoryview_to_list(mv_i) + r_i = pointer_to_list( + n - 1, &shape[1], &strides[1], + fmt, itemsize, d + i * strides[0] + )
PyList_SET_ITEM_INC(r, i, r_i) else: for i in range(l): - r_i = mv[i] + r_i = struct_unpack(fmt, (d + i * strides[0])[:itemsize])[0] PyList_SET_ITEM_INC(r, i, r_i) return r @@ -282,13 +285,10 @@ cdef class cybuffer(object): cpdef list tolist(self): - cdef list r - cdef cvmemoryview mv - - mv = cvmemoryview(self, PyBUF_FULL_RO) - r = cvmemoryview_to_list(mv) - - return r + return pointer_to_list( + self._buf.ndim, self._shape, self._strides, + self._format, self.itemsize, self._buf.buf + ) def __getbuffer__(self, Py_buffer* buf, int flags):