From c533035f3d8feae2f2ab95b9ef0b67be9eed7235 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 23 Nov 2018 22:42:33 -0500 Subject: [PATCH 1/5] Only patch-up `array` buffer on Python 2/3 When initializing `cybuffer` with an `array, ensure that we only patch up the `format` and associated attributes for the `array` buffer on Python 2/3. This workaround is needed for Python 2 as the `array` type does not support the (new) buffer protocol. So we need to use the old buffer protocol and then the (new) buffer protocol with that result. This loses information about the `array` in the process, which is the bulk of the reason this is needed. Python 3's `array` supports the (new) buffer protocol; so, we can largely ignore these workarounds. However on Python 2/3, we have the `"u"` (unicode) format for the `array`, which is a little broken. So we workaround those issues by casting the array to an unsigned integer of equivalent width to the unicode type. However the `"u"` format for `array` is deprecated and scheduled for removal in Python 4. So there should be no need to do any of this work on Python 4. As such, check whether Python 2/3 is in use when checking if the data is provided is an `array`. That way this whole branch will automatically be removed on Python 4 when it lands. --- src/cybuffer.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cybuffer.pyx b/src/cybuffer.pyx index 883d42d..7b34fe6 100644 --- a/src/cybuffer.pyx +++ b/src/cybuffer.pyx @@ -162,7 +162,7 @@ cdef class cybuffer(object): # Workaround some special cases with the builtin array cdef size_t len_nd_b cdef int n_1 - if isinstance(self.obj, array): + if (PY2K or PY3K) and isinstance(self.obj, array): # Fix-up typecode typecode = self.obj.typecode if typecode == "B": @@ -170,7 +170,7 @@ cdef class cybuffer(object): elif PY2K and typecode == "c": self._format = UBYTE_TC return - elif (PY2K or PY3K) and typecode == "u": + elif typecode == "u": if Py_UNICODE_SIZE == 2: self._format = UCS2_TC elif Py_UNICODE_SIZE == 4: From 071aade2e3e8d6d9990d9f0167fc0d7cf88f1c67 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 23 Nov 2018 23:14:08 -0500 Subject: [PATCH 2/5] Define `Py_UNICODE_SIZE` at Cython compile-time Previously we dealt with `Py_UNICODE_SIZE` as a C compile-time constant, which would inject the value during the C preprocessor pass. Later compilation optimizations by the C compiler likely removed unneeded branches caused by this. Though this is pretty needless as Cython is generating these branches that we later drop. Not to mention Cython sometimes assigns constants like these to intermediate or stack local variables, which can hinder compiler optimizations. It would be better yet to have Cython not define these branches at all. To that end, we make `Py_UNICODE_SIZE` a Cython compile-time constant that is set in `config.pxi` along with other compile-time constants. As a result, Cython injects these directly in `cybuffer.pxd` and `cybuffer.pxi` before even performing any compilation. Once it goes to compile the Cython code to C, it recognizes `Py_UNICODE_SIZE` is a compile-time constant and evaluates all expressions it is used in. As a result, the Cython branches with `Py_UNICODE_SIZE` are eliminated during compile-time leaving only one unconditional code path in C. In order to discern what `Py_UNICODE_SIZE` should be at the Python level, we inspect the `itemsize` of an empty unicode (i.e. `"u"`) `array`'s `itemsize` in `setup.py`. Based on this we can determine `Py_UNICODE_SIZE`'s value. We then inject this into `config.pxi`. This value is only inspected and added for Python 2/3 as the unicode `array` is scheduled for removal in Python 4. As we only check this value within a branch of code that should be removed on Python 4 anyways, leaving `Py_UNICODE_SIZE` undefined shouldn't be an issue for Python 4 builds. While `Py_UNICODE_SIZE` is defined in Python headers, we don't need to be worried about our definition of `Py_UNICODE_SIZE` clashing with it. The reason being Cython guarantees the assigned value will injected into the source. So the definition will have already been eliminated by the time the C code is generated. Further we only use `Py_UNICODE_SIZE` in comparisons to other constants. So Cython will inject the value and then evaluate these comparisons resulting in them being turned into always `True` or `False` branches. These in turn are optimized away. --- setup.py | 7 +++++++ src/cybuffer.pyx | 2 -- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 42592e4..0745845 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import array import glob import os import sys @@ -68,6 +69,12 @@ def run_tests(self): "DEF PY2K = " + str(sys.version_info.major == 2) + "\n", "DEF PY3K = " + str(sys.version_info.major == 3) + "\n" ]) + if sys.version_info.major < 4: + Py_UNICODE_SIZE = array.array('u').itemsize + f.writelines([ + "DEF Py_UNICODE_SIZE = " + str(Py_UNICODE_SIZE) + "\n", + ]) + with open("src/version.pxi", "w") as f: f.writelines([ "__version__ = " + "\"" + str(version) + "\"" diff --git a/src/cybuffer.pyx b/src/cybuffer.pyx index 7b34fe6..e79293b 100644 --- a/src/cybuffer.pyx +++ b/src/cybuffer.pyx @@ -33,8 +33,6 @@ include "version.pxi" cdef extern from "Python.h": - size_t Py_UNICODE_SIZE - object PyMemoryView_FromObject(object obj) From c89c982ba39833e6a6836b77651c571c4209dad0 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 23 Nov 2018 23:28:47 -0500 Subject: [PATCH 3/5] Drop character array format assignment on Python 2 On Python 2, the `format` of all `array`s is `uint8` (i.e. `"B"`). The reason being that all `array`s on Python 2 only support the old buffer protocol. As such they need to be converted through that path to get access to use them with the (new) buffer protocol. Doing so results in such objects being treated as if they are of `uint8` type. Since that is how we want character (i.e. `"c"`) arrays treated on Python 2 (they don't exist on Python 3), then there is nothing we need to here. This includes changing the `_format`. So drop that line. --- src/cybuffer.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/src/cybuffer.pyx b/src/cybuffer.pyx index e79293b..6c92bba 100644 --- a/src/cybuffer.pyx +++ b/src/cybuffer.pyx @@ -166,7 +166,6 @@ cdef class cybuffer(object): if typecode == "B": return elif PY2K and typecode == "c": - self._format = UBYTE_TC return elif typecode == "u": if Py_UNICODE_SIZE == 2: From 35781156db426f22afad1a6e979da9c334dc810a Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 23 Nov 2018 23:35:30 -0500 Subject: [PATCH 4/5] Fuse no-op `array` cases Go ahead and join the different no-op cases with `array` types into one condition. Just makes it easier to see that all of these are just being skipped. Besides Cython will break this out however it sees fit in C code. So there is no benefit to us doing anything special here. --- src/cybuffer.pyx | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/cybuffer.pyx b/src/cybuffer.pyx index 6c92bba..549080f 100644 --- a/src/cybuffer.pyx +++ b/src/cybuffer.pyx @@ -163,9 +163,7 @@ cdef class cybuffer(object): if (PY2K or PY3K) and isinstance(self.obj, array): # Fix-up typecode typecode = self.obj.typecode - if typecode == "B": - return - elif PY2K and typecode == "c": + if typecode == "B" or (PY2K and typecode == "c"): return elif typecode == "u": if Py_UNICODE_SIZE == 2: From 7b0903904c701f162e06c72945a1a8d2580d7ba7 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 24 Nov 2018 00:23:39 -0500 Subject: [PATCH 5/5] Set itemsize and format for `array` on Python 2 Go ahead and set `itemsize` at the same time that `format` is being set for the Python 2 `array` cases. These two properties are connected and naturally go together. So this makes a bit more sense from a code readability point of view. --- src/cybuffer.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/cybuffer.pyx b/src/cybuffer.pyx index 549080f..0057e1b 100644 --- a/src/cybuffer.pyx +++ b/src/cybuffer.pyx @@ -166,17 +166,18 @@ cdef class cybuffer(object): if typecode == "B" or (PY2K and typecode == "c"): return elif typecode == "u": + if PY2K: + self.itemsize = Py_UNICODE_SIZE if Py_UNICODE_SIZE == 2: self._format = UCS2_TC elif Py_UNICODE_SIZE == 4: self._format = UCS4_TC elif PY2K: + self.itemsize = self.obj.itemsize self._format = typecode - # Adjust itemsize, shape, and strides based on casting + # Adjust shape and strides based on casting if PY2K: - self.itemsize = self.obj.itemsize - len_nd_b = self._buf.ndim * sizeof(Py_ssize_t) self._shape = cpython.mem.PyMem_Malloc(len_nd_b) self._strides = cpython.mem.PyMem_Malloc(len_nd_b)