From 4b466c02be85c02e937c7604300b259f7dde356f Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 14 Oct 2019 16:08:20 -0400 Subject: [PATCH] ARROW-6878: [Python] Fix creating array from list of dicts with bytes keys --- cpp/src/arrow/python/python_to_arrow.cc | 77 ++++++++++++++++---- python/pyarrow/tests/test_convert_builtin.py | 17 +++++ 2 files changed, 81 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 0bccd52ebe5b3..235dee5ef5271 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -824,7 +824,8 @@ class StructConverter num_fields_ = this->typed_builder_->num_fields(); DCHECK_EQ(num_fields_, struct_type->num_children()); - field_name_list_.reset(PyList_New(num_fields_)); + field_name_bytes_list_.reset(PyList_New(num_fields_)); + field_name_unicode_list_.reset(PyList_New(num_fields_)); RETURN_IF_PYERROR(); // Initialize the child converters and field names @@ -839,10 +840,13 @@ class StructConverter value_converters_.push_back(std::move(value_converter)); // Store the field name as a PyObject, for dict matching - PyObject* nameobj = + PyObject* bytesobj = + PyBytes_FromStringAndSize(field_name.c_str(), field_name.size()); + PyObject* unicodeobj = PyUnicode_FromStringAndSize(field_name.c_str(), field_name.size()); RETURN_IF_PYERROR(); - PyList_SET_ITEM(field_name_list_.obj(), i, nameobj); + PyList_SET_ITEM(field_name_bytes_list_.obj(), i, bytesobj); + PyList_SET_ITEM(field_name_unicode_list_.obj(), i, unicodeobj); } return Status::OK(); @@ -851,16 +855,16 @@ class StructConverter Status AppendItem(PyObject* obj) { RETURN_NOT_OK(this->typed_builder_->Append()); // Note heterogenous sequences are not allowed - if (ARROW_PREDICT_FALSE(source_kind_ == UNKNOWN)) { + if (ARROW_PREDICT_FALSE(source_kind_ == SourceKind::UNKNOWN)) { if (PyDict_Check(obj)) { - source_kind_ = DICTS; + source_kind_ = SourceKind::DICTS; } else if (PyTuple_Check(obj)) { - source_kind_ = TUPLES; + source_kind_ = SourceKind::TUPLES; } } - if (PyDict_Check(obj) && source_kind_ == DICTS) { + if (PyDict_Check(obj) && source_kind_ == SourceKind::DICTS) { return AppendDictItem(obj); - } else if (PyTuple_Check(obj) && source_kind_ == TUPLES) { + } else if (PyTuple_Check(obj) && source_kind_ == SourceKind::TUPLES) { return AppendTupleItem(obj); } else { return internal::InvalidType(obj, @@ -882,11 +886,52 @@ class StructConverter protected: Status AppendDictItem(PyObject* obj) { - // NOTE we're ignoring any extraneous dict items + if (dict_key_kind_ == DictKeyKind::UNICODE) { + return AppendDictItemWithUnicodeKeys(obj); + } + if (dict_key_kind_ == DictKeyKind::BYTES) { + return AppendDictItemWithBytesKeys(obj); + } for (int i = 0; i < num_fields_; i++) { - PyObject* nameobj = PyList_GET_ITEM(field_name_list_.obj(), i); - PyObject* valueobj = PyDict_GetItem(obj, nameobj); // borrowed + PyObject* nameobj = PyList_GET_ITEM(field_name_unicode_list_.obj(), i); + PyObject* valueobj = PyDict_GetItem(obj, nameobj); + if (valueobj != NULL) { + dict_key_kind_ = DictKeyKind::UNICODE; + return AppendDictItemWithUnicodeKeys(obj); + } + RETURN_IF_PYERROR(); + // Unicode key not present, perhaps bytes key is? + nameobj = PyList_GET_ITEM(field_name_bytes_list_.obj(), i); + valueobj = PyDict_GetItem(obj, nameobj); + if (valueobj != NULL) { + dict_key_kind_ = DictKeyKind::BYTES; + return AppendDictItemWithBytesKeys(obj); + } RETURN_IF_PYERROR(); + } + // If we come here, it means all keys are absent + for (int i = 0; i < num_fields_; i++) { + RETURN_NOT_OK(value_converters_[i]->AppendSingleVirtual(Py_None)); + } + return Status::OK(); + } + + Status AppendDictItemWithBytesKeys(PyObject* obj) { + return AppendDictItem(obj, field_name_bytes_list_.obj()); + } + + Status AppendDictItemWithUnicodeKeys(PyObject* obj) { + return AppendDictItem(obj, field_name_unicode_list_.obj()); + } + + Status AppendDictItem(PyObject* obj, PyObject* field_name_list) { + // NOTE we're ignoring any extraneous dict items + for (int i = 0; i < num_fields_; i++) { + PyObject* nameobj = PyList_GET_ITEM(field_name_list, i); // borrowed + PyObject* valueobj = PyDict_GetItem(obj, nameobj); // borrowed + if (valueobj == NULL) { + RETURN_IF_PYERROR(); + } RETURN_NOT_OK( value_converters_[i]->AppendSingleVirtual(valueobj ? valueobj : Py_None)); } @@ -905,10 +950,16 @@ class StructConverter } std::vector> value_converters_; - OwnedRef field_name_list_; + OwnedRef field_name_unicode_list_; + OwnedRef field_name_bytes_list_; int num_fields_; // Whether we're converting from a sequence of dicts or tuples - enum { UNKNOWN, DICTS, TUPLES } source_kind_ = UNKNOWN; + enum class SourceKind { UNKNOWN, DICTS, TUPLES } source_kind_ = SourceKind::UNKNOWN; + enum class DictKeyKind { + UNKNOWN, + BYTES, + UNICODE + } dict_key_kind_ = DictKeyKind::UNKNOWN; bool from_pandas_; bool strict_conversions_; }; diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index c4bc360e9078c..aff55eedd381c 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -1216,6 +1216,23 @@ def test_struct_from_dicts(): assert arr.to_pylist() == expected +def test_struct_from_dicts_bytes_keys(): + # ARROW-6878 + ty = pa.struct([pa.field('a', pa.int32()), + pa.field('b', pa.string()), + pa.field('c', pa.bool_())]) + arr = pa.array([], type=ty) + assert arr.to_pylist() == [] + + data = [{b'a': 5, b'b': 'foo'}, + {b'a': 6, b'c': False}] + arr = pa.array(data, type=ty) + assert arr.to_pylist() == [ + {'a': 5, 'b': 'foo', 'c': None}, + {'a': 6, 'b': None, 'c': False}, + ] + + def test_struct_from_tuples(): ty = pa.struct([pa.field('a', pa.int32()), pa.field('b', pa.string()),