Skip to content

Commit

Permalink
ARROW-6878: [Python] Fix creating array from list of dicts with bytes…
Browse files Browse the repository at this point in the history
… keys

Closes apache#5652 from pitrou/ARROW-6878 and squashes the following commits:

4b466c0 <Antoine Pitrou> ARROW-6878:  Fix creating array from list of dicts with bytes keys

Authored-by: Antoine Pitrou <pitrou@free.fr>
Signed-off-by: Antoine Pitrou <pitrou@free.fr>
  • Loading branch information
pitrou authored and kszucs committed Oct 21, 2019
1 parent 669e663 commit 4569f97
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 13 deletions.
77 changes: 64 additions & 13 deletions cpp/src/arrow/python/python_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -771,7 +771,8 @@ class StructConverter
num_fields_ = this->typed_builder_->num_fields();
DCHECK_EQ(num_fields_, struct_type->num_children());

field_name_list_.reset(PyList_New(num_fields_));
field_name_bytes_list_.reset(PyList_New(num_fields_));
field_name_unicode_list_.reset(PyList_New(num_fields_));
RETURN_IF_PYERROR();

// Initialize the child converters and field names
Expand All @@ -786,10 +787,13 @@ class StructConverter
value_converters_.push_back(std::move(value_converter));

// Store the field name as a PyObject, for dict matching
PyObject* nameobj =
PyObject* bytesobj =
PyBytes_FromStringAndSize(field_name.c_str(), field_name.size());
PyObject* unicodeobj =
PyUnicode_FromStringAndSize(field_name.c_str(), field_name.size());
RETURN_IF_PYERROR();
PyList_SET_ITEM(field_name_list_.obj(), i, nameobj);
PyList_SET_ITEM(field_name_bytes_list_.obj(), i, bytesobj);
PyList_SET_ITEM(field_name_unicode_list_.obj(), i, unicodeobj);
}

return Status::OK();
Expand All @@ -798,16 +802,16 @@ class StructConverter
Status AppendItem(PyObject* obj) {
RETURN_NOT_OK(this->typed_builder_->Append());
// Note heterogenous sequences are not allowed
if (ARROW_PREDICT_FALSE(source_kind_ == UNKNOWN)) {
if (ARROW_PREDICT_FALSE(source_kind_ == SourceKind::UNKNOWN)) {
if (PyDict_Check(obj)) {
source_kind_ = DICTS;
source_kind_ = SourceKind::DICTS;
} else if (PyTuple_Check(obj)) {
source_kind_ = TUPLES;
source_kind_ = SourceKind::TUPLES;
}
}
if (PyDict_Check(obj) && source_kind_ == DICTS) {
if (PyDict_Check(obj) && source_kind_ == SourceKind::DICTS) {
return AppendDictItem(obj);
} else if (PyTuple_Check(obj) && source_kind_ == TUPLES) {
} else if (PyTuple_Check(obj) && source_kind_ == SourceKind::TUPLES) {
return AppendTupleItem(obj);
} else {
return internal::InvalidType(obj,
Expand All @@ -829,11 +833,52 @@ class StructConverter

protected:
Status AppendDictItem(PyObject* obj) {
// NOTE we're ignoring any extraneous dict items
if (dict_key_kind_ == DictKeyKind::UNICODE) {
return AppendDictItemWithUnicodeKeys(obj);
}
if (dict_key_kind_ == DictKeyKind::BYTES) {
return AppendDictItemWithBytesKeys(obj);
}
for (int i = 0; i < num_fields_; i++) {
PyObject* nameobj = PyList_GET_ITEM(field_name_list_.obj(), i);
PyObject* valueobj = PyDict_GetItem(obj, nameobj); // borrowed
PyObject* nameobj = PyList_GET_ITEM(field_name_unicode_list_.obj(), i);
PyObject* valueobj = PyDict_GetItem(obj, nameobj);
if (valueobj != NULL) {
dict_key_kind_ = DictKeyKind::UNICODE;
return AppendDictItemWithUnicodeKeys(obj);
}
RETURN_IF_PYERROR();
// Unicode key not present, perhaps bytes key is?
nameobj = PyList_GET_ITEM(field_name_bytes_list_.obj(), i);
valueobj = PyDict_GetItem(obj, nameobj);
if (valueobj != NULL) {
dict_key_kind_ = DictKeyKind::BYTES;
return AppendDictItemWithBytesKeys(obj);
}
RETURN_IF_PYERROR();
}
// If we come here, it means all keys are absent
for (int i = 0; i < num_fields_; i++) {
RETURN_NOT_OK(value_converters_[i]->AppendSingleVirtual(Py_None));
}
return Status::OK();
}

Status AppendDictItemWithBytesKeys(PyObject* obj) {
return AppendDictItem(obj, field_name_bytes_list_.obj());
}

Status AppendDictItemWithUnicodeKeys(PyObject* obj) {
return AppendDictItem(obj, field_name_unicode_list_.obj());
}

Status AppendDictItem(PyObject* obj, PyObject* field_name_list) {
// NOTE we're ignoring any extraneous dict items
for (int i = 0; i < num_fields_; i++) {
PyObject* nameobj = PyList_GET_ITEM(field_name_list, i); // borrowed
PyObject* valueobj = PyDict_GetItem(obj, nameobj); // borrowed
if (valueobj == NULL) {
RETURN_IF_PYERROR();
}
RETURN_NOT_OK(
value_converters_[i]->AppendSingleVirtual(valueobj ? valueobj : Py_None));
}
Expand All @@ -852,10 +897,16 @@ class StructConverter
}

std::vector<std::unique_ptr<SeqConverter>> value_converters_;
OwnedRef field_name_list_;
OwnedRef field_name_unicode_list_;
OwnedRef field_name_bytes_list_;
int num_fields_;
// Whether we're converting from a sequence of dicts or tuples
enum { UNKNOWN, DICTS, TUPLES } source_kind_ = UNKNOWN;
enum class SourceKind { UNKNOWN, DICTS, TUPLES } source_kind_ = SourceKind::UNKNOWN;
enum class DictKeyKind {
UNKNOWN,
BYTES,
UNICODE
} dict_key_kind_ = DictKeyKind::UNKNOWN;
bool from_pandas_;
bool strict_conversions_;
};
Expand Down
17 changes: 17 additions & 0 deletions python/pyarrow/tests/test_convert_builtin.py
Original file line number Diff line number Diff line change
Expand Up @@ -1099,6 +1099,23 @@ def test_struct_from_dicts():
assert arr.to_pylist() == expected


def test_struct_from_dicts_bytes_keys():
# ARROW-6878
ty = pa.struct([pa.field('a', pa.int32()),
pa.field('b', pa.string()),
pa.field('c', pa.bool_())])
arr = pa.array([], type=ty)
assert arr.to_pylist() == []

data = [{b'a': 5, b'b': 'foo'},
{b'a': 6, b'c': False}]
arr = pa.array(data, type=ty)
assert arr.to_pylist() == [
{'a': 5, 'b': 'foo', 'c': None},
{'a': 6, 'b': None, 'c': False},
]


def test_struct_from_tuples():
ty = pa.struct([pa.field('a', pa.int32()),
pa.field('b', pa.string()),
Expand Down

0 comments on commit 4569f97

Please sign in to comment.