Skip to content

Commit

Permalink
ARROW-6878: [Python] Fix creating array from list of dicts with bytes…
Browse files Browse the repository at this point in the history
… keys
  • Loading branch information
pitrou committed Oct 14, 2019
1 parent d7ad509 commit 4b466c0
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 13 deletions.
77 changes: 64 additions & 13 deletions cpp/src/arrow/python/python_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -824,7 +824,8 @@ class StructConverter
num_fields_ = this->typed_builder_->num_fields();
DCHECK_EQ(num_fields_, struct_type->num_children());

field_name_list_.reset(PyList_New(num_fields_));
field_name_bytes_list_.reset(PyList_New(num_fields_));
field_name_unicode_list_.reset(PyList_New(num_fields_));
RETURN_IF_PYERROR();

// Initialize the child converters and field names
Expand All @@ -839,10 +840,13 @@ class StructConverter
value_converters_.push_back(std::move(value_converter));

// Store the field name as a PyObject, for dict matching
PyObject* nameobj =
PyObject* bytesobj =
PyBytes_FromStringAndSize(field_name.c_str(), field_name.size());
PyObject* unicodeobj =
PyUnicode_FromStringAndSize(field_name.c_str(), field_name.size());
RETURN_IF_PYERROR();
PyList_SET_ITEM(field_name_list_.obj(), i, nameobj);
PyList_SET_ITEM(field_name_bytes_list_.obj(), i, bytesobj);
PyList_SET_ITEM(field_name_unicode_list_.obj(), i, unicodeobj);
}

return Status::OK();
Expand All @@ -851,16 +855,16 @@ class StructConverter
Status AppendItem(PyObject* obj) {
RETURN_NOT_OK(this->typed_builder_->Append());
// Note heterogenous sequences are not allowed
if (ARROW_PREDICT_FALSE(source_kind_ == UNKNOWN)) {
if (ARROW_PREDICT_FALSE(source_kind_ == SourceKind::UNKNOWN)) {
if (PyDict_Check(obj)) {
source_kind_ = DICTS;
source_kind_ = SourceKind::DICTS;
} else if (PyTuple_Check(obj)) {
source_kind_ = TUPLES;
source_kind_ = SourceKind::TUPLES;
}
}
if (PyDict_Check(obj) && source_kind_ == DICTS) {
if (PyDict_Check(obj) && source_kind_ == SourceKind::DICTS) {
return AppendDictItem(obj);
} else if (PyTuple_Check(obj) && source_kind_ == TUPLES) {
} else if (PyTuple_Check(obj) && source_kind_ == SourceKind::TUPLES) {
return AppendTupleItem(obj);
} else {
return internal::InvalidType(obj,
Expand All @@ -882,11 +886,52 @@ class StructConverter

protected:
Status AppendDictItem(PyObject* obj) {
// NOTE we're ignoring any extraneous dict items
if (dict_key_kind_ == DictKeyKind::UNICODE) {
return AppendDictItemWithUnicodeKeys(obj);
}
if (dict_key_kind_ == DictKeyKind::BYTES) {
return AppendDictItemWithBytesKeys(obj);
}
for (int i = 0; i < num_fields_; i++) {
PyObject* nameobj = PyList_GET_ITEM(field_name_list_.obj(), i);
PyObject* valueobj = PyDict_GetItem(obj, nameobj); // borrowed
PyObject* nameobj = PyList_GET_ITEM(field_name_unicode_list_.obj(), i);
PyObject* valueobj = PyDict_GetItem(obj, nameobj);
if (valueobj != NULL) {
dict_key_kind_ = DictKeyKind::UNICODE;
return AppendDictItemWithUnicodeKeys(obj);
}
RETURN_IF_PYERROR();
// Unicode key not present, perhaps bytes key is?
nameobj = PyList_GET_ITEM(field_name_bytes_list_.obj(), i);
valueobj = PyDict_GetItem(obj, nameobj);
if (valueobj != NULL) {
dict_key_kind_ = DictKeyKind::BYTES;
return AppendDictItemWithBytesKeys(obj);
}
RETURN_IF_PYERROR();
}
// If we come here, it means all keys are absent
for (int i = 0; i < num_fields_; i++) {
RETURN_NOT_OK(value_converters_[i]->AppendSingleVirtual(Py_None));
}
return Status::OK();
}

Status AppendDictItemWithBytesKeys(PyObject* obj) {
return AppendDictItem(obj, field_name_bytes_list_.obj());
}

Status AppendDictItemWithUnicodeKeys(PyObject* obj) {
return AppendDictItem(obj, field_name_unicode_list_.obj());
}

Status AppendDictItem(PyObject* obj, PyObject* field_name_list) {
// NOTE we're ignoring any extraneous dict items
for (int i = 0; i < num_fields_; i++) {
PyObject* nameobj = PyList_GET_ITEM(field_name_list, i); // borrowed
PyObject* valueobj = PyDict_GetItem(obj, nameobj); // borrowed
if (valueobj == NULL) {
RETURN_IF_PYERROR();
}
RETURN_NOT_OK(
value_converters_[i]->AppendSingleVirtual(valueobj ? valueobj : Py_None));
}
Expand All @@ -905,10 +950,16 @@ class StructConverter
}

std::vector<std::unique_ptr<SeqConverter>> value_converters_;
OwnedRef field_name_list_;
OwnedRef field_name_unicode_list_;
OwnedRef field_name_bytes_list_;
int num_fields_;
// Whether we're converting from a sequence of dicts or tuples
enum { UNKNOWN, DICTS, TUPLES } source_kind_ = UNKNOWN;
enum class SourceKind { UNKNOWN, DICTS, TUPLES } source_kind_ = SourceKind::UNKNOWN;
enum class DictKeyKind {
UNKNOWN,
BYTES,
UNICODE
} dict_key_kind_ = DictKeyKind::UNKNOWN;
bool from_pandas_;
bool strict_conversions_;
};
Expand Down
17 changes: 17 additions & 0 deletions python/pyarrow/tests/test_convert_builtin.py
Original file line number Diff line number Diff line change
Expand Up @@ -1216,6 +1216,23 @@ def test_struct_from_dicts():
assert arr.to_pylist() == expected


def test_struct_from_dicts_bytes_keys():
# ARROW-6878
ty = pa.struct([pa.field('a', pa.int32()),
pa.field('b', pa.string()),
pa.field('c', pa.bool_())])
arr = pa.array([], type=ty)
assert arr.to_pylist() == []

data = [{b'a': 5, b'b': 'foo'},
{b'a': 6, b'c': False}]
arr = pa.array(data, type=ty)
assert arr.to_pylist() == [
{'a': 5, 'b': 'foo', 'c': None},
{'a': 6, 'b': None, 'c': False},
]


def test_struct_from_tuples():
ty = pa.struct([pa.field('a', pa.int32()),
pa.field('b', pa.string()),
Expand Down

0 comments on commit 4b466c0

Please sign in to comment.