Skip to content

Commit

Permalink
feat: use schema-ed json as default to_dict/json (#57)
Browse files Browse the repository at this point in the history
  • Loading branch information
hanxiao committed Jan 17, 2022
1 parent 86b03f3 commit a96787e
Show file tree
Hide file tree
Showing 13 changed files with 212 additions and 85 deletions.
2 changes: 1 addition & 1 deletion docarray/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '0.2.1'
__version__ = '0.3.0'

from .document import Document
from .array import DocumentArray
13 changes: 5 additions & 8 deletions docarray/array/mixins/io/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,15 +72,12 @@ def save_csv(
if with_header:
writer.writeheader()

from .... import Document

for d in self:
_d = d
if exclude_fields:
_d = Document(d, copy=True)
_d.pop(*exclude_fields)

pd = _d.to_dict()
pd = d.to_dict(
protocol='jsonschema',
exclude=set(exclude_fields) if exclude_fields else None,
exclude_none=True,
)
if flatten_tags:
t = pd.pop('tags')
pd.update({f'tag__{k}': v for k, v in t.items()})
Expand Down
44 changes: 28 additions & 16 deletions docarray/array/mixins/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,15 @@
class JsonIOMixin:
"""Save/load an array into a JSON file."""

def save_json(self, file: Union[str, TextIO]) -> None:
def save_json(
self, file: Union[str, TextIO], protocol: str = 'jsonschema', **kwargs
) -> None:
"""Save array elements into a JSON file.
Compared to :meth:`save_binary`, it is human-readable but slower to save/load, and the file size is larger.
:param file: File or filename to which the data is saved.
:param protocol: `jsonschema` or `protobuf`
"""
if hasattr(file, 'write'):
file_ctx = nullcontext(file)
Expand All @@ -24,15 +27,17 @@ def save_json(self, file: Union[str, TextIO]) -> None:

with file_ctx as fp:
for d in self:
json.dump(d.to_dict(), fp)
json.dump(d.to_dict(protocol=protocol, **kwargs), fp)
fp.write('\n')

@classmethod
def load_json(cls: Type['T'], file: Union[str, TextIO]) -> 'T':
def load_json(
cls: Type['T'], file: Union[str, TextIO], protocol: str = 'jsonschema', **kwargs
) -> 'T':
"""Load array elements from a JSON file.
:param file: File or filename or a JSON string from which the data is loaded.
:param protocol: `jsonschema` or `protobuf`
:return: a DocumentArrayLike object
"""

Expand All @@ -48,31 +53,38 @@ def load_json(cls: Type['T'], file: Union[str, TextIO]) -> 'T':
constructor = Document.from_dict

with file_ctx as fp:
return cls(constructor(v) for v in fp)
return cls(constructor(v, protocol=protocol, **kwargs) for v in fp)

@classmethod
def from_json(cls: Type['T'], file: Union[str, TextIO]) -> 'T':
return cls.load_json(file)
def from_json(
cls: Type['T'], file: Union[str, TextIO], protocol: str = 'jsonschema', **kwargs
) -> 'T':
return cls.load_json(file, protocol=protocol, **kwargs)

@classmethod
def from_list(cls: Type['T'], values: List) -> 'T':
def from_list(
cls: Type['T'], values: List, protocol: str = 'jsonschema', **kwargs
) -> 'T':
from .... import Document

return cls(Document.from_dict(v) for v in values)
return cls(Document.from_dict(v, protocol=protocol, **kwargs) for v in values)

def to_list(self, strict: bool = True) -> List:
def to_list(self, protocol: str = 'jsonschema', **kwargs) -> List:
"""Convert the object into a Python list.
.. note::
Array like object such as :class:`numpy.ndarray` will be converted to Python list.
:param protocol: `jsonschema` or `protobuf`
:return: a Python list
"""
return [d.to_dict(strict=strict) for d in self]
return [d.to_dict(protocol=protocol, **kwargs) for d in self]

def to_json(self) -> str:
def to_json(self, protocol: str = 'jsonschema', **kwargs) -> str:
"""Convert the object into a JSON string. Can be loaded via :meth:`.load_json`.
:param protocol: `jsonschema` or `protobuf`
:return: a JSON string
"""
return json.dumps(self.to_list())
return json.dumps(self.to_list(protocol=protocol, **kwargs))

# to comply with Document interfaces but less semantically accurate
to_dict = to_list
from_dict = from_list
2 changes: 1 addition & 1 deletion docarray/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def __init__(
if _unknown_kwargs and unknown_fields_handler == 'catch':
getattr(self, self._unresolved_fields_dest).update(_unknown_kwargs)

if _obj is None and not kwargs and self._data is None:
if not _obj and not kwargs and self._data is None:
self._data = self._data_class(self)

if self._data is None:
Expand Down
99 changes: 76 additions & 23 deletions docarray/document/mixins/porting.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import dataclasses
import pickle
import warnings
from typing import Optional, TYPE_CHECKING, Type, Dict, Any
import base64

Expand All @@ -11,32 +12,76 @@

class PortingMixin:
@classmethod
def from_dict(cls: Type['T'], obj: Dict) -> 'T':
from google.protobuf import json_format
from ...proto.docarray_pb2 import DocumentProto
def from_dict(
cls: Type['T'], obj: Dict, protocol: str = 'jsonschema', **kwargs
) -> 'T':
"""Convert a dict object into a Document.
:param obj: a Python dict object
:param protocol: `jsonschema` or `protobuf`
:param kwargs: extra key-value args passed to the pydantic or protobuf parser.
:return: the parsed Document object
"""
if protocol == 'jsonschema':
from ..pydantic_model import PydanticDocument

return cls.from_pydantic_model(PydanticDocument.parse_obj(obj, **kwargs))
elif protocol == 'protobuf':
from google.protobuf import json_format
from ...proto.docarray_pb2 import DocumentProto

pb_msg = DocumentProto()
json_format.ParseDict(obj, pb_msg)
return cls.from_protobuf(pb_msg)
pb_msg = DocumentProto()
json_format.ParseDict(obj, pb_msg, **kwargs)
return cls.from_protobuf(pb_msg)
else:
raise ValueError(f'protocol=`{protocol}` is not supported')

@classmethod
def from_json(cls: Type['T'], obj: str) -> 'T':
from google.protobuf import json_format
from ...proto.docarray_pb2 import DocumentProto
def from_json(
cls: Type['T'], obj: str, protocol: str = 'jsonschema', **kwargs
) -> 'T':
"""Convert a JSON string into a Document.
:param obj: a valid JSON string
:param protocol: `jsonschema` or `protobuf`
:param kwargs: extra key-value args passed to the pydantic or protobuf parser.
:return: the parsed Document object
"""
if protocol == 'jsonschema':
from ..pydantic_model import PydanticDocument

return cls.from_pydantic_model(PydanticDocument.parse_raw(obj, **kwargs))
elif protocol == 'protobuf':
from google.protobuf import json_format
from ...proto.docarray_pb2 import DocumentProto

pb_msg = DocumentProto()
json_format.Parse(obj, pb_msg, **kwargs)
return cls.from_protobuf(pb_msg)
else:
raise ValueError(f'protocol=`{protocol}` is not supported')

pb_msg = DocumentProto()
json_format.Parse(obj, pb_msg)
return cls.from_protobuf(pb_msg)
def to_dict(self, protocol: str = 'jsonschema', **kwargs) -> Dict[str, Any]:
"""Convert itself into a Python dict object.
def to_dict(self, strict: bool = True) -> Dict[str, Any]:
if strict:
:param protocol: `jsonschema` or `protobuf`
:param kwargs: extra key-value args passed to the pydantic or protobuf dumper.
:return: the dumped Document as a dict object
"""
if protocol == 'jsonschema':
return self.to_pydantic_model().dict(**kwargs)
elif protocol == 'protobuf':
from google.protobuf.json_format import MessageToDict

return MessageToDict(
self.to_protobuf(),
preserving_proto_field_name=True,
**kwargs,
)
else:
warnings.warn(
f'protocol=`{protocol}` is not supported, '
f'the result dict is a Python dynamic typing dict without any promise on the schema.'
)
return dataclasses.asdict(self._data)

def to_bytes(
Expand Down Expand Up @@ -68,25 +113,33 @@ def from_bytes(
"""
bstr = decompress_bytes(data, algorithm=compress)
if protocol == 'pickle':
d = pickle.loads(bstr)
return pickle.loads(bstr)
elif protocol == 'protobuf':
from ...proto.docarray_pb2 import DocumentProto

pb_msg = DocumentProto()
pb_msg.ParseFromString(bstr)
d = cls.from_protobuf(pb_msg)
return cls.from_protobuf(pb_msg)
else:
raise ValueError(
f'protocol={protocol} is not supported. Can be only `protobuf` or pickle protocols 0-5.'
)
return d

def to_json(self) -> str:
from google.protobuf.json_format import MessageToJson
def to_json(self, protocol: str = 'jsonschema', **kwargs) -> str:
"""Convert itself into a JSON string.
:param protocol: `jsonschema` or `protobuf`
:param kwargs: extra key-value args passed to the pydantic or protobuf dumper.
:return: the dumped JSON string
"""
if protocol == 'jsonschema':
return self.to_pydantic_model().json(**kwargs)
elif protocol == 'protobuf':
from google.protobuf.json_format import MessageToJson

return MessageToJson(
self.to_protobuf(), preserving_proto_field_name=True, sort_keys=True
)
return MessageToJson(self.to_protobuf(), **kwargs)
else:
raise ValueError(f'protocol={protocol} is not supported.')

def to_base64(
self, protocol: str = 'pickle', compress: Optional[str] = None
Expand Down
13 changes: 10 additions & 3 deletions docarray/document/mixins/pydantic.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,11 @@ def from_pydantic_model(cls: Type['T'], model: 'BaseModel') -> 'T':
from ... import Document

fields = {}
_field_chunks, _field_matches = None, None
if model.chunks:
fields['chunks'] = [Document.from_pydantic_model(d) for d in model.chunks]
_field_chunks = [Document.from_pydantic_model(d) for d in model.chunks]
if model.matches:
fields['matches'] = [Document.from_pydantic_model(d) for d in model.matches]
_field_matches = [Document.from_pydantic_model(d) for d in model.matches]

for (field, value) in model.dict(
exclude_none=True, exclude={'chunks', 'matches'}
Expand All @@ -66,4 +67,10 @@ def from_pydantic_model(cls: Type['T'], model: 'BaseModel') -> 'T':
fields[f_name] = np.array(value)
else:
fields[f_name] = value
return Document(**fields)

d = Document(**fields)
if _field_chunks:
d.chunks = _field_chunks
if _field_matches:
d.matches = _field_matches
return d
3 changes: 2 additions & 1 deletion docarray/document/pydantic_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@


def _convert_ndarray_to_list(v: 'ArrayType'):
return to_list(v)
if v is not None:
return to_list(v)


class PydanticDocument(BaseModel):
Expand Down
Loading

0 comments on commit a96787e

Please sign in to comment.