Skip to content

Commit

Permalink
refactor(proto): remove content_hash from builtin proto (#3058)
Browse files Browse the repository at this point in the history
* refactor(proto): remove content_hash from builtin proto

* refactor(proto): remove content_hash from builtin proto

* refactor(proto): remove content_hash from builtin proto

* refactor(proto): remove content_hash from builtin proto

* refactor(proto): remove content_hash from builtin proto

* refactor(proto): remove content_hash from builtin proto

* perf(proto): replace SerializeToString with SerializePartial

* perf(proto): replace SerializeToString with SerializePartial
  • Loading branch information
hanxiao committed Jul 31, 2021
1 parent b205a19 commit 390ff0a
Show file tree
Hide file tree
Showing 26 changed files with 160 additions and 273 deletions.
2 changes: 1 addition & 1 deletion jina/clients/base/websocket.py
Expand Up @@ -55,7 +55,7 @@ async def _get_results(
async def _send_requests(request_iterator):
next_request = None
for next_request in request_iterator:
await websocket.send(next_request.SerializeToString())
await websocket.send(next_request.SerializePartialToString())
self.num_requests += 1
# Check if there was any request generated
if next_request is not None:
Expand Down
4 changes: 1 addition & 3 deletions jina/clients/request/helper.py
Expand Up @@ -42,9 +42,7 @@ def _new_doc_from_data(
data, data_type: DataInputType, **kwargs
) -> Tuple['Document', 'DataInputType']:
def _build_doc_from_content():
with Document(**kwargs) as d:
d.content = data
return d, DataInputType.CONTENT
return Document(content=data, **kwargs), DataInputType.CONTENT

if data_type == DataInputType.AUTO or data_type == DataInputType.DOCUMENT:
if isinstance(data, Document):
Expand Down
3 changes: 0 additions & 3 deletions jina/proto/jina.proto
Expand Up @@ -84,9 +84,6 @@ message DocumentProto {
// A hexdigest that represents a unique document ID
string id = 1;

// A hexdigest that represents the hash of the content of the document
string content_hash = 24;

// the depth of the recursive chunk structure
uint32 granularity = 14;

Expand Down
133 changes: 63 additions & 70 deletions jina/proto/jina_pb2.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion jina/proto/serializer.py
Expand Up @@ -15,7 +15,7 @@ def SerializeToString(x: 'Request'):
# noqa: DAR102
# noqa: DAR201
"""
return x.proto.SerializeToString()
return x.proto.SerializePartialToString()

@staticmethod
def FromString(x: bytes):
Expand Down
13 changes: 8 additions & 5 deletions jina/types/arrays/chunk.py
Expand Up @@ -25,11 +25,11 @@ def __init__(self, doc_views, reference_doc: 'Document'):
self._ref_doc = reference_doc
super().__init__(doc_views)

def append(self, document: 'Document', **kwargs) -> 'Document':
def append(self, document: 'Document', copy: bool = True, **kwargs) -> 'Document':
"""Add a sub-document (i.e chunk) to the current Document.
:param document: Sub-document to be appended
:type document: :class:`Document`
:param copy: If set, then copy the original Document. Otherwise the original Document may get modified
:param kwargs: additional keyword arguments
:return: the newly added sub-document in :class:`Document` view
:rtype: :class:`Document`
Expand All @@ -39,17 +39,20 @@ def append(self, document: 'Document', **kwargs) -> 'Document':
make sure the added chunk is legit.
"""

from ..document import Document
if copy:
from ..document import Document

chunk = Document(document, copy=True)
chunk = Document(document, copy=True)
else:
# note: this is faster than Document(document, copy=False)
chunk = document

chunk.set_attributes(
parent_id=self._ref_doc.id, granularity=self.granularity, **kwargs
)

if not chunk.mime_type:
chunk.mime_type = self._ref_doc.mime_type
chunk.update_content_hash()
super().append(chunk)
return chunk

Expand Down
6 changes: 3 additions & 3 deletions jina/types/arrays/document.py
Expand Up @@ -201,7 +201,7 @@ def __iter__(self) -> Iterator['Document']:
from ..document import Document

for d in self._pb_body:
yield Document(d, hash_content=False)
yield Document(d)

def __contains__(self, item: str):
return item in self._id_to_index
Expand All @@ -210,7 +210,7 @@ def __getitem__(self, item: Union[int, str, slice]):
from ..document import Document

if isinstance(item, int):
return Document(self._pb_body[item], hash_content=False)
return Document(self._pb_body[item])
elif isinstance(item, str):
return self[self._id_to_index[item]]
elif isinstance(item, slice):
Expand Down Expand Up @@ -386,7 +386,7 @@ def save_binary(self, file: Union[str, BinaryIO]) -> None:
dap = jina_pb2.DocumentArrayProto()
if self._pb_body:
dap.docs.extend(self._pb_body)
fp.write(dap.SerializeToString())
fp.write(dap.SerializePartialToString())

def save_json(self, file: Union[str, TextIO]) -> None:
"""Save array elements into a JSON file.
Expand Down
3 changes: 1 addition & 2 deletions jina/types/arrays/match.py
Expand Up @@ -31,13 +31,12 @@ def append(self, document: 'Document', copy: bool = True, **kwargs) -> 'Document

match = Document(document, copy=True)
else:
# note: this is faster than Document(document, copy=False)
match = document

match.set_attributes(
granularity=self.granularity, adjacency=self.adjacency, **kwargs
)
for score in match.scores.values():
score.ref_id = self._ref_doc.id

super().append(match)
return match
Expand Down
56 changes: 15 additions & 41 deletions jina/types/document/__init__.py
Expand Up @@ -109,16 +109,6 @@ class Document(ProtoTypeMixin):
Jina requires each Document to have a string id. You can set a custom one,
or if non has been set a random one will be assigned.
Or you can use :class:`Document` as a context manager:
.. highlight:: python
.. code-block:: python
with Document() as d:
d.text = 'hello'
assert d.id # now `id` has value
To access and modify the content of the document, you can use :attr:`text`, :attr:`blob`, and :attr:`buffer`.
Each property is implemented with proper setter, to improve the integrity and user experience. For example,
assigning ``doc.blob`` or ``doc.embedding`` can be simply done via:
Expand Down Expand Up @@ -157,7 +147,6 @@ def __init__(
document: Optional[DocumentSourceType] = None,
field_resolver: Dict[str, str] = None,
copy: bool = False,
hash_content: bool = True,
**kwargs,
):
"""
Expand All @@ -173,7 +162,6 @@ def __init__(
names defined in Protobuf. This is only used when the given ``document`` is
a JSON string or a Python dict.
:param kwargs: other parameters to be set _after_ the document is constructed
:param hash_content: whether to hash the content of the Document
.. note::
Expand Down Expand Up @@ -291,8 +279,6 @@ def _update_doc(d: Dict):
f'Document content fields are mutually exclusive, please provide only one of {_all_doc_content_keys}'
)
self.set_attributes(**kwargs)
if hash_content and not copy:
self.update_content_hash()

def pop(self, *fields) -> None:
"""Remove the values from the given fields of this Document.
Expand Down Expand Up @@ -336,14 +322,6 @@ def modality(self, value: str):
"""
self._pb_body.modality = value

@property
def content_hash(self):
"""Get the content hash of the document.
:return: the content_hash from the proto
"""
return self._pb_body.content_hash

@property
def tags(self) -> StructView:
"""Return the `tags` field of this Document as a Python dict
Expand Down Expand Up @@ -401,7 +379,7 @@ def _update(
destination._pb_body.ClearField(field)
try:
setattr(destination, field, getattr(source, field))
except AttributeError: # some fields such as `content_hash` do not have a setter method.
except AttributeError:
setattr(destination._pb_body, field, getattr(source, field))

def update(
Expand All @@ -426,9 +404,14 @@ def update(
fields=fields,
)

def update_content_hash(
self,
fields: Tuple[str] = (
@property
def content_hash(self) -> str:
"""Get the document hash according to its content.
:return: the unique hash code to represent this Document
"""
# a tuple of field names that inclusive when computing content hash.
fields = (
'text',
'blob',
'buffer',
Expand All @@ -438,19 +421,14 @@ def update_content_hash(
'mime_type',
'granularity',
'adjacency',
),
) -> None:
"""Update the document hash according to its content.
:param fields: a tuple of field names that inclusive when computing content hash.
"""
)
masked_d = jina_pb2.DocumentProto()
present_fields = {
field_descriptor.name for field_descriptor, _ in self._pb_body.ListFields()
}
fields_to_hash = present_fields.intersection(fields)
FieldMask(paths=fields_to_hash).MergeMessage(self._pb_body, masked_d)
self._pb_body.content_hash = blake2b(
return blake2b(
masked_d.SerializePartialToString(), digest_size=DIGEST_SIZE
).hexdigest()

Expand Down Expand Up @@ -694,7 +672,7 @@ def set_attributes(self, **kwargs):
"""Bulk update Document fields with key-value specified in kwargs
.. seealso::
:meth:`get_attrs` for bulk get attributes
:meth:`get_attributes` for bulk get attributes
:param kwargs: the keyword arguments to set the values, where the keys are the fields to set
"""
Expand Down Expand Up @@ -854,15 +832,9 @@ def mime_type(self, value: str):
else:
self._pb_body.mime_type = value

def __enter__(self):
return self

def __eq__(self, other):
return self.proto == other.proto

def __exit__(self, exc_type, exc_val, exc_tb):
self.update_content_hash()

@property
def content_type(self) -> str:
"""Return the content type of the document, possible values: text, blob, buffer
Expand Down Expand Up @@ -1331,8 +1303,10 @@ def attributes(
def __getattr__(self, item):
if hasattr(self._pb_body, item):
value = getattr(self._pb_body, item)
else:
elif '__' in item:
value = dunder_get(self._pb_body, item)
else:
raise AttributeError
return value

@cached_property
Expand Down
28 changes: 1 addition & 27 deletions jina/types/document/multimodal.py
Expand Up @@ -115,10 +115,7 @@ def modality_content_map(self, value: Dict[str, Any]):
:param value: map from modality to content
"""
for modality, content in value.items():
with Document() as chunk:
chunk.modality = modality
chunk.content = content
self.chunks.append(chunk)
self.chunks.append(Document(modality=modality, content=content))

def __getitem__(self, modality: str) -> DocumentContentType:
"""Extract content by the name of the modality.
Expand All @@ -138,26 +135,3 @@ def modalities(self) -> List[str]:
:return: List of modalities extracted from chunks of the document.
"""
return list(self.modality_content_map.keys())

def update_content_hash(
self,
fields: Tuple[str] = (
'text',
'blob',
'buffer',
'embedding',
'uri',
'tags',
'mime_type',
'granularity',
'adjacency',
'parent_id',
'chunks',
),
) -> None:
"""
Update content hash of the document by including ``chunks`` when computing the hash
:param fields: a tuple of field names that included when computing content hash
"""
super().update_content_hash(fields)
2 changes: 1 addition & 1 deletion jina/types/message/__init__.py
Expand Up @@ -239,7 +239,7 @@ def dump(self) -> List[bytes]:
r2 = self._compress(r2)

r0 = self.envelope.receiver_id.encode()
r1 = self.envelope.SerializeToString()
r1 = self.envelope.SerializePartialToString()
m = [r0, r1, r2]
self._size = sum(sys.getsizeof(r) for r in m)
return m
Expand Down
2 changes: 1 addition & 1 deletion jina/types/mixin.py
Expand Up @@ -44,7 +44,7 @@ def binary_str(self) -> bytes:
:return: binary string representation of the object
"""
return self._pb_body.SerializeToString()
return self._pb_body.SerializePartialToString()

@property
def nbytes(self) -> int:
Expand Down
2 changes: 1 addition & 1 deletion jina/types/request/__init__.py
Expand Up @@ -216,7 +216,7 @@ def SerializeToString(self) -> bytes:
:return: serialized request
"""
if self.is_decompressed:
return self.proto.SerializeToString()
return self.proto.SerializePartialToString()
else:
# no touch, skip serialization, return original
return self._buffer
Expand Down
2 changes: 0 additions & 2 deletions tests/__init__.py
Expand Up @@ -38,7 +38,6 @@ def random_docs(
d.embedding = np.random.random(
[embed_dim + np.random.randint(0, jitter)]
)
d.update_content_hash()

for _ in range(chunks_per_doc):
chunk_doc_id = next_chunk_doc_id
Expand All @@ -51,7 +50,6 @@ def random_docs(
)
c.tags['parent_id'] = doc_id
c.tags['id'] = chunk_doc_id
c.update_content_hash()
d.chunks.append(c)
next_chunk_doc_id += 1

Expand Down
Expand Up @@ -34,8 +34,7 @@ def validate_output(resp):
os.environ['JINA_INDEXER_NEEDS'] = indexer_needs
os.environ['JINA_INDEXER_METHOD'] = indexer_method

with Document() as doc:
doc.content = text
doc = Document(content=text)

mock = mocker.Mock()
with Flow.load_config(flow_yml) as f:
Expand Down
Expand Up @@ -31,8 +31,7 @@ def validate_output(resp):
os.environ['JINA_ENCODER_NEEDS'] = encoder_needs
os.environ['JINA_INDEXER_NEEDS'] = indexer_needs

with Document() as doc:
doc.content = text
doc = Document(content=text)

mock = mocker.Mock()
with Flow.load_config(flow_yml) as f:
Expand Down
Expand Up @@ -31,6 +31,7 @@
SHARDS = 3
EMB_SIZE = 10


# TODO: Move to JinaDClient


Expand Down Expand Up @@ -159,12 +160,12 @@ def _send_rest_request(

def _get_documents(nr=10, index_start=0, emb_size=7):
for i in range(index_start, nr + index_start):
with Document() as d:
d.id = i
d.text = f'hello world {i}'
d.embedding = np.random.random(emb_size)
d.tags['tag_field'] = f'tag data {i}'
yield d
yield Document(
id=i,
text=f'hello world {i}',
embedding=np.random.random(emb_size),
tags={'tag_field': f'tag data {i}'},
)


def _jinad_dump(pod_name, dump_path, shards, url):
Expand Down

0 comments on commit 390ff0a

Please sign in to comment.