refactor(proto): remove content_hash from builtin proto (#3058)

* refactor(proto): remove content_hash from builtin proto * refactor(proto): remove content_hash from builtin proto * refactor(proto): remove content_hash from builtin proto * refactor(proto): remove content_hash from builtin proto * refactor(proto): remove content_hash from builtin proto * refactor(proto): remove content_hash from builtin proto * perf(proto): replace SerializeToString with SerializePartial * perf(proto): replace SerializeToString with SerializePartial
jina-ai · Jul 31, 2021 · 390ff0a · 390ff0a
1 parent b205a19
commit 390ff0a
Show file tree

Hide file tree

Showing 26 changed files with 160 additions and 273 deletions.
diff --git a/jina/clients/base/websocket.py b/jina/clients/base/websocket.py
@@ -55,7 +55,7 @@ async def _get_results(
                 async def _send_requests(request_iterator):
                     next_request = None
                     for next_request in request_iterator:
-                        await websocket.send(next_request.SerializeToString())
+                        await websocket.send(next_request.SerializePartialToString())
                         self.num_requests += 1
                     # Check if there was any request generated
                     if next_request is not None:

diff --git a/jina/clients/request/helper.py b/jina/clients/request/helper.py
@@ -42,9 +42,7 @@ def _new_doc_from_data(
     data, data_type: DataInputType, **kwargs
 ) -> Tuple['Document', 'DataInputType']:
     def _build_doc_from_content():
-        with Document(**kwargs) as d:
-            d.content = data
-        return d, DataInputType.CONTENT
+        return Document(content=data, **kwargs), DataInputType.CONTENT
 
     if data_type == DataInputType.AUTO or data_type == DataInputType.DOCUMENT:
         if isinstance(data, Document):

diff --git a/jina/proto/jina.proto b/jina/proto/jina.proto
@@ -84,9 +84,6 @@ message DocumentProto {
     // A hexdigest that represents a unique document ID
     string id = 1;
 
-    // A hexdigest that represents the hash of the content of the document
-    string content_hash = 24;
-
     // the depth of the recursive chunk structure
     uint32 granularity = 14;
 

diff --git a/jina/proto/jina_pb2.py b/jina/proto/jina_pb2.py
diff --git a/jina/proto/serializer.py b/jina/proto/serializer.py
@@ -15,7 +15,7 @@ def SerializeToString(x: 'Request'):
         # noqa: DAR102
         # noqa: DAR201
         """
-        return x.proto.SerializeToString()
+        return x.proto.SerializePartialToString()
 
     @staticmethod
     def FromString(x: bytes):

diff --git a/jina/types/arrays/chunk.py b/jina/types/arrays/chunk.py
@@ -25,11 +25,11 @@ def __init__(self, doc_views, reference_doc: 'Document'):
         self._ref_doc = reference_doc
         super().__init__(doc_views)
 
-    def append(self, document: 'Document', **kwargs) -> 'Document':
+    def append(self, document: 'Document', copy: bool = True, **kwargs) -> 'Document':
         """Add a sub-document (i.e chunk) to the current Document.
 
         :param document: Sub-document to be appended
-        :type document: :class:`Document`
+        :param copy: If set, then copy the original Document. Otherwise the original Document may get modified
         :param kwargs: additional keyword arguments
         :return: the newly added sub-document in :class:`Document` view
         :rtype: :class:`Document`
@@ -39,17 +39,20 @@ def append(self, document: 'Document', **kwargs) -> 'Document':
             make sure the added chunk is legit.
         """
 
-        from ..document import Document
+        if copy:
+            from ..document import Document
 
-        chunk = Document(document, copy=True)
+            chunk = Document(document, copy=True)
+        else:
+            # note: this is faster than Document(document, copy=False)
+            chunk = document
 
         chunk.set_attributes(
             parent_id=self._ref_doc.id, granularity=self.granularity, **kwargs
         )
 
         if not chunk.mime_type:
             chunk.mime_type = self._ref_doc.mime_type
-        chunk.update_content_hash()
         super().append(chunk)
         return chunk
 

diff --git a/jina/types/arrays/document.py b/jina/types/arrays/document.py
@@ -201,7 +201,7 @@ def __iter__(self) -> Iterator['Document']:
         from ..document import Document
 
         for d in self._pb_body:
-            yield Document(d, hash_content=False)
+            yield Document(d)
 
     def __contains__(self, item: str):
         return item in self._id_to_index
@@ -210,7 +210,7 @@ def __getitem__(self, item: Union[int, str, slice]):
         from ..document import Document
 
         if isinstance(item, int):
-            return Document(self._pb_body[item], hash_content=False)
+            return Document(self._pb_body[item])
         elif isinstance(item, str):
             return self[self._id_to_index[item]]
         elif isinstance(item, slice):
@@ -386,7 +386,7 @@ def save_binary(self, file: Union[str, BinaryIO]) -> None:
             dap = jina_pb2.DocumentArrayProto()
             if self._pb_body:
                 dap.docs.extend(self._pb_body)
-            fp.write(dap.SerializeToString())
+            fp.write(dap.SerializePartialToString())
 
     def save_json(self, file: Union[str, TextIO]) -> None:
         """Save array elements into a JSON file.

diff --git a/jina/types/arrays/match.py b/jina/types/arrays/match.py
@@ -31,13 +31,12 @@ def append(self, document: 'Document', copy: bool = True, **kwargs) -> 'Document
 
             match = Document(document, copy=True)
         else:
+            # note: this is faster than Document(document, copy=False)
             match = document
 
         match.set_attributes(
             granularity=self.granularity, adjacency=self.adjacency, **kwargs
         )
-        for score in match.scores.values():
-            score.ref_id = self._ref_doc.id
 
         super().append(match)
         return match

diff --git a/jina/types/document/__init__.py b/jina/types/document/__init__.py
@@ -109,16 +109,6 @@ class Document(ProtoTypeMixin):
     Jina requires each Document to have a string id. You can set a custom one,
     or if non has been set a random one will be assigned.
 
-    Or you can use :class:`Document` as a context manager:
-
-        .. highlight:: python
-        .. code-block:: python
-
-            with Document() as d:
-                d.text = 'hello'
-
-            assert d.id  # now `id` has value
-
     To access and modify the content of the document, you can use :attr:`text`, :attr:`blob`, and :attr:`buffer`.
     Each property is implemented with proper setter, to improve the integrity and user experience. For example,
     assigning ``doc.blob`` or ``doc.embedding`` can be simply done via:
@@ -157,7 +147,6 @@ def __init__(
         document: Optional[DocumentSourceType] = None,
         field_resolver: Dict[str, str] = None,
         copy: bool = False,
-        hash_content: bool = True,
         **kwargs,
     ):
         """
@@ -173,7 +162,6 @@ def __init__(
                 names defined in Protobuf. This is only used when the given ``document`` is
                 a JSON string or a Python dict.
         :param kwargs: other parameters to be set _after_ the document is constructed
-        :param hash_content: whether to hash the content of the Document
 
         .. note::
 
@@ -291,8 +279,6 @@ def _update_doc(d: Dict):
                 f'Document content fields are mutually exclusive, please provide only one of {_all_doc_content_keys}'
             )
         self.set_attributes(**kwargs)
-        if hash_content and not copy:
-            self.update_content_hash()
 
     def pop(self, *fields) -> None:
         """Remove the values from the given fields of this Document.
@@ -336,14 +322,6 @@ def modality(self, value: str):
         """
         self._pb_body.modality = value
 
-    @property
-    def content_hash(self):
-        """Get the content hash of the document.
-
-        :return: the content_hash from the proto
-        """
-        return self._pb_body.content_hash
-
     @property
     def tags(self) -> StructView:
         """Return the `tags` field of this Document as a Python dict
@@ -401,7 +379,7 @@ def _update(
                 destination._pb_body.ClearField(field)
                 try:
                     setattr(destination, field, getattr(source, field))
-                except AttributeError:  # some fields such as `content_hash` do not have a setter method.
+                except AttributeError:
                     setattr(destination._pb_body, field, getattr(source, field))
 
     def update(
@@ -426,9 +404,14 @@ def update(
             fields=fields,
         )
 
-    def update_content_hash(
-        self,
-        fields: Tuple[str] = (
+    @property
+    def content_hash(self) -> str:
+        """Get the document hash according to its content.
+
+        :return: the unique hash code to represent this Document
+        """
+        # a tuple of field names that inclusive when computing content hash.
+        fields = (
             'text',
             'blob',
             'buffer',
@@ -438,19 +421,14 @@ def update_content_hash(
             'mime_type',
             'granularity',
             'adjacency',
-        ),
-    ) -> None:
-        """Update the document hash according to its content.
-
-        :param fields: a tuple of field names that inclusive when computing content hash.
-        """
+        )
         masked_d = jina_pb2.DocumentProto()
         present_fields = {
             field_descriptor.name for field_descriptor, _ in self._pb_body.ListFields()
         }
         fields_to_hash = present_fields.intersection(fields)
         FieldMask(paths=fields_to_hash).MergeMessage(self._pb_body, masked_d)
-        self._pb_body.content_hash = blake2b(
+        return blake2b(
             masked_d.SerializePartialToString(), digest_size=DIGEST_SIZE
         ).hexdigest()
 
@@ -694,7 +672,7 @@ def set_attributes(self, **kwargs):
         """Bulk update Document fields with key-value specified in kwargs
 
         .. seealso::
-            :meth:`get_attrs` for bulk get attributes
+            :meth:`get_attributes` for bulk get attributes
 
         :param kwargs: the keyword arguments to set the values, where the keys are the fields to set
         """
@@ -854,15 +832,9 @@ def mime_type(self, value: str):
             else:
                 self._pb_body.mime_type = value
 
-    def __enter__(self):
-        return self
-
     def __eq__(self, other):
         return self.proto == other.proto
 
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.update_content_hash()
-
     @property
     def content_type(self) -> str:
         """Return the content type of the document, possible values: text, blob, buffer
@@ -1331,8 +1303,10 @@ def attributes(
     def __getattr__(self, item):
         if hasattr(self._pb_body, item):
             value = getattr(self._pb_body, item)
-        else:
+        elif '__' in item:
             value = dunder_get(self._pb_body, item)
+        else:
+            raise AttributeError
         return value
 
     @cached_property

diff --git a/jina/types/document/multimodal.py b/jina/types/document/multimodal.py
@@ -115,10 +115,7 @@ def modality_content_map(self, value: Dict[str, Any]):
         :param value: map from modality to content
         """
         for modality, content in value.items():
-            with Document() as chunk:
-                chunk.modality = modality
-                chunk.content = content
-                self.chunks.append(chunk)
+            self.chunks.append(Document(modality=modality, content=content))
 
     def __getitem__(self, modality: str) -> DocumentContentType:
         """Extract content by the name of the modality.
@@ -138,26 +135,3 @@ def modalities(self) -> List[str]:
         :return: List of modalities extracted from chunks of the document.
         """
         return list(self.modality_content_map.keys())
-
-    def update_content_hash(
-        self,
-        fields: Tuple[str] = (
-            'text',
-            'blob',
-            'buffer',
-            'embedding',
-            'uri',
-            'tags',
-            'mime_type',
-            'granularity',
-            'adjacency',
-            'parent_id',
-            'chunks',
-        ),
-    ) -> None:
-        """
-        Update content hash of the document by including ``chunks`` when computing the hash
-
-        :param fields: a tuple of field names that included when computing content hash
-        """
-        super().update_content_hash(fields)
diff --git a/jina/types/message/__init__.py b/jina/types/message/__init__.py
@@ -239,7 +239,7 @@ def dump(self) -> List[bytes]:
         r2 = self._compress(r2)
 
         r0 = self.envelope.receiver_id.encode()
-        r1 = self.envelope.SerializeToString()
+        r1 = self.envelope.SerializePartialToString()
         m = [r0, r1, r2]
         self._size = sum(sys.getsizeof(r) for r in m)
         return m

diff --git a/jina/types/mixin.py b/jina/types/mixin.py
@@ -44,7 +44,7 @@ def binary_str(self) -> bytes:
 
         :return: binary string representation of the object
         """
-        return self._pb_body.SerializeToString()
+        return self._pb_body.SerializePartialToString()
 
     @property
     def nbytes(self) -> int:

diff --git a/jina/types/request/__init__.py b/jina/types/request/__init__.py
@@ -216,7 +216,7 @@ def SerializeToString(self) -> bytes:
         :return: serialized request
         """
         if self.is_decompressed:
-            return self.proto.SerializeToString()
+            return self.proto.SerializePartialToString()
         else:
             # no touch, skip serialization, return original
             return self._buffer

diff --git a/tests/__init__.py b/tests/__init__.py
@@ -38,7 +38,6 @@ def random_docs(
                 d.embedding = np.random.random(
                     [embed_dim + np.random.randint(0, jitter)]
                 )
-        d.update_content_hash()
 
         for _ in range(chunks_per_doc):
             chunk_doc_id = next_chunk_doc_id
@@ -51,7 +50,6 @@ def random_docs(
                 )
             c.tags['parent_id'] = doc_id
             c.tags['id'] = chunk_doc_id
-            c.update_content_hash()
             d.chunks.append(c)
             next_chunk_doc_id += 1
 

diff --git a/tests/distributed/test_local_flow_local_remote_local/test_integration.py b/tests/distributed/test_local_flow_local_remote_local/test_integration.py
@@ -34,8 +34,7 @@ def validate_output(resp):
     os.environ['JINA_INDEXER_NEEDS'] = indexer_needs
     os.environ['JINA_INDEXER_METHOD'] = indexer_method
 
-    with Document() as doc:
-        doc.content = text
+    doc = Document(content=text)
 
     mock = mocker.Mock()
     with Flow.load_config(flow_yml) as f:

diff --git a/tests/distributed/test_local_flow_remote_local_remote/test_integration.py b/tests/distributed/test_local_flow_remote_local_remote/test_integration.py
@@ -31,8 +31,7 @@ def validate_output(resp):
     os.environ['JINA_ENCODER_NEEDS'] = encoder_needs
     os.environ['JINA_INDEXER_NEEDS'] = indexer_needs
 
-    with Document() as doc:
-        doc.content = text
+    doc = Document(content=text)
 
     mock = mocker.Mock()
     with Flow.load_config(flow_yml) as f:

diff --git a/tests/distributed/test_remote_flow_dump_rolling_update/test_dump_dbms_remote.py b/tests/distributed/test_remote_flow_dump_rolling_update/test_dump_dbms_remote.py
@@ -31,6 +31,7 @@
 SHARDS = 3
 EMB_SIZE = 10
 
+
 # TODO: Move to JinaDClient
 
 
@@ -159,12 +160,12 @@ def _send_rest_request(
 
 def _get_documents(nr=10, index_start=0, emb_size=7):
     for i in range(index_start, nr + index_start):
-        with Document() as d:
-            d.id = i
-            d.text = f'hello world {i}'
-            d.embedding = np.random.random(emb_size)
-            d.tags['tag_field'] = f'tag data {i}'
-        yield d
+        yield Document(
+            id=i,
+            text=f'hello world {i}',
+            embedding=np.random.random(emb_size),
+            tags={'tag_field': f'tag data {i}'},
+        )
 
 
 def _jinad_dump(pod_name, dump_path, shards, url):