⚠️ Refactor rename length attribute to siblings - breaking change (#2198)

* refactor: rename length attribute to siblings
* refactor: delete length from document
* fix: black exclude reformat
* fix: black exclude reformat
* fix: black exclude reformat
jina-ai · Mar 19, 2021 · 762f619 · 762f619
1 parent f7e3355
commit 762f619
Show file tree

Hide file tree

Showing 14 changed files with 1,438 additions and 2,780 deletions.
diff --git a/jina/__init__.py b/jina/__init__.py
@@ -60,7 +60,7 @@
 
 # do not change this line manually
 # this is managed by proto/build-proto.sh and updated on every execution
-__proto_version__ = '0.0.78'
+__proto_version__ = '0.0.79'
 
 __uptime__ = _datetime.datetime.now().isoformat()
 
@@ -143,9 +143,12 @@
 
 def _set_nofile(nofile_atleast=4096):
     """
-    sets nofile soft limit to at least 4096, useful for running matlplotlib/seaborn on
+    Set nofile soft limit to at least 4096, useful for running matlplotlib/seaborn on
     parallel executing plot generators vs. Ubuntu default ulimit -n 1024 or OS X El Captian 256
     temporary setting extinguishing with Python session.
+
+    :param nofile_atleast: nofile soft limit
+    :return: nofile soft limit and nofile hard limit
     """
 
     try:

diff --git a/jina/clients/request/__init__.py b/jina/clients/request/__init__.py
@@ -48,7 +48,7 @@ def request_generator(
     :param kwargs: additional arguments
     :yield: request
     """
-    _kwargs = dict(mime_type=mime_type, length=request_size, weight=1.0)
+    _kwargs = dict(mime_type=mime_type, weight=1.0)
 
     try:
         if not isinstance(data, Iterable):

diff --git a/jina/clients/request/asyncio.py b/jina/clients/request/asyncio.py
@@ -36,7 +36,7 @@ async def request_generator(
     :param kwargs: additional key word arguments
     :yield: request
     """
-    _kwargs = dict(mime_type=mime_type, length=request_size, weight=1.0)
+    _kwargs = dict(mime_type=mime_type, weight=1.0)
 
     try:
         with ImportExtensions(required=True):

diff --git a/jina/drivers/segment.py b/jina/drivers/segment.py
@@ -54,9 +54,8 @@ def _apply_all(self, docs: 'DocumentSet', *args, **kwargs):
 
     @staticmethod
     def _add_chunks(doc, chunks):
-        num_siblings = len(chunks)
         for chunk in chunks:
-            with Document(length=num_siblings, **chunk) as c:
+            with Document(**chunk) as c:
                 if not c.mime_type:
                     c.mime_type = doc.mime_type
             doc.chunks.append(c)
diff --git a/jina/proto/jina.proto b/jina/proto/jina.proto
@@ -106,9 +106,6 @@ message DocumentProto {
     // the weight of this document
     float weight = 5;
 
-    // total number of siblings of this document (docs that are in the same granularity and parent_id)
-    uint32 length = 6;
-
     // the top-k matched Docs on the same level (recursive structure)
     repeated DocumentProto matches = 8;
 

diff --git a/jina/proto/jina_pb2.py b/jina/proto/jina_pb2.py
diff --git a/jina/proto/jina_pb2_grpc.py b/jina/proto/jina_pb2_grpc.py
@@ -17,10 +17,10 @@ def __init__(self, channel):
             channel: A grpc.Channel.
         """
         self.Call = channel.stream_stream(
-            '/jina.JinaRPC/Call',
-            request_serializer=jina__pb2.RequestProto.SerializeToString,
-            response_deserializer=jina__pb2.RequestProto.FromString,
-        )
+                '/jina.JinaRPC/Call',
+                request_serializer=jina__pb2.RequestProto.SerializeToString,
+                response_deserializer=jina__pb2.RequestProto.FromString,
+                )
 
 
 class JinaRPCServicer(object):
@@ -29,57 +29,45 @@ class JinaRPCServicer(object):
     """
 
     def Call(self, request_iterator, context):
-        """Pass in a Request and a filled Request with matches will be returned."""
+        """Pass in a Request and a filled Request with matches will be returned.
+        """
         context.set_code(grpc.StatusCode.UNIMPLEMENTED)
         context.set_details('Method not implemented!')
         raise NotImplementedError('Method not implemented!')
 
 
 def add_JinaRPCServicer_to_server(servicer, server):
     rpc_method_handlers = {
-        'Call': grpc.stream_stream_rpc_method_handler(
-            servicer.Call,
-            request_deserializer=jina__pb2.RequestProto.FromString,
-            response_serializer=jina__pb2.RequestProto.SerializeToString,
-        ),
+            'Call': grpc.stream_stream_rpc_method_handler(
+                    servicer.Call,
+                    request_deserializer=jina__pb2.RequestProto.FromString,
+                    response_serializer=jina__pb2.RequestProto.SerializeToString,
+            ),
     }
     generic_handler = grpc.method_handlers_generic_handler(
-        'jina.JinaRPC', rpc_method_handlers
-    )
+            'jina.JinaRPC', rpc_method_handlers)
     server.add_generic_rpc_handlers((generic_handler,))
 
 
-# This class is part of an EXPERIMENTAL API.
+ # This class is part of an EXPERIMENTAL API.
 class JinaRPC(object):
     """*
     jina gRPC service.
     """
 
     @staticmethod
-    def Call(
-        request_iterator,
-        target,
-        options=(),
-        channel_credentials=None,
-        call_credentials=None,
-        insecure=False,
-        compression=None,
-        wait_for_ready=None,
-        timeout=None,
-        metadata=None,
-    ):
-        return grpc.experimental.stream_stream(
-            request_iterator,
+    def Call(request_iterator,
             target,
-            '/jina.JinaRPC/Call',
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.stream_stream(request_iterator, target, '/jina.JinaRPC/Call',
             jina__pb2.RequestProto.SerializeToString,
             jina__pb2.RequestProto.FromString,
-            options,
-            channel_credentials,
-            insecure,
-            call_credentials,
-            compression,
-            wait_for_ready,
-            timeout,
-            metadata,
-        )
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
diff --git a/jina/types/document/__init__.py b/jina/types/document/__init__.py
@@ -204,24 +204,6 @@ def __init__(
         self.set_attrs(**kwargs)
         self._mermaid_id = random_identity()  #: for mermaid visualize id
 
-    @property
-    def length(self) -> int:
-        """
-        The number of siblings of the :class:``Document``
-
-        .. # noqa: DAR201
-        :getter: number of siblings
-        :setter: number of siblings
-        :type: int
-        """
-        # TODO(Han): rename this to siblings as this shadows the built-in `length`
-
-        return self._pb_body.length
-
-    @length.setter
-    def length(self, value: int):
-        self._pb_body.length = value
-
     @property
     def weight(self) -> float:
         """

diff --git a/tests/unit/clients/python/test_request.py b/tests/unit/clients/python/test_request.py
@@ -88,7 +88,6 @@ def random_lines(num_lines):
     request = next(req)
     assert len(request.index.docs) == 100
     for index, doc in enumerate(request.index.docs, 1):
-        assert doc.length == 100
         assert doc.mime_type == 'text/plain'
         assert doc.text == f'i\'m dummy doc {index}'
 
@@ -102,7 +101,6 @@ def random_lines(num_lines):
     request = next(req)
     assert len(request.index.docs) == 100
     for index, doc in enumerate(request.index.docs, 1):
-        assert doc.length == 100
         assert doc.mime_type == 'text/plain'
         assert doc.text == f'i\'m dummy doc {index}'
 
@@ -117,7 +115,6 @@ def random_lines(num_lines):
     request = next(req)
     assert len(request.index.docs) == 100
     for index, doc in enumerate(request.index.docs, 1):
-        assert doc.length == 100
         assert doc.mime_type == 'text/plain'
         assert doc.text == f'https://github.com i\'m dummy doc {index}'
 
@@ -132,7 +129,6 @@ def random_lines(num_lines):
     request = next(req)
     assert len(request.index.docs) == 100
     for index, doc in enumerate(request.index.docs, 1):
-        assert doc.length == 100
         assert doc.text == f'i\'m dummy doc {index}'
         assert doc.mime_type == 'text/plain'
 
@@ -152,7 +148,6 @@ def random_docs(num_docs):
     request = next(req)
     assert len(request.index.docs) == 100
     for index, doc in enumerate(request.index.docs, 1):
-        assert doc.length == 100
         assert doc.mime_type == 'mime_type'
         assert doc.text == f'i\'m dummy doc {index}'
         assert doc.offset == 1000
@@ -226,13 +221,11 @@ def test_request_generate_numpy_arrays():
     request = next(req)
     assert len(request.index.docs) == 5
     for index, doc in enumerate(request.index.docs, 1):
-        assert doc.length == 5
         assert NdArray(doc.blob).value.shape == (10,)
 
     request = next(req)
     assert len(request.index.docs) == 5
     for index, doc in enumerate(request.index.docs, 1):
-        assert doc.length == 5
         assert NdArray(doc.blob).value.shape == (10,)
 
 
@@ -248,11 +241,9 @@ def generator():
     request = next(req)
     assert len(request.index.docs) == 5
     for index, doc in enumerate(request.index.docs, 1):
-        assert doc.length == 5
         assert NdArray(doc.blob).value.shape == (10,)
 
     request = next(req)
     assert len(request.index.docs) == 5
     for index, doc in enumerate(request.index.docs, 1):
-        assert doc.length == 5
         assert NdArray(doc.blob).value.shape == (10,)
diff --git a/tests/unit/drivers/rank/aggregate/test_aggregate_matches_rank_driver.py b/tests/unit/drivers/rank/aggregate/test_aggregate_matches_rank_driver.py
@@ -50,14 +50,14 @@ def docs(self):
 class MockLengthRanker(Chunk2DocRanker):
     def __init__(self, *args, **kwargs):
         super().__init__(
-            query_required_keys=('length',),
-            match_required_keys=('length',),
+            query_required_keys=('weight',),
+            match_required_keys=('weight',),
             *args,
             **kwargs
         )
 
     def score(self, match_idx, query_chunk_meta, match_chunk_meta, *args, **kwargs):
-        return match_chunk_meta[match_idx[0][self.COL_DOC_CHUNK_ID]]['length']
+        return match_chunk_meta[match_idx[0][self.COL_DOC_CHUNK_ID]]['weight']
 
 
 def create_document_to_score_same_depth_level():
@@ -70,7 +70,7 @@ def create_document_to_score_same_depth_level():
     doc = Document()
     doc.id = 1
 
-    for match_id, parent_id, match_score, match_length in [
+    for match_id, parent_id, match_score, weight in [
         (2, 20, 30, 3),
         (3, 20, 40, 4),
         (4, 30, 20, 2),
@@ -79,7 +79,7 @@ def create_document_to_score_same_depth_level():
         match = Document()
         match.id = match_id
         match.parent_id = parent_id
-        match.length = match_length
+        match.weight = weight
         match.score = NamedScore(value=match_score, ref_id=doc.id)
         doc.matches.append(match)
     return doc
@@ -94,9 +94,9 @@ def test_collect_matches2doc_ranker_driver_mock_ranker():
     dm = list(doc.matches)
     assert len(dm) == 2
     assert dm[0].id == '20'
-    assert dm[0].score.value == 3
+    assert dm[0].score.value == 3.0
     assert dm[1].id == '30'
-    assert dm[1].score.value == 2
+    assert dm[1].score.value == 2.0
     for match in dm:
         # match score is computed w.r.t to doc.id
         assert match.score.ref_id == doc.id

diff --git a/tests/unit/drivers/rank/aggregate/test_chunk2doc_rank_drivers.py b/tests/unit/drivers/rank/aggregate/test_chunk2doc_rank_drivers.py
@@ -22,14 +22,14 @@ def score(self, match_idx, query_chunk_meta, match_chunk_meta, *args, **kwargs):
 class MockLengthRanker(Chunk2DocRanker):
     def __init__(self, *args, **kwargs):
         super().__init__(
-            query_required_keys=['length'],
-            match_required_keys=['length'],
+            query_required_keys=['weight'],
+            match_required_keys=['weight'],
             *args,
             **kwargs
         )
 
     def score(self, match_idx, query_chunk_meta, match_chunk_meta, *args, **kwargs):
-        return match_chunk_meta[match_idx[0][self.COL_DOC_CHUNK_ID]]['length']
+        return match_chunk_meta[match_idx[0][self.COL_DOC_CHUNK_ID]]['weight']
 
 
 class MockPriceDiscountRanker(Chunk2DocRanker):
@@ -84,11 +84,11 @@ def create_document_to_score():
             match.id = str(match_id)
             parent_id = 10 * int(match_id)
             match.parent_id = str(parent_id)
-            match.length = int(match_id)
             # to be used by MaxRanker and MinRanker
             match.score = NamedScore(value=int(match_id), ref_id=chunk.id)
             match.tags['price'] = match.score.value
             match.tags['discount'] = DISCOUNT_VAL
+            match.weight = 2 * int(chunk_id) + m
             chunk.matches.append(match)
         doc.chunks.append(chunk)
     return doc
@@ -117,7 +117,6 @@ def create_chunk_matches_to_score():
             match.parent_id = str(parent_id)
             match.score = NamedScore(value=score_value, ref_id=chunk.id)
             match.id = str(10 * int(parent_id) + score_value)
-            match.length = 4
             chunk.matches.append(match)
         doc.chunks.append(chunk)
     return doc
@@ -150,7 +149,6 @@ def create_chunk_chunk_matches_to_score():
             match.parent_id = str(parent_id)
             match.score = NamedScore(value=score_value, ref_id=chunk_chunk.id)
             match.id = str(10 * parent_id + score_value)
-            match.length = 4
             chunk_chunk.matches.append(match)
         chunk.chunks.append(chunk_chunk)
     doc.chunks.append(chunk)