Skip to content

Commit

Permalink
fix: fixed the int float conversion collision between hash and idy (#1034)
Browse files Browse the repository at this point in the history

* fix: fixed the int float conversion collision between hash and idy

* fix: updated jina hub submodule
  • Loading branch information
maximilianwerk committed Oct 6, 2020
1 parent 112b4d2 commit 3418301
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 27 deletions.
38 changes: 31 additions & 7 deletions jina/drivers/rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from . import BaseExecutableDriver
from .helper import pb_obj2dict
from ..proto import uid
from ..executors.rankers import Chunk2DocRanker, Match2DocRanker

if False:
from ..proto import jina_pb2
Expand Down Expand Up @@ -67,14 +68,25 @@ def _apply_all(self, docs: Iterable['jina_pb2.Document'], context_doc: 'jina_pb2
match_chunk_meta = {} # type: Dict[int, Dict]
for c in docs:
for match in c.matches:
# print(f'{match.parent_id}, {match.id}, {c.id}')
match_idx.append(
(self.id2hash(match.parent_id), self.id2hash(match.id), self.id2hash(c.id), match.score.value))
(self.id2hash(match.parent_id),
self.id2hash(match.id),
self.id2hash(c.id),
match.score.value)
)
query_chunk_meta[self.id2hash(c.id)] = pb_obj2dict(c, self.exec.required_keys)
match_chunk_meta[self.id2hash(match.id)] = pb_obj2dict(match, self.exec.required_keys)

if match_idx:
match_idx = np.array(match_idx, dtype=np.float64)
match_idx = np.array(
match_idx,
dtype=[
(Chunk2DocRanker.COL_MATCH_PARENT_HASH, np.int64),
(Chunk2DocRanker.COL_MATCH_HASH, np.int64),
(Chunk2DocRanker.COL_DOC_CHUNK_HASH, np.int64),
(Chunk2DocRanker.COL_SCORE, np.float64)
]
)

docs_scores = self.exec_fn(match_idx, query_chunk_meta, match_chunk_meta)
for doc_hash, score in docs_scores:
Expand Down Expand Up @@ -132,13 +144,24 @@ def _apply_all(self, docs: Iterable['jina_pb2.Document'], context_doc: 'jina_pb2
# doc_id_to_match_map = {}
for match in docs:
# doc_id_to_match_map[match.id] = index
match_idx.append((self.id2hash(match.parent_id), self.id2hash(match.id), self.id2hash(context_doc.id),
match.score.value))
match_idx.append((
self.id2hash(match.parent_id),
self.id2hash(match.id),
self.id2hash(context_doc.id),
match.score.value
))
query_chunk_meta[self.id2hash(context_doc.id)] = pb_obj2dict(context_doc, self.exec.required_keys)
match_chunk_meta[self.id2hash(match.id)] = pb_obj2dict(match, self.exec.required_keys)

if match_idx:
match_idx = np.array(match_idx, dtype=np.float64)
match_idx = np.array(match_idx,
dtype=[
(Chunk2DocRanker.COL_MATCH_PARENT_HASH, np.int64),
(Chunk2DocRanker.COL_MATCH_HASH, np.int64),
(Chunk2DocRanker.COL_DOC_CHUNK_HASH, np.int64),
(Chunk2DocRanker.COL_SCORE, np.float64)
]
)

docs_scores = self.exec_fn(match_idx, query_chunk_meta, match_chunk_meta)
# These ranker will change the current matches
Expand Down Expand Up @@ -195,4 +218,5 @@ def _sort_matches_in_place(self, context_doc: 'jina_pb2.Document', match_scores:
new_match.score.op_name = exec.__class__.__name__

def _sort(self, docs_scores: 'np.ndarray') -> 'np.ndarray':
return docs_scores[docs_scores[:, -1].argsort()[::-1]]
return np.sort(docs_scores, order=Match2DocRanker.COL_SCORE)[::-1]

34 changes: 18 additions & 16 deletions jina/executors/rankers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,10 @@ class Chunk2DocRanker(BaseRanker):
"""

required_keys = {'text'} #: a set of ``str``, key-values to extracted from the chunk-level protobuf message

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.col_doc_id = 0
self.col_chunk_id = 1
self.col_query_chunk_id = 2
self.col_score = 3
COL_MATCH_PARENT_HASH = 'match_parent_hash'
COL_MATCH_HASH = 'match_hash'
COL_DOC_CHUNK_HASH = 'doc_chunk_hash'
COL_SCORE = 'score'

def score(self, match_idx: 'np.ndarray', query_chunk_meta: Dict, match_chunk_meta: Dict) -> 'np.ndarray':
"""Translate the chunk-level top-k results into doc-level top-k results. Some score functions may leverage the
Expand Down Expand Up @@ -69,13 +66,13 @@ def group_by_doc_id(self, match_idx):
Group the ``match_idx`` by ``doc_id``
:return: an iterator over the groups
"""
return self._group_by(match_idx, self.col_doc_id)
return self._group_by(match_idx, self.COL_MATCH_PARENT_HASH)

@staticmethod
def _group_by(match_idx, col):
# sort by ``col``
_sorted_m = match_idx[match_idx[:, col].argsort()]
_, _doc_counts = np.unique(_sorted_m[:, col], return_counts=True)
def _group_by(match_idx, col_name):
        # sort by ``col_name``
_sorted_m = np.sort(match_idx, order=col_name)
_, _doc_counts = np.unique(_sorted_m[col_name], return_counts=True)
# group by ``col``
return np.split(_sorted_m, np.cumsum(_doc_counts))[:-1]

Expand All @@ -88,12 +85,14 @@ def sort_doc_by_score(r):
Sort a list of (``doc_id``, ``score``) tuples by the ``score``.
:return: an `np.ndarray` in the shape of [N x 2], where `N` in the length of the input list.
"""
r = np.array(r, dtype=np.float64)
r = r[r[:, -1].argsort()[::-1]]
return r
r = np.array(r, dtype=[
(Chunk2DocRanker.COL_MATCH_PARENT_HASH, np.int64),
(Chunk2DocRanker.COL_SCORE, np.float64)]
)
return np.sort(r, order=Chunk2DocRanker.COL_SCORE)[::-1]

def get_doc_id(self, match_with_same_doc_id):
return match_with_same_doc_id[0, self.col_doc_id]
return match_with_same_doc_id[0][self.COL_MATCH_PARENT_HASH]


class Match2DocRanker(BaseRanker):
Expand All @@ -106,6 +105,9 @@ class Match2DocRanker(BaseRanker):
- BucketShuffleRanker (first buckets matches and then sort each bucket)
"""

COL_MATCH_HASH = 'match_hash'
COL_SCORE = 'score'

def score(self, query_meta: Dict, old_match_scores: Dict, match_meta: Dict) -> 'np.ndarray':
"""
This function calculated the new scores for matches and returns them.
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/drivers/test_chunk2doc_rank_drivers.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def __init__(self, *args, **kwargs):
self.required_keys = {'length'}

def _get_score(self, match_idx, query_chunk_meta, match_chunk_meta, *args, **kwargs):
return match_idx[0][self.col_doc_id], match_chunk_meta[match_idx[0][self.col_chunk_id]]['length']
return match_idx[0][self.COL_MATCH_PARENT_HASH], match_chunk_meta[match_idx[0][self.COL_MATCH_HASH]]['length']


class SimpleChunk2DocRankDriver(Chunk2DocRankDriver):
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/drivers/test_collect_matches_rank_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def __init__(self, *args, **kwargs):
self.required_keys = {'length'}

def _get_score(self, match_idx, query_chunk_meta, match_chunk_meta, *args, **kwargs):
return match_idx[0][self.col_doc_id], match_chunk_meta[match_idx[0][self.col_chunk_id]]['length']
return match_idx[0][self.COL_MATCH_PARENT_HASH], match_chunk_meta[match_idx[0][self.COL_MATCH_HASH]]['length']


def create_document_to_score_same_depth_level():
Expand Down
6 changes: 5 additions & 1 deletion tests/unit/drivers/test_matches2doc_rank_drivers.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@ def score(self, query_meta, old_match_scores, match_meta):
(match_id, - abs(match_meta[match_id]['length'] - query_meta['length']))
for match_id, old_score in old_match_scores.items()
]
return np.array(new_scores, dtype=np.float64)

return np.array(
new_scores,
dtype=[(self.COL_MATCH_HASH, np.int64), (self.COL_SCORE, np.float64)],
)


def create_document_to_score():
Expand Down

0 comments on commit 3418301

Please sign in to comment.