Skip to content

Commit

Permalink
fix: fixed the int float conversion collision between hash and idy (#1034)
Browse files Browse the repository at this point in the history

* fix: fixed the int float conversion collision between hash and idy

* fix: updated jina hub submodule
  • Loading branch information
maximilianwerk committed Oct 6, 2020
1 parent 112b4d2 commit 3418301
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 27 deletions.
38 changes: 31 additions & 7 deletions jina/drivers/rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from . import BaseExecutableDriver
from .helper import pb_obj2dict
from ..proto import uid
from ..executors.rankers import Chunk2DocRanker, Match2DocRanker

if False:
from ..proto import jina_pb2
Expand Down Expand Up @@ -67,14 +68,25 @@ def _apply_all(self, docs: Iterable['jina_pb2.Document'], context_doc: 'jina_pb2
match_chunk_meta = {} # type: Dict[int, Dict]
for c in docs:
for match in c.matches:
# print(f'{match.parent_id}, {match.id}, {c.id}')
match_idx.append(
(self.id2hash(match.parent_id), self.id2hash(match.id), self.id2hash(c.id), match.score.value))
(self.id2hash(match.parent_id),
self.id2hash(match.id),
self.id2hash(c.id),
match.score.value)
)
query_chunk_meta[self.id2hash(c.id)] = pb_obj2dict(c, self.exec.required_keys)
match_chunk_meta[self.id2hash(match.id)] = pb_obj2dict(match, self.exec.required_keys)

if match_idx:
match_idx = np.array(match_idx, dtype=np.float64)
match_idx = np.array(
match_idx,
dtype=[
(Chunk2DocRanker.COL_MATCH_PARENT_HASH, np.int64),
(Chunk2DocRanker.COL_MATCH_HASH, np.int64),
(Chunk2DocRanker.COL_DOC_CHUNK_HASH, np.int64),
(Chunk2DocRanker.COL_SCORE, np.float64)
]
)

docs_scores = self.exec_fn(match_idx, query_chunk_meta, match_chunk_meta)
for doc_hash, score in docs_scores:
Expand Down Expand Up @@ -132,13 +144,24 @@ def _apply_all(self, docs: Iterable['jina_pb2.Document'], context_doc: 'jina_pb2
# doc_id_to_match_map = {}
for match in docs:
# doc_id_to_match_map[match.id] = index
match_idx.append((self.id2hash(match.parent_id), self.id2hash(match.id), self.id2hash(context_doc.id),
match.score.value))
match_idx.append((
self.id2hash(match.parent_id),
self.id2hash(match.id),
self.id2hash(context_doc.id),
match.score.value
))
query_chunk_meta[self.id2hash(context_doc.id)] = pb_obj2dict(context_doc, self.exec.required_keys)
match_chunk_meta[self.id2hash(match.id)] = pb_obj2dict(match, self.exec.required_keys)

if match_idx:
match_idx = np.array(match_idx, dtype=np.float64)
match_idx = np.array(match_idx,
dtype=[
(Chunk2DocRanker.COL_MATCH_PARENT_HASH, np.int64),
(Chunk2DocRanker.COL_MATCH_HASH, np.int64),
(Chunk2DocRanker.COL_DOC_CHUNK_HASH, np.int64),
(Chunk2DocRanker.COL_SCORE, np.float64)
]
)

docs_scores = self.exec_fn(match_idx, query_chunk_meta, match_chunk_meta)
# These ranker will change the current matches
Expand Down Expand Up @@ -195,4 +218,5 @@ def _sort_matches_in_place(self, context_doc: 'jina_pb2.Document', match_scores:
new_match.score.op_name = exec.__class__.__name__

def _sort(self, docs_scores: 'np.ndarray') -> 'np.ndarray':
return docs_scores[docs_scores[:, -1].argsort()[::-1]]
return np.sort(docs_scores, order=Match2DocRanker.COL_SCORE)[::-1]

34 changes: 18 additions & 16 deletions jina/executors/rankers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,10 @@ class Chunk2DocRanker(BaseRanker):
"""

required_keys = {'text'} #: a set of ``str``, key-values to extracted from the chunk-level protobuf message

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.col_doc_id = 0
self.col_chunk_id = 1
self.col_query_chunk_id = 2
self.col_score = 3
COL_MATCH_PARENT_HASH = 'match_parent_hash'
COL_MATCH_HASH = 'match_hash'
COL_DOC_CHUNK_HASH = 'doc_chunk_hash'
COL_SCORE = 'score'

def score(self, match_idx: 'np.ndarray', query_chunk_meta: Dict, match_chunk_meta: Dict) -> 'np.ndarray':
"""Translate the chunk-level top-k results into doc-level top-k results. Some score functions may leverage the
Expand Down Expand Up @@ -69,13 +66,13 @@ def group_by_doc_id(self, match_idx):
Group the ``match_idx`` by ``doc_id``
:return: an iterator over the groups
"""
return self._group_by(match_idx, self.col_doc_id)
return self._group_by(match_idx, self.COL_MATCH_PARENT_HASH)

@staticmethod
def _group_by(match_idx, col):
# sort by ``col``
_sorted_m = match_idx[match_idx[:, col].argsort()]
_, _doc_counts = np.unique(_sorted_m[:, col], return_counts=True)
def _group_by(match_idx, col_name):
        # sort by ``col_name``
_sorted_m = np.sort(match_idx, order=col_name)
_, _doc_counts = np.unique(_sorted_m[col_name], return_counts=True)
# group by ``col``
return np.split(_sorted_m, np.cumsum(_doc_counts))[:-1]

Expand All @@ -88,12 +85,14 @@ def sort_doc_by_score(r):
Sort a list of (``doc_id``, ``score``) tuples by the ``score``.
:return: an `np.ndarray` in the shape of [N x 2], where `N` in the length of the input list.
"""
r = np.array(r, dtype=np.float64)
r = r[r[:, -1].argsort()[::-1]]
return r
r = np.array(r, dtype=[
(Chunk2DocRanker.COL_MATCH_PARENT_HASH, np.int64),
(Chunk2DocRanker.COL_SCORE, np.float64)]
)
return np.sort(r, order=Chunk2DocRanker.COL_SCORE)[::-1]

def get_doc_id(self, match_with_same_doc_id):
return match_with_same_doc_id[0, self.col_doc_id]
return match_with_same_doc_id[0][self.COL_MATCH_PARENT_HASH]


class Match2DocRanker(BaseRanker):
Expand All @@ -106,6 +105,9 @@ class Match2DocRanker(BaseRanker):
- BucketShuffleRanker (first buckets matches and then sort each bucket)
"""

COL_MATCH_HASH = 'match_hash'
COL_SCORE = 'score'

def score(self, query_meta: Dict, old_match_scores: Dict, match_meta: Dict) -> 'np.ndarray':
"""
This function calculated the new scores for matches and returns them.
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/drivers/test_chunk2doc_rank_drivers.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def __init__(self, *args, **kwargs):
self.required_keys = {'length'}

def _get_score(self, match_idx, query_chunk_meta, match_chunk_meta, *args, **kwargs):
return match_idx[0][self.col_doc_id], match_chunk_meta[match_idx[0][self.col_chunk_id]]['length']
return match_idx[0][self.COL_MATCH_PARENT_HASH], match_chunk_meta[match_idx[0][self.COL_MATCH_HASH]]['length']


class SimpleChunk2DocRankDriver(Chunk2DocRankDriver):
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/drivers/test_collect_matches_rank_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def __init__(self, *args, **kwargs):
self.required_keys = {'length'}

def _get_score(self, match_idx, query_chunk_meta, match_chunk_meta, *args, **kwargs):
return match_idx[0][self.col_doc_id], match_chunk_meta[match_idx[0][self.col_chunk_id]]['length']
return match_idx[0][self.COL_MATCH_PARENT_HASH], match_chunk_meta[match_idx[0][self.COL_MATCH_HASH]]['length']


def create_document_to_score_same_depth_level():
Expand Down
6 changes: 5 additions & 1 deletion tests/unit/drivers/test_matches2doc_rank_drivers.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@ def score(self, query_meta, old_match_scores, match_meta):
(match_id, - abs(match_meta[match_id]['length'] - query_meta['length']))
for match_id, old_score in old_match_scores.items()
]
return np.array(new_scores, dtype=np.float64)

return np.array(
new_scores,
dtype=[(self.COL_MATCH_HASH, np.int64), (self.COL_SCORE, np.float64)],
)


def create_document_to_score():
Expand Down

0 comments on commit 3418301

Please sign in to comment.