Skip to content

Commit

Permalink
fix: back to old nomenclature
Browse files Browse the repository at this point in the history
  • Loading branch information
JoanFM committed Aug 24, 2020
1 parent 2b6f9f1 commit 128b27b
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 19 deletions.
1 change: 0 additions & 1 deletion jina/drivers/rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ def _apply_all(self, docs: Iterable['jina_pb2.Document'], context_doc: 'jina_pb2

# np.uint32 uses 32 bits. np.float32 has only a 24-bit significand (23 stored mantissa bits), so integers greater
# than 2^24 will have their least significant bits truncated.

if match_idx:
match_idx = np.array(match_idx, dtype=np.float64)

Expand Down
34 changes: 17 additions & 17 deletions jina/executors/rankers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,42 +34,42 @@ class Chunk2DocRanker(BaseRanker):

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.col_parent_id = 0
self.col_doc_id = 1
self.col_query_doc_id = 2
self.col_doc_id = 0
self.col_chunk_id = 1
self.col_query_chunk_id = 2
self.col_score = 3

def score(self, match_idx: 'np.ndarray', query_doc_meta: Dict, match_doc_meta: Dict) -> 'np.ndarray':
def score(self, match_idx: 'np.ndarray', query_chunk_meta: Dict, match_chunk_meta: Dict) -> 'np.ndarray':
"""Translate the chunk-level top-k results into doc-level top-k results. Some score functions may leverage the
meta information of the query, hence the meta info of the query chunks and matched chunks are given
as arguments.
:param match_idx: a [N x 4] numpy ``ndarray``, column-wise:
- ``match_idx[:, 0]``: ``parent_id`` of the matched documents, integer
- ``match_idx[:, 1]``: ``doc_id`` of the matched documents, integer
- ``match_idx[:, 2]``: ``doc_id`` of the query documents, integer
- ``match_idx[:, 0]``: ``doc_id`` of the matched chunks, integer
- ``match_idx[:, 1]``: ``chunk_id`` of the matched chunks, integer
- ``match_idx[:, 2]``: ``chunk_id`` of the query chunks, integer
- ``match_idx[:, 3]``: distance/metric/score between the query and matched chunks, float
:param query_doc_meta: the meta information of the query documents, where the key is query documents' ``doc_id``,
:param query_chunk_meta: the meta information of the query chunks, where the key is query chunks' ``chunk_id``,
the value is extracted by the ``required_keys``.
:param match_doc_meta: the meta information of the matched docs, where the key is matched docs'
``doc_id``, the value is extracted by the ``required_keys``.
:param match_chunk_meta: the meta information of the matched chunks, where the key is matched chunks'
``chunk_id``, the value is extracted by the ``required_keys``.
:return: a [N x 2] numpy ``ndarray``, where the first column is the matched documents' ``doc_id`` (integer) and
the second column is the score/distance/metric between the matched doc and the query doc (float).
"""
_groups = self.group_by_parent_id(match_idx)
_groups = self.group_by_doc_id(match_idx)
r = []
for _g in _groups:
_doc_id, _doc_score = self._get_score(_g, query_doc_meta, match_doc_meta)
_doc_id, _doc_score = self._get_score(_g, query_chunk_meta, match_chunk_meta)
r.append((_doc_id, _doc_score))
return self.sort_doc_by_score(r)

def group_by_parent_id(self, match_idx):
def group_by_doc_id(self, match_idx):
"""
Group the ``match_idx`` by ``doc_id``
:return: an iterator over the groups
"""
return self._group_by(match_idx, self.col_parent_id)
return self._group_by(match_idx, self.col_doc_id)

@staticmethod
def _group_by(match_idx, col):
Expand All @@ -79,7 +79,7 @@ def _group_by(match_idx, col):
# group by ``col``
return np.split(_sorted_m, np.cumsum(_doc_counts))[:-1]

def _get_score(self, match_idx, query_doc_meta, match_doc_meta, *args, **kwargs):
def _get_score(self, match_idx, query_chunk_meta, match_chunk_meta, *args, **kwargs):
raise NotImplementedError

@staticmethod
Expand All @@ -92,5 +92,5 @@ def sort_doc_by_score(r):
r = r[r[:, -1].argsort()[::-1]]
return r

def get_doc_id(self, match_with_same_parent_id):
return match_with_same_parent_id[0, self.col_parent_id]
def get_doc_id(self, match_with_same_doc_id):
return match_with_same_doc_id[0, self.col_doc_id]
2 changes: 1 addition & 1 deletion tests/unit/drivers/test_chunk2doc_rank_drivers.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def __init__(self, *args, **kwargs):
self.required_keys = {'length'}

def _get_score(self, match_idx, query_chunk_meta, match_chunk_meta, *args, **kwargs):
return match_idx[0][self.col_parent_id], match_chunk_meta[match_idx[0][self.col_doc_id]]['length']
return match_idx[0][self.col_doc_id], match_chunk_meta[match_idx[0][self.col_chunk_id]]['length']


class SimpleChunk2DocRankDriver(Chunk2DocRankDriver):
Expand Down

0 comments on commit 128b27b

Please sign in to comment.