Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: remove unique id #1872

Merged
merged 9 commits into from
Feb 5, 2021
3 changes: 2 additions & 1 deletion jina/clients/request/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,4 +67,5 @@ def _add_docs_groundtruths(req, batch, data_type, _kwargs):


def _add_ids(req, batch):
req.ids.extend(batch)
string_ids = (str(doc_id) for doc_id in batch)
florian-hoenicke marked this conversation as resolved.
Show resolved Hide resolved
req.ids.extend(string_ids)
3 changes: 1 addition & 2 deletions jina/drivers/rank/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

from .. import BaseExecutableDriver
from ...types.document import Document
from ...types.document.uid import UniqueId
from ...types.score import NamedScore


Expand Down Expand Up @@ -66,7 +65,7 @@ def _sort_matches_in_place(self, context_doc: 'Document', match_scores: 'np.ndar
cm = context_doc.matches
cm.build()
for str_match_id, score in match_scores:
match_id = UniqueId(str_match_id)
match_id = str_match_id
florian-hoenicke marked this conversation as resolved.
Show resolved Hide resolved
cm[match_id].score = NamedScore(value=score, op_name=op_name, ref_id=context_doc.id)

cm.sort(key=lambda x: x.score.value, reverse=True)
7 changes: 3 additions & 4 deletions jina/drivers/rank/aggregate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from .. import BaseRankDriver

if False:
from ....types.document.uid import UniqueId
from ....types.sets import DocumentSet


Expand Down Expand Up @@ -105,9 +104,9 @@ def _apply_all(self, docs: 'DocumentSet',
:return:
"""

match_idx = [] # type: List[Tuple[UniqueId, UniqueId, UniqueId, float]]
query_meta = {} # type: Dict[UniqueId, Dict]
match_meta = {} # type: Dict[UniqueId, Dict]
match_idx = [] # type: List[Tuple[str, str, str, float]]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are these types assigned behind a comment?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but why are they comments in this case? They only reason to do that is if we want to support Python 2. Anyway, not a big deal

query_meta = {} # type: Dict[str, Dict]
match_meta = {} # type: Dict[str, Dict]
parent_id_chunk_id_map = defaultdict(list)
matches_by_id = defaultdict(Document)
for chunk in docs:
Expand Down
23 changes: 6 additions & 17 deletions jina/types/document/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from google.protobuf.field_mask_pb2 import FieldMask

from .converters import png_to_buffer, to_datauri, guess_mime
from .uid import DIGEST_SIZE, UniqueId
from ..mixin import ProtoTypeMixin
from ..ndarray.generic import NdArray
from ..score import NamedScore
Expand All @@ -24,6 +23,7 @@
from ...proto import jina_pb2

__all__ = ['Document', 'DocumentContentType', 'DocumentSourceType']
DIGEST_SIZE = 8

DocumentContentType = TypeVar('DocumentContentType', bytes, str, np.ndarray)
DocumentSourceType = TypeVar('DocumentSourceType',
Expand Down Expand Up @@ -226,24 +226,13 @@ def parent_id(self) -> str:

@id.setter
def id(self, value: Union[bytes, str, int]):
"""Set document id to a string value
"""Set document id to a string value.

.. note:

Customized ``id`` is acceptable as long as
- it only contains the symbols "0"–"9" to represent values 0 to 9,
and "A"–"F" (or alternatively "a"–"f").
- it has 16 chars described above.

:param value: restricted string value
:param value: id as bytes, int or str
:return:
"""
if isinstance(value, str):
self._pb_body.id = value
else:
warnings.warn(f'expecting a string as ID, receiving {type(value)}. '
f'Note this type will be deprecated soon', DeprecationWarning)
self._pb_body.id = UniqueId(value)
self._pb_body.id = str(value)


@parent_id.setter
def parent_id(self, value: Union[bytes, str, int]):
Expand All @@ -264,7 +253,7 @@ def parent_id(self, value: Union[bytes, str, int]):
else:
warnings.warn(f'expecting a string as ID, receiving {type(value)}. '
f'Note this type will be deprecated soon', DeprecationWarning)
self._pb_body.parent_id = UniqueId(value)
self._pb_body.parent_id = value

@property
def blob(self) -> 'np.ndarray':
Expand Down
95 changes: 0 additions & 95 deletions jina/types/document/uid.py

This file was deleted.

6 changes: 0 additions & 6 deletions jina/types/sets/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,8 @@ def insert(self, index: int, doc: 'Document') -> None:
self._docs_proto.insert(index, doc.proto)

def __setitem__(self, key, value: 'Document'):
from ..document.uid import UniqueId
if isinstance(key, int):
self._docs_proto[key].CopyFrom(value)
elif isinstance(key, UniqueId):
self._docs_map[str(key)].CopyFrom(value)
elif isinstance(key, str):
self._docs_map[key].CopyFrom(value)
else:
Expand All @@ -59,11 +56,8 @@ def __iter__(self):

def __getitem__(self, item):
from ..document import Document
from ..document.uid import UniqueId
if isinstance(item, int):
return Document(self._docs_proto[item])
elif isinstance(item, UniqueId):
return Document(self._docs_map[str(item)])
elif isinstance(item, str):
return Document(self._docs_map[item])
elif isinstance(item, slice):
Expand Down
17 changes: 12 additions & 5 deletions tests/integration/crud/simple/test_crud.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,19 +29,26 @@ def config(tmpdir):
def random_docs(start, end, embed_dim=10, jitter=1, has_content=True):
for j in range(start, end):
d = Document()
d.id = str(f'{j}' * 16)
d.id = j
if has_content:
d.tags['id'] = j
d.text = ''.join(random.choice(string.ascii_lowercase) for _ in range(10)).encode('utf8')
d.embedding = np.random.random([embed_dim + np.random.randint(0, jitter)])
yield d


def get_ids_to_delete(start, end, as_string):
if as_string:
return (str(idx) for idx in range(start, end))
return range(start, end)


def validate_index_size(num_indexed_docs, compound=False):
from jina.executors.compound import CompoundExecutor

if compound:
path = Path(CompoundExecutor.get_component_workspace_from_compound_workspace(os.environ['JINA_TOPK_DIR'], 'chunk_indexer', 0))
path = Path(CompoundExecutor.get_component_workspace_from_compound_workspace(os.environ['JINA_TOPK_DIR'],
'chunk_indexer', 0))
else:
path = Path(os.environ['JINA_TOPK_DIR'])
bin_files = list(path.glob('*.bin'))
Expand Down Expand Up @@ -96,8 +103,8 @@ def validate_results(resp):
mock.assert_called_once()


@pytest.mark.parametrize('has_content', [True, False])
def test_delete_kv(config, mocker, has_content):
@pytest.mark.parametrize('as_string', [True, False])
def test_delete_kv(config, mocker, as_string):
flow_file = 'flow_kv.yml'

def validate_result_factory(num_matches):
Expand All @@ -117,7 +124,7 @@ def validate_results(resp):
mock.assert_called_once()

with Flow.load_config(flow_file) as index_flow:
index_flow.delete(input_fn=[d.id for d in random_docs(0, 3, has_content=has_content)])
index_flow.delete(input_fn=get_ids_to_delete(0, 3, as_string))
validate_index_size(7)

mock = mocker.Mock()
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/drivers/test_cache_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from jina.executors import BaseExecutor
from jina.executors.indexers.cache import DocCache, ID_KEY, CONTENT_HASH_KEY
from jina.proto import jina_pb2
from jina.types.document import Document, UniqueId
from jina.types.document import Document
from tests import random_docs


Expand Down Expand Up @@ -146,7 +146,7 @@ def test_cache_content_driver_same_content(tmpdir, test_metas):
doc1.text = new_string
doc1.update_content_hash()
with BaseExecutor.load(filename) as executor:
executor.update([UniqueId(1)], [doc1.content_hash])
executor.update([1], [doc1.content_hash])

with BaseExecutor.load(filename) as executor:
assert executor.query(doc1.content_hash) is True
Expand Down
5 changes: 2 additions & 3 deletions tests/unit/drivers/test_concat_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

from jina import Document
from jina.flow import Flow
from jina.types.document.uid import UniqueId
from jina.types.ndarray.generic import NdArray

e1 = np.random.random([7])
Expand All @@ -18,13 +17,13 @@ def input_fn():
doc1.embedding = e1
with Document() as chunk1:
chunk1.embedding = e2
chunk1.id = UniqueId(1)
chunk1.id = 1
doc1.chunks.add(chunk1)
with Document() as doc2:
doc2.embedding = e3
with Document() as chunk2:
chunk2.embedding = e4
chunk2.id = UniqueId(2)
chunk2.id = 2
doc2.chunks.add(chunk2)
return [doc1, doc2]

Expand Down
5 changes: 2 additions & 3 deletions tests/unit/flow/test_flow_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from jina.flow import Flow
from jina.proto import jina_pb2
from jina.types.document.uid import UniqueId
from tests import random_docs, rm_files

cur_dir = os.path.dirname(os.path.abspath(__file__))
Expand All @@ -14,10 +13,10 @@
def random_queries(num_docs, chunks_per_doc=5):
for j in range(num_docs):
d = jina_pb2.DocumentProto()
d.id = UniqueId(j)
d.id = j
for k in range(chunks_per_doc):
dd = d.chunks.add()
dd.id = UniqueId(num_docs + j * chunks_per_doc + k)
dd.id = num_docs + j * chunks_per_doc + k
yield d


Expand Down
7 changes: 3 additions & 4 deletions tests/unit/flow/test_flow_multimode.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from jina.executors.indexers.keyvalue import BinaryPbIndexer
from jina.flow import Flow
from jina.proto import jina_pb2
from jina.types.document.uid import UniqueId

cur_dir = os.path.dirname(os.path.abspath(__file__))

Expand Down Expand Up @@ -43,15 +42,15 @@ def test_flow_with_modalities(tmpdir, restful):
def input_fn():
doc1 = jina_pb2.DocumentProto()
doc1.text = 'title: this is mode1 from doc1, body: this is mode2 from doc1'
doc1.id = UniqueId(1)
doc1.id = 1

doc2 = jina_pb2.DocumentProto()
doc2.text = 'title: this is mode1 from doc2, body: this is mode2 from doc2'
doc2.id = UniqueId(2)
doc2.id = 2

doc3 = jina_pb2.DocumentProto()
doc3.text = 'title: this is mode1 from doc3, body: this is mode2 from doc3'
doc3.id = UniqueId(3)
doc3.id = 3

return [doc1, doc2, doc3]

Expand Down
8 changes: 1 addition & 7 deletions tests/unit/test_helper.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import random
import time
from types import SimpleNamespace

Expand All @@ -14,8 +13,8 @@
from jina.logging import default_logger
from jina.logging.profile import TimeContext
from jina.proto import jina_pb2
from jina.types.document.uid import *
from tests import random_docs
import numpy as np


def test_cached_property():
Expand Down Expand Up @@ -60,11 +59,6 @@ def test_time_context():
assert tc.readable_duration == '2 seconds'


def test_np_int():
a = random.randint(0, 100000)
assert int2bytes(np.int64(a)) == int2bytes(a)


def test_dunder_get():
a = SimpleNamespace()
a.b = {'c': 1}
Expand Down