Skip to content

Commit

Permalink
perf: create DocumentArray index map lazily (#3944)
Browse files (browse the repository at this point in the history)
  • Loading branch information
alaeddine-13 committed Nov 18, 2021
1 parent 1395c26 commit ea5519a
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 19 deletions.
34 changes: 24 additions & 10 deletions jina/types/arrays/document.py
Expand Up @@ -83,7 +83,17 @@ def __init__(self, docs: Optional[DocumentArraySourceType] = None):
raise ValueError(
f'DocumentArray got an unexpected input {type(docs)}'
)
self._update_id_to_index_map()
self._id_to_index = None

@property
def _index_map(self) -> Dict:
    """Give access to the lazily built `_id_to_index` map.

    When `_id_to_index` is still ``None`` or is empty, the map is (re)built
    first by calling `_update_id_to_index_map`; otherwise the existing map
    is handed back untouched.

    :return: a Python dict.
    """
    cached = self._id_to_index
    if cached:
        # Map already populated: reuse it without rebuilding.
        return cached
    self._update_id_to_index_map()
    return self._id_to_index

def _update_id_to_index_map(self):
"""Update the id_to_index map by enumerating all Documents in self._pb_body.
Expand All @@ -102,23 +112,25 @@ def insert(self, index: int, doc: 'Document') -> None:
:param doc: The doc needs to be inserted.
"""
self._pb_body.insert(index, doc.proto)
self._id_to_index[doc.id] = index
if self._id_to_index:
self._id_to_index[doc.id] = index

def __setitem__(self, key, value: 'Document'):
if isinstance(key, int):
self[key].CopyFrom(value)
self._id_to_index[value.id] = key
if self._id_to_index:
self._id_to_index[value.id] = key
elif isinstance(key, str):
self[self._id_to_index[key]].CopyFrom(value)
self[self._index_map[key]].CopyFrom(value)
else:
raise IndexError(f'do not support this index {key}')

def __delitem__(self, index: Union[int, str, slice]):
if isinstance(index, int):
del self._pb_body[index]
elif isinstance(index, str):
del self[self._id_to_index[index]]
self._id_to_index.pop(index)
del self[self._index_map[index]]
self._index_map.pop(index)
elif isinstance(index, slice):
del self._pb_body[index]
else:
Expand All @@ -141,13 +153,13 @@ def __iter__(self) -> Iterator['Document']:
yield Document(d)

def __contains__(self, item: str):
return item in self._id_to_index
return item in self._index_map

def __getitem__(self, item: Union[int, str, slice, List]):
if isinstance(item, int):
return Document(self._pb_body[item])
elif isinstance(item, str):
return self[self._id_to_index[item]]
return self[self._index_map[item]]
elif isinstance(item, slice):
return DocumentArray(self._pb_body[item])
elif isinstance(item, list):
Expand All @@ -161,7 +173,8 @@ def append(self, doc: 'Document'):
:param doc: The doc needs to be appended.
"""
self._id_to_index[doc.id] = len(self._pb_body)
if self._id_to_index:
self._id_to_index[doc.id] = len(self._pb_body)
self._pb_body.append(doc.proto)

def extend(self, docs: Iterable['Document']) -> None:
Expand All @@ -179,7 +192,8 @@ def extend(self, docs: Iterable['Document']) -> None:
def clear(self):
"""Clear the data of :class:`DocumentArray`"""
del self._pb_body[:]
self._id_to_index.clear()
if self._id_to_index:
self._id_to_index.clear()

def reverse(self):
"""In-place reverse the sequence."""
Expand Down
18 changes: 9 additions & 9 deletions jina/types/document/graph.py
Expand Up @@ -133,7 +133,7 @@ def remove_single_node(self, node: Union['Document', str]):
)
return

offset = self._nodes._id_to_index[node_id]
offset = self._nodes._index_map[node_id]

if self.num_edges > 0:
nodes = self._nodes
Expand Down Expand Up @@ -234,8 +234,8 @@ def add_single_edge(
source_id = doc2_id
target_id = doc1_id

source_node_offset = np.array([self._nodes._id_to_index[source_id]])
target_node_offset = np.array([self._nodes._id_to_index[target_id]])
source_node_offset = np.array([self._nodes._index_map[source_id]])
target_node_offset = np.array([self._nodes._index_map[target_id]])

if current_adjacency is None:
row = source_node_offset
Expand Down Expand Up @@ -313,13 +313,13 @@ def add_edges(
current_adjacency = self.adjacency
source_node_offsets = np.array(
[
self._nodes._id_to_index[source.id if is_documents_source else source]
self._nodes._index_map[source.id if is_documents_source else source]
for source in source_docs
]
)
target_node_offsets = np.array(
[
self._nodes._id_to_index[target.id if is_documents_dest else target]
self._nodes._index_map[target.id if is_documents_dest else target]
for target in dest_docs
]
)
Expand Down Expand Up @@ -373,8 +373,8 @@ def remove_single_edge(
"""
doc1_id = doc1.id if isinstance(doc1, Document) else doc1
doc2_id = doc2.id if isinstance(doc2, Document) else doc2
offset1 = self._nodes._id_to_index[doc1_id]
offset2 = self._nodes._id_to_index[doc2_id]
offset1 = self._nodes._index_map[doc1_id]
offset2 = self._nodes._index_map[doc2_id]
for edge_id, (row, col) in enumerate(
zip(self.adjacency.row, self.adjacency.col)
):
Expand Down Expand Up @@ -484,7 +484,7 @@ def get_outgoing_nodes(self, doc: 'Document') -> Optional[ChunkArray]:
:param doc: the document node from which to extract the outgoing nodes.
"""
if self.adjacency is not None and doc.id in self._nodes:
offset = self._nodes._id_to_index[doc.id]
offset = self._nodes._index_map[doc.id]
return ChunkArray(
[
self._nodes[col.item()]
Expand All @@ -502,7 +502,7 @@ def get_incoming_nodes(self, doc: 'Document') -> Optional[ChunkArray]:
:param doc: the document node from which to extract the incoming nodes.
"""
if self.adjacency is not None and doc.id in self._nodes:
offset = self._nodes._id_to_index[doc.id]
offset = self._nodes._index_map[doc.id]
return ChunkArray(
[
self._nodes[row.item()]
Expand Down
10 changes: 10 additions & 0 deletions tests/unit/types/arrays/documentarray/test_documentarray.py
Expand Up @@ -344,3 +344,13 @@ def test_none_extend():
da = DocumentArray([Document() for _ in range(100)])
da.extend(None)
assert len(da) == 100


def test_lazy_index_map():
    """Check that the id-to-index map is only built on the first id lookup."""
    docs = [Document(id=str(i), text=f'document_{i}') for i in range(100)]
    da = DocumentArray(docs)

    # Construction alone must not build the map.
    assert da._id_to_index is None

    # A lookup by string id goes through the map and therefore builds it.
    first_doc = da['0']
    assert first_doc.text == 'document_0'
    assert da._id_to_index is not None
    assert len(da._index_map) == 100

0 comments on commit ea5519a

Please sign in to comment.