Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

docs: type annotation #2049

Merged
merged 12 commits into from
Mar 9, 2021
40 changes: 29 additions & 11 deletions jina/clients/sugary_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,49 +26,62 @@ def _sample(iterable, sampling_rate: float = None):
yield i


def _subsample(iterable, sampling_rate: float = None, size: int = None):
def _subsample(iterable, size: int = None, sampling_rate: float = None):
yield from it.islice(_sample(iterable, sampling_rate), size)


def _input_lines(
lines: Iterable[str] = None,
filepath: str = None,
florian-hoenicke marked this conversation as resolved.
Show resolved Hide resolved
line_format: str = 'json',
field_resolver: Dict[str, str] = None,
size: int = None,
sampling_rate: float = None,
) -> Generator[Union[str, 'Document'], None, None]:
"""Generator function for lines, json and sc. Yields documents or strings.

:param filepath: a text file that each line contains a document
:param lines: a list of strings, each is considered as a document
:param filepath: a text file that each line contains a document
:param line_format: the format of each line ``json`` or ``csv``
:param field_resolver: a map from field names defined in ``document`` (JSON, dict) to the field
names defined in Protobuf. This is only used when the given ``document`` is
a JSON string or a Python dict.
:param size: the maximum number of the documents
:param sampling_rate: the sampling rate between [0, 1]
:yields: documents

.. note::
This function should not be directly used, use :meth:`Flow.index_lines`, :meth:`Flow.search_lines` instead
This function should not be directly used, use :meth:`Flow.index_files`, :meth:`Flow.search_files` instead
"""
if filepath:
file_type = os.path.splitext(filepath)[1]
with open(filepath, 'r') as f:
if file_type in _jsonl_ext:
yield from _input_ndjson(f)
elif file_type in _csv_ext:
yield from _input_csv(f)
yield from _input_csv(f, field_resolver, size, sampling_rate)
else:
yield from _subsample(f)
yield from _subsample(f, size, sampling_rate)
elif lines:
if line_format == 'json':
yield from _input_ndjson(lines)
elif line_format == 'csv':
yield from _input_csv(lines)
yield from _input_csv(lines, field_resolver, size, sampling_rate)
else:
yield from _subsample(lines)
yield from _subsample(lines, size, sampling_rate)
else:
raise ValueError('"filepath" and "lines" can not be both empty')


def _input_ndjson(fp: Iterable[str], field_resolver: Dict[str, str] = None):
def _input_ndjson(
fp: Iterable[str],
field_resolver: Dict[str, str] = None,
florian-hoenicke marked this conversation as resolved.
Show resolved Hide resolved
size: int = None,
sampling_rate: float = None,
):
from jina import Document

for line in _subsample(fp):
for line in _subsample(fp, size, sampling_rate):
value = json.loads(line)
if 'groundtruth' in value and 'document' in value:
yield Document(value['document'], field_resolver), Document(
Expand All @@ -78,11 +91,16 @@ def _input_ndjson(fp: Iterable[str], field_resolver: Dict[str, str] = None):
yield Document(value, field_resolver)


def _input_csv(fp: Iterable[str], field_resolver: Dict[str, str] = None):
def _input_csv(
fp: Iterable[str],
field_resolver: Dict[str, str] = None,
size: int = None,
sampling_rate: float = None,
):
from jina import Document

lines = csv.DictReader(fp)
for value in _subsample(lines):
for value in _subsample(lines, size, sampling_rate):
if 'groundtruth' in value and 'document' in value:
yield Document(value['document'], field_resolver), Document(
value['groundtruth'], field_resolver
Expand Down