f-string formatting (#3277)
* f-string formatting

* f-string formatting on src/

* f-string formatting last modules

* make style on edited files

* Update tests/test_dataset_dict.py

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
Mehdi2402 and lhoestq committed Nov 17, 2021
1 parent 293d5d5 commit 3edbeb0
Showing 39 changed files with 214 additions and 290 deletions.
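For readers skimming the diff, here is a minimal sketch of the pattern this commit applies throughout (the variable names are illustrative, not taken from the diff): printf-style `%` interpolation and `str.format()` calls are rewritten as f-strings, with format specs such as `%.2f` moving after a colon inside the braces.

```python
value = 3.14159

# The two older styles this commit migrates away from:
percent_style = "value: %.2f" % value         # printf-style interpolation
format_style = "value: {:.2f}".format(value)  # str.format()

# The f-string equivalent: the expression and its format spec live
# together inside the braces, so "%.2f" % x becomes f"{x:.2f}"
# (and "%d" % n becomes f"{n:d}").
fstring_style = f"value: {value:.2f}"

assert percent_style == format_style == fstring_style
```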
6 changes: 3 additions & 3 deletions benchmarks/format.py
@@ -24,12 +24,12 @@ def format_json_to_md(input_json_file, output_md_file):
old_val = metric_vals.get("old", None)
dif_val = metric_vals.get("diff", None)

val_str = " {:f}".format(new_val) if isinstance(new_val, (int, float)) else "None"
val_str = f" {new_val:f}" if isinstance(new_val, (int, float)) else "None"

if old_val is not None:
val_str += " / {:f}".format(old_val) if isinstance(old_val, (int, float)) else "None"
val_str += f" / {new_val:f}" if isinstance(old_val, (int, float)) else "None"
if dif_val is not None:
val_str += " ({:f})".format(dif_val) if isinstance(dif_val, (int, float)) else "None"
val_str += f" ({dif_val:f})" if isinstance(dif_val, (int, float)) else "None"

title += " " + metric_name + " |"
lines += "---|"
16 changes: 8 additions & 8 deletions metrics/coval/coval.py
@@ -210,17 +210,17 @@ def get_coref_infos(
if remove_nested:
logger.info(
"Number of removed nested coreferring mentions in the key "
"annotation: %s; and system annotation: %s" % (key_nested_coref_num, sys_nested_coref_num)
f"annotation: {key_nested_coref_num}; and system annotation: {sys_nested_coref_num}"
)
logger.info(
"Number of resulting singleton clusters in the key "
"annotation: %s; and system annotation: %s" % (key_removed_nested_clusters, sys_removed_nested_clusters)
f"annotation: {key_removed_nested_clusters}; and system annotation: {sys_removed_nested_clusters}"
)

if not keep_singletons:
logger.info(
"%d and %d singletons are removed from the key and system "
"files, respectively" % (key_singletons_num, sys_singletons_num)
f"{key_singletons_num:d} and {sys_singletons_num:d} singletons are removed from the key and system "
"files, respectively"
)

return doc_coref_infos
@@ -242,14 +242,14 @@ def evaluate(key_lines, sys_lines, metrics, NP_only, remove_nested, keep_singlet

logger.info(
name.ljust(10),
"Recall: %.2f" % (recall * 100),
" Precision: %.2f" % (precision * 100),
" F1: %.2f" % (f1 * 100),
f"Recall: {recall * 100:.2f}",
f" Precision: {precision * 100:.2f}",
f" F1: {f1 * 100:.2f}",
)

if conll_subparts_num == 3:
conll = (conll / 3) * 100
logger.info("CoNLL score: %.2f" % conll)
logger.info(f"CoNLL score: {conll:.2f}")
output_scores.update({"conll_score": conll})

return output_scores
8 changes: 4 additions & 4 deletions metrics/squad_v2/evaluate.py
@@ -113,7 +113,7 @@ def get_raw_scores(dataset, preds):
# For unanswerable questions, only correct answer is empty string
gold_answers = [""]
if qid not in preds:
print("Missing prediction for %s" % qid)
print(f"Missing prediction for {qid}")
continue
a_pred = preds[qid]
# Take max over all gold answers
@@ -156,7 +156,7 @@ def make_eval_dict(exact_scores, f1_scores, qid_list=None):

def merge_eval(main_eval, new_eval, prefix):
for k in new_eval:
main_eval["%s_%s" % (prefix, k)] = new_eval[k]
main_eval[f"{prefix}_{k}"] = new_eval[k]


def plot_pr_curve(precisions, recalls, out_image, title):
@@ -238,8 +238,8 @@ def histogram_na_prob(na_probs, qid_list, image_dir, name):
plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0))
plt.xlabel("Model probability of no-answer")
plt.ylabel("Proportion of dataset")
plt.title("Histogram of no-answer probability: %s" % name)
plt.savefig(os.path.join(image_dir, "na_prob_hist_%s.png" % name))
plt.title(f"Histogram of no-answer probability: {name}")
plt.savefig(os.path.join(image_dir, f"na_prob_hist_{name}.png"))
plt.clf()


6 changes: 3 additions & 3 deletions metrics/super_glue/record_evaluation.py
@@ -63,7 +63,7 @@ def evaluate(dataset, predictions):
for qa in passage["qas"]:
total += 1
if qa["id"] not in predictions:
message = "Unanswered question {} will receive score 0.".format(qa["id"])
message = f'Unanswered question {qa["id"]} will receive score 0.'
print(message, file=sys.stderr)
continue

@@ -95,7 +95,7 @@ def evaluate(dataset, predictions):
dataset_json = json.load(data_file)
if dataset_json["version"] != expected_version:
print(
"Evaluation expects v-{}, but got dataset with v-{}".format(expected_version, dataset_json["version"]),
f'Evaluation expects v-{expected_version}, but got dataset with v-{dataset_json["version"]}',
file=sys.stderr,
)
dataset = dataset_json["data"]
@@ -106,6 +106,6 @@ def evaluate(dataset, predictions):
metrics, correct_ids = evaluate(dataset, predictions)

if args.output_correct_ids:
print("Output {} correctly answered question IDs.".format(len(correct_ids)))
print(f"Output {len(correct_ids)} correctly answered question IDs.")
with open("correct_ids.json", "w") as f:
json.dump(correct_ids, f)
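A side note on the quote choices visible above: several converted strings subscript into dicts with double-quoted keys, which forces single quotes on the outside, because prior to Python 3.12 an f-string expression may not reuse the f-string's own quote character. A minimal sketch (the `qa` dict is illustrative, not from the diff):

```python
qa = {"id": "q1"}

# Before Python 3.12, the braces may not contain the quote character
# that delimits the f-string itself, so double-quoted keys inside
# require single quotes outside:
message = f'Unanswered question {qa["id"]} will receive score 0.'

# Swapping the quote styles works just as well:
message_alt = f"Unanswered question {qa['id']} will receive score 0."

assert message == message_alt
```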
2 changes: 1 addition & 1 deletion metrics/super_glue/super_glue.py
@@ -125,7 +125,7 @@ def evaluate_multirc(ids_preds, labels):
"""
question_map = {}
for id_pred, label in zip(ids_preds, labels):
question_id = "{}-{}".format(id_pred["idx"]["paragraph"], id_pred["idx"]["question"])
question_id = f'{id_pred["idx"]["paragraph"]}-{id_pred["idx"]["question"]}'
pred = id_pred["prediction"]
if question_id in question_map:
question_map[question_id].append((pred, label))
68 changes: 21 additions & 47 deletions src/datasets/arrow_dataset.py
@@ -621,9 +621,7 @@ def __init__(
assert self._fingerprint is not None, "Fingerprint can't be None in a Dataset object"
if self.info.features.type != inferred_features.type:
raise ValueError(
"External features info don't match the dataset:\nGot\n{}\nwith type\n{}\n\nbut expected something like\n{}\nwith type\n{}".format(
self.info.features, self.info.features.type, inferred_features, inferred_features.type
)
f"External features info don't match the dataset:\nGot\n{self.info.features}\nwith type\n{self.info.features.type}\n\nbut expected something like\n{inferred_features}\nwith type\n{inferred_features.type}"
)

if self._indices is not None:
@@ -735,9 +733,7 @@ def from_pandas(
"""
if info is not None and features is not None and info.features != features:
raise ValueError(
"Features specified in `features` and `info.features` can't be different:\n{}\n{}".format(
features, info.features
)
f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}"
)
features = features if features is not None else info.features if info is not None else None
if info is None:
@@ -768,9 +764,7 @@ def from_dict(
"""
if info is not None and features is not None and info.features != features:
raise ValueError(
"Features specified in `features` and `info.features` can't be different:\n{}\n{}".format(
features, info.features
)
f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}"
)
features = features if features is not None else info.features if info is not None else None
if info is None:
@@ -1008,7 +1002,7 @@ def save_to_disk(self, dataset_path: str, fs=None):
# Sort only the first level of keys, or we might shuffle fields of nested features if we use sort_keys=True
sorted_keys_dataset_info = {key: dataset_info[key] for key in sorted(dataset_info)}
json.dump(sorted_keys_dataset_info, dataset_info_file, indent=2)
logger.info("Dataset saved in {}".format(dataset_path))
logger.info(f"Dataset saved in {dataset_path}")

@staticmethod
def load_from_disk(dataset_path: str, fs=None, keep_in_memory: Optional[bool] = None) -> "Dataset":
@@ -1216,9 +1210,7 @@ def flatten_(self, max_depth=16):
break
self.info.features = Features.from_arrow_schema(self._data.schema)
self._data = update_metadata_with_features(self._data, self.features)
- logger.info(
- "Flattened dataset from depth {} to depth {}.".format(depth, 1 if depth + 1 < max_depth else "unknown")
- )
+ logger.info(f'Flattened dataset from depth {depth} to depth {1 if depth + 1 < max_depth else "unknown"}.')

@fingerprint_transform(inplace=False)
def flatten(self, new_fingerprint, max_depth=16) -> "Dataset":
@@ -1237,9 +1229,7 @@ def flatten(self, new_fingerprint, max_depth=16) -> "Dataset":
break
dataset.info.features = Features.from_arrow_schema(dataset._data.schema)
dataset._data = update_metadata_with_features(dataset._data, dataset.features)
- logger.info(
- "Flattened dataset from depth {} to depth {}.".format(depth, 1 if depth + 1 < max_depth else "unknown")
- )
+ logger.info(f'Flattened dataset from depth {depth} to depth {1 if depth + 1 < max_depth else "unknown"}.')
dataset._fingerprint = new_fingerprint
return dataset

@@ -1688,9 +1678,7 @@ def set_format(
columns = list(columns)
if columns is not None and any(col not in self._data.column_names for col in columns):
raise ValueError(
"Columns {} not in the dataset. Current columns in the dataset: {}".format(
list(filter(lambda col: col not in self._data.column_names, columns)), self._data.column_names
)
f"Columns {list(filter(lambda col: col not in self._data.column_names, columns))} not in the dataset. Current columns in the dataset: {self._data.column_names}"
)
if columns is not None:
columns = columns.copy() # Ensures modifications made to the list after this call don't cause bugs
@@ -2002,20 +1990,15 @@ def decorated(item, *args, **kwargs):
for input_column in input_columns:
if input_column not in self._data.column_names:
raise ValueError(
"Input column {} not in the dataset. Current columns in the dataset: {}".format(
input_column, self._data.column_names
)
f"Input column {input_column} not in the dataset. Current columns in the dataset: {self._data.column_names}"
)

if isinstance(remove_columns, str):
remove_columns = [remove_columns]

if remove_columns is not None and any(col not in self._data.column_names for col in remove_columns):
raise ValueError(
"Column to remove {} not in the dataset. Current columns in the dataset: {}".format(
list(filter(lambda col: col not in self._data.column_names, remove_columns)),
self._data.column_names,
)
f"Column to remove {list(filter(lambda col: col not in self._data.column_names, remove_columns))} not in the dataset. Current columns in the dataset: {self._data.column_names}"
)

load_from_cache_file = load_from_cache_file if load_from_cache_file is not None else is_caching_enabled()
@@ -2057,7 +2040,7 @@ def format_cache_file_name(cache_file_name, rank):
sep = cache_file_name.rindex(".")
base_name, extension = cache_file_name[:sep], cache_file_name[sep:]
cache_file_name = base_name + suffix_template.format(rank=rank, num_proc=num_proc) + extension
logger.info("Process #{} will write at {}".format(rank, cache_file_name))
logger.info(f"Process #{rank} will write at {cache_file_name}")
return cache_file_name

prev_env = deepcopy(os.environ)
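Note that `suffix_template.format(rank=rank, num_proc=num_proc)` in the hunk above is deliberately left as a `.format()` call: the template is a value supplied by the caller and is only interpolated later, which an f-string (evaluated where it is written) cannot express. A minimal sketch of why deferred templates survive the migration; the default template shown is an assumption based on `Dataset.map`'s documented behavior, not part of this diff:

```python
# A caller-supplied template with named placeholders; assumed default
# from Dataset.map, shown here only for illustration.
suffix_template = "_{rank:05d}_of_{num_proc:05d}"

cache_file_name = "dataset.arrow"
sep = cache_file_name.rindex(".")
base_name, extension = cache_file_name[:sep], cache_file_name[sep:]

# str.format() defers interpolation until rank/num_proc exist,
# which is exactly what an f-string cannot do:
print(base_name + suffix_template.format(rank=3, num_proc=8) + extension)
# -> dataset_00003_of_00008.arrow
```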
@@ -2126,7 +2109,7 @@ def catch_non_existent_error(func, kwargs):
if nb_of_missing_shards > 0:
with Pool(nb_of_missing_shards, initargs=initargs, initializer=initializer) as pool:
os.environ = prev_env
logger.info("Spawning {} processes".format(num_proc))
logger.info(f"Spawning {num_proc} processes")
results = {
i: pool.apply_async(self.__class__._map_single, kwds=kwds)
for i, (kwds, cached_shard) in enumerate(zip(kwds_per_shard, transformed_shards))
@@ -2143,7 +2126,7 @@ def catch_non_existent_error(func, kwargs):
transformed_shards.count(None) == 0
), "All shards have to be defined Datasets, none should still be missing."

logger.info("Concatenating {} shards".format(num_proc))
logger.info(f"Concatenating {num_proc} shards")
result = concatenate_datasets(transformed_shards)
if new_fingerprint is not None:
result._fingerprint = new_fingerprint
@@ -2240,7 +2223,7 @@ def _map_single(
# current dataset file and the mapping args
cache_file_name = self._get_cache_file_path(new_fingerprint)
if os.path.exists(cache_file_name) and load_from_cache_file:
logger.warning("Loading cached processed dataset at %s", cache_file_name)
logger.warning(f"Loading cached processed dataset at {cache_file_name}")
info = self.info.copy()
info.features = features
info.task_templates = None
@@ -2262,9 +2245,7 @@ def validate_function_output(processed_inputs, indices):
"""Validate output of the map function."""
if processed_inputs is not None and not isinstance(processed_inputs, (Mapping, pa.Table)):
raise TypeError(
"Provided `function` which is applied to all elements of table returns a variable of type {}. Make sure provided `function` returns a variable of type `dict` (or a pyarrow table) to update the dataset or `None` if you are only interested in side effects.".format(
type(processed_inputs)
)
f"Provided `function` which is applied to all elements of table returns a variable of type {type(processed_inputs)}. Make sure provided `function` returns a variable of type `dict` (or a pyarrow table) to update the dataset or `None` if you are only interested in side effects."
)
elif isinstance(indices, list) and isinstance(processed_inputs, Mapping):
allowed_batch_return_types = (list, np.ndarray)
@@ -2273,9 +2254,7 @@ def validate_function_output(processed_inputs, indices):
)
if all_dict_values_are_lists is False:
raise TypeError(
"Provided `function` which is applied to all elements of table returns a `dict` of types {}. When using `batched=True`, make sure provided `function` returns a `dict` of types like `{}`.".format(
[type(x) for x in processed_inputs.values()], allowed_batch_return_types
)
"Provided `function` which is applied to all elements of table returns a `dict` of types {[type(x) for x in processed_inputs.values()]}. When using `batched=True`, make sure provided `function` returns a `dict` of types like `{allowed_batch_return_types}`."
)

def apply_function_on_filtered_inputs(inputs, indices, check_same_num_examples=False, offset=0):
Expand Down Expand Up @@ -2338,7 +2317,7 @@ def init_buffer_and_writer():
)
else:
buf_writer = None
logger.info("Caching processed dataset at %s", cache_file_name)
logger.info(f"Caching processed dataset at {cache_file_name}")
tmp_file = tempfile.NamedTemporaryFile("wb", dir=os.path.dirname(cache_file_name), delete=False)
writer = ArrowWriter(
features=writer_features,
@@ -2653,7 +2632,7 @@ def select(
)
else:
buf_writer = None
logger.info("Caching indices mapping at %s", indices_cache_file_name)
logger.info(f"Caching indices mapping at {indices_cache_file_name}")
tmp_file = tempfile.NamedTemporaryFile("wb", dir=os.path.dirname(indices_cache_file_name), delete=False)
writer = ArrowWriter(
path=tmp_file.name, writer_batch_size=writer_batch_size, fingerprint=new_fingerprint, unit="indices"
@@ -2738,10 +2717,7 @@ def sort(
# Check the column name
if not isinstance(column, str) or column not in self._data.column_names:
raise ValueError(
"Column '{}' not found in the dataset. Please provide a column selected in: {}".format(
column,
self._data.column_names,
)
f"Column '{column}' not found in the dataset. Please provide a column selected in: {self._data.column_names}"
)

# Check if we've already cached this computation (indexed by a hash)
@@ -2750,7 +2726,7 @@ def sort(
# we create a unique hash from the function, current dataset file and the mapping args
indices_cache_file_name = self._get_cache_file_path(new_fingerprint)
if os.path.exists(indices_cache_file_name) and load_from_cache_file:
logger.warning("Loading cached sorted indices for dataset at %s", indices_cache_file_name)
logger.warning(f"Loading cached sorted indices for dataset at {indices_cache_file_name}")
return self._new_dataset_with_indices(
fingerprint=new_fingerprint, indices_cache_file_name=indices_cache_file_name
)
@@ -2833,7 +2809,7 @@ def shuffle(
# we create a unique hash from the function, current dataset file and the mapping args
indices_cache_file_name = self._get_cache_file_path(new_fingerprint)
if os.path.exists(indices_cache_file_name) and load_from_cache_file:
logger.warning("Loading cached shuffled indices for dataset at %s", indices_cache_file_name)
logger.warning(f"Loading cached shuffled indices for dataset at {indices_cache_file_name}")
return self._new_dataset_with_indices(
fingerprint=new_fingerprint, indices_cache_file_name=indices_cache_file_name
)
@@ -3008,9 +2984,7 @@ def train_test_split(
and load_from_cache_file
):
logger.warning(
"Loading cached split indices for dataset at %s and %s",
train_indices_cache_file_name,
test_indices_cache_file_name,
f"Loading cached split indices for dataset at {train_indices_cache_file_name} and {test_indices_cache_file_name}"
)
return DatasetDict(
{
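One behavioral nuance in the `logger` conversions above: `%`-style logging arguments (`logger.warning("... %s", path)`) defer string interpolation until the record is actually emitted, while an f-string is built eagerly even when the message is filtered out by the log level. For the cheap path substitutions in this file the difference is negligible, but it is worth knowing when reading the diff. A minimal sketch of the distinction:

```python
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)  # the WARNING records below are dropped

path = "/tmp/cache.arrow"

# Lazy: "%s" is only interpolated if a handler actually emits the record.
logger.warning("Loading cached dataset at %s", path)

# Eager: the message string is fully built before logging filters it.
logger.warning(f"Loading cached dataset at {path}")
```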

1 comment on commit 3edbeb0

@github-actions


PyArrow==3.0.0


Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.082627 / 0.082627 (0.071274) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.005630 / 0.005630 (-0.005379) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.040398 / 0.040398 (0.001890) |
| read_batch_unformated after write_array2d | 0.040890 / 0.040890 (0.017780) |
| read_batch_unformated after write_flattened_sequence | 0.376869 / 0.376869 (0.100971) |
| read_batch_unformated after write_nested_sequence | 0.426886 / 0.426886 (0.103406) |
| read_col_formatted_as_numpy after write_array2d | 0.091343 / 0.091343 (0.083358) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.004829 / 0.004829 (0.000501) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.018643 / 0.018643 (0.014392) |
| read_col_unformated after write_array2d | 0.049138 / 0.049138 (0.012086) |
| read_col_unformated after write_flattened_sequence | 0.381204 / 0.381204 (0.122715) |
| read_col_unformated after write_nested_sequence | 0.419025 / 0.419025 (0.125184) |
| read_formatted_as_numpy after write_array2d | 0.113171 / 0.113171 (-0.015375) |
| read_formatted_as_numpy after write_flattened_sequence | 0.014196 / 0.014196 (-0.061450) |
| read_formatted_as_numpy after write_nested_sequence | 0.333629 / 0.333629 (-0.085643) |
| read_unformated after write_array2d | 0.059720 / 0.059720 (0.016187) |
| read_unformated after write_flattened_sequence | 0.376832 / 0.376832 (0.121693) |
| read_unformated after write_nested_sequence | 0.428432 / 0.428432 (0.145233) |
| write_array2d | 0.095511 / 0.095511 (-0.046172) |
| write_flattened_sequence | 2.165636 / 2.165636 (0.713481) |
| write_nested_sequence | 2.248224 / 2.248224 (0.755508) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.291797 / 0.291797 (0.273791) |
| get_batch_of_1024_rows | 0.538839 / 0.538839 (0.538349) |
| get_first_row | 0.012903 / 0.012903 (0.012703) |
| get_last_row | 0.000116 / 0.000116 (0.000061) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.043530 / 0.043530 (0.006119) |
| shard | 0.027970 / 0.027970 (0.013445) |
| shuffle | 0.032356 / 0.032356 (-0.144201) |
| sort | 0.242599 / 0.242599 (-0.494536) |
| train_test_split | 0.032140 / 0.032140 (-0.264198) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.652377 / 0.652377 (0.437168) |
| read 50000 | 6.329183 / 6.329183 (4.251529) |
| read_batch 50000 10 | 2.497030 / 2.497030 (0.992910) |
| read_batch 50000 100 | 2.141530 / 2.141530 (0.600336) |
| read_batch 50000 1000 | 2.147176 / 2.147176 (0.678686) |
| read_formatted numpy 5000 | 0.749860 / 0.749860 (-3.834917) |
| read_formatted pandas 5000 | 6.638684 / 6.638684 (2.892972) |
| read_formatted tensorflow 5000 | 3.327412 / 3.327412 (-1.942450) |
| read_formatted torch 5000 | 1.373150 / 1.373150 (-3.192527) |
| read_formatted_batch numpy 5000 10 | 0.084610 / 0.084610 (-0.339665) |
| read_formatted_batch numpy 5000 1000 | 0.014509 / 0.014509 (0.006902) |
| shuffled read 5000 | 0.808213 / 0.808213 (0.582168) |
| shuffled read 50000 | 7.931157 / 7.931157 (5.662228) |
| shuffled read_batch 50000 10 | 3.429692 / 3.429692 (-52.014933) |
| shuffled read_batch 50000 100 | 2.825592 / 2.825592 (-4.050885) |
| shuffled read_batch 50000 1000 | 2.822999 / 2.822999 (0.680926) |
| shuffled read_formatted numpy 5000 | 0.920192 / 0.920192 (-3.885035) |
| shuffled read_formatted_batch numpy 5000 10 | 0.184314 / 0.184314 (-6.316351) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.073440 / 0.073440 (-0.002029) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 2.044520 / 2.044520 (0.202733) |
| map fast-tokenizer batched | 14.737549 / 14.737549 (6.663241) |
| map identity | 42.493383 / 42.493383 (32.301991) |
| map identity batched | 0.960535 / 0.960535 (0.280112) |
| map no-op batched | 0.690374 / 0.690374 (0.156173) |
| map no-op batched numpy | 0.471176 / 0.471176 (-0.108107) |
| map no-op batched pandas | 0.708646 / 0.708646 (0.274282) |
| map no-op batched pytorch | 0.338522 / 0.338522 (-0.201816) |
| map no-op batched tensorflow | 0.358918 / 0.358918 (-1.028018) |
PyArrow==latest

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.081290 / 0.081290 (0.069938) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.005268 / 0.005268 (-0.005741) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.037764 / 0.037764 (-0.000744) |
| read_batch_unformated after write_array2d | 0.037648 / 0.037648 (0.014538) |
| read_batch_unformated after write_flattened_sequence | 0.419997 / 0.419997 (0.144099) |
| read_batch_unformated after write_nested_sequence | 0.480385 / 0.480385 (0.156905) |
| read_col_formatted_as_numpy after write_array2d | 0.100561 / 0.100561 (0.092576) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.005314 / 0.005314 (0.000985) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.008578 / 0.008578 (0.004328) |
| read_col_unformated after write_array2d | 0.041594 / 0.041594 (0.004542) |
| read_col_unformated after write_flattened_sequence | 0.430686 / 0.430686 (0.172197) |
| read_col_unformated after write_nested_sequence | 0.491738 / 0.491738 (0.197897) |
| read_formatted_as_numpy after write_array2d | 0.108441 / 0.108441 (-0.020105) |
| read_formatted_as_numpy after write_flattened_sequence | 0.014754 / 0.014754 (-0.060892) |
| read_formatted_as_numpy after write_nested_sequence | 0.332701 / 0.332701 (-0.086570) |
| read_unformated after write_array2d | 0.058112 / 0.058112 (0.014580) |
| read_unformated after write_flattened_sequence | 0.438743 / 0.438743 (0.183604) |
| read_unformated after write_nested_sequence | 0.474824 / 0.474824 (0.191624) |
| write_array2d | 0.093760 / 0.093760 (-0.047923) |
| write_flattened_sequence | 2.138795 / 2.138795 (0.686640) |
| write_nested_sequence | 2.194720 / 2.194720 (0.702004) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.309953 / 0.309953 (0.291947) |
| get_batch_of_1024_rows | 0.559475 / 0.559475 (0.558986) |
| get_first_row | 0.000892 / 0.000892 (0.000692) |
| get_last_row | 0.000124 / 0.000124 (0.000069) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.038177 / 0.038177 (0.000766) |
| shard | 0.026361 / 0.026361 (0.011835) |
| shuffle | 0.032045 / 0.032045 (-0.144512) |
| sort | 0.245970 / 0.245970 (-0.491166) |
| train_test_split | 0.034209 / 0.034209 (-0.262130) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.633296 / 0.633296 (0.418087) |
| read 50000 | 6.296914 / 6.296914 (4.219260) |
| read_batch 50000 10 | 2.493012 / 2.493012 (0.988892) |
| read_batch 50000 100 | 2.095088 / 2.095088 (0.553893) |
| read_batch 50000 1000 | 2.205526 / 2.205526 (0.737036) |
| read_formatted numpy 5000 | 0.710390 / 0.710390 (-3.874387) |
| read_formatted pandas 5000 | 6.866486 / 6.866486 (3.120774) |
| read_formatted tensorflow 5000 | 4.586534 / 4.586534 (-0.683328) |
| read_formatted torch 5000 | 1.452306 / 1.452306 (-3.113371) |
| read_formatted_batch numpy 5000 10 | 0.082681 / 0.082681 (-0.341594) |
| read_formatted_batch numpy 5000 1000 | 0.014338 / 0.014338 (0.006731) |
| shuffled read 5000 | 0.789771 / 0.789771 (0.563726) |
| shuffled read 50000 | 7.929550 / 7.929550 (5.660622) |
| shuffled read_batch 50000 10 | 3.215731 / 3.215731 (-52.228893) |
| shuffled read_batch 50000 100 | 2.531082 / 2.531082 (-4.345395) |
| shuffled read_batch 50000 1000 | 2.689137 / 2.689137 (0.547064) |
| shuffled read_formatted numpy 5000 | 0.923414 / 0.923414 (-3.881813) |
| shuffled read_formatted_batch numpy 5000 10 | 0.184420 / 0.184420 (-6.316244) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.073043 / 0.073043 (-0.002427) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 1.981260 / 1.981260 (0.139473) |
| map fast-tokenizer batched | 14.753032 / 14.753032 (6.678724) |
| map identity | 42.151964 / 42.151964 (31.960572) |
| map identity batched | 0.969709 / 0.969709 (0.289285) |
| map no-op batched | 0.700533 / 0.700533 (0.166332) |
| map no-op batched numpy | 0.464643 / 0.464643 (-0.114640) |
| map no-op batched pandas | 0.759426 / 0.759426 (0.325062) |
| map no-op batched pytorch | 0.363891 / 0.363891 (-0.176446) |
| map no-op batched tensorflow | 0.351514 / 0.351514 (-1.035422) |

