f-string formatting #3277

Merged (6 commits) on Nov 17, 2021
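All changes in this PR follow the same pattern: printf-style `%` interpolation and `str.format()` calls are rewritten as f-strings, carrying over any format spec (such as `:f`, `:d`, or `:.2f`) after the expression. As a minimal, hedged sketch with made-up values (not taken from the diff), the three forms below produce identical output:

recall = 0.873  # illustrative value only

old_percent = "Recall: %.2f" % (recall * 100)        # printf-style formatting
old_format = "Recall: {:.2f}".format(recall * 100)   # str.format()
new_fstring = f"Recall: {recall * 100:.2f}"          # f-string, the style adopted in this PR

assert old_percent == old_format == new_fstring == "Recall: 87.30"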
6 changes: 3 additions & 3 deletions benchmarks/format.py
@@ -24,12 +24,12 @@ def format_json_to_md(input_json_file, output_md_file):
  old_val = metric_vals.get("old", None)
  dif_val = metric_vals.get("diff", None)

- val_str = " {:f}".format(new_val) if isinstance(new_val, (int, float)) else "None"
+ val_str = f" {new_val:f}" if isinstance(new_val, (int, float)) else "None"

  if old_val is not None:
-     val_str += " / {:f}".format(old_val) if isinstance(old_val, (int, float)) else "None"
+     val_str += f" / {old_val:f}" if isinstance(old_val, (int, float)) else "None"
  if dif_val is not None:
-     val_str += " ({:f})".format(dif_val) if isinstance(dif_val, (int, float)) else "None"
+     val_str += f" ({dif_val:f})" if isinstance(dif_val, (int, float)) else "None"

  title += " " + metric_name + " |"
  lines += "---|"
16 changes: 8 additions & 8 deletions metrics/coval/coval.py
@@ -210,17 +210,17 @@ def get_coref_infos(
  if remove_nested:
      logger.info(
          "Number of removed nested coreferring mentions in the key "
-         "annotation: %s; and system annotation: %s" % (key_nested_coref_num, sys_nested_coref_num)
+         f"annotation: {key_nested_coref_num}; and system annotation: {sys_nested_coref_num}"
      )
      logger.info(
          "Number of resulting singleton clusters in the key "
-         "annotation: %s; and system annotation: %s" % (key_removed_nested_clusters, sys_removed_nested_clusters)
+         f"annotation: {key_removed_nested_clusters}; and system annotation: {sys_removed_nested_clusters}"
      )

  if not keep_singletons:
      logger.info(
-         "%d and %d singletons are removed from the key and system "
-         "files, respectively" % (key_singletons_num, sys_singletons_num)
+         f"{key_singletons_num:d} and {sys_singletons_num:d} singletons are removed from the key and system "
+         "files, respectively"
      )

  return doc_coref_infos
@@ -242,14 +242,14 @@ def evaluate(key_lines, sys_lines, metrics, NP_only, remove_nested, keep_singlet

  logger.info(
      name.ljust(10),
-     "Recall: %.2f" % (recall * 100),
-     " Precision: %.2f" % (precision * 100),
-     " F1: %.2f" % (f1 * 100),
+     f"Recall: {recall * 100:.2f}",
+     f" Precision: {precision * 100:.2f}",
+     f" F1: {f1 * 100:.2f}",
  )

  if conll_subparts_num == 3:
      conll = (conll / 3) * 100
-     logger.info("CoNLL score: %.2f" % conll)
+     logger.info(f"CoNLL score: {conll:.2f}")
      output_scores.update({"conll_score": conll})

  return output_scores
8 changes: 4 additions & 4 deletions metrics/squad_v2/evaluate.py
@@ -113,7 +113,7 @@ def get_raw_scores(dataset, preds):
      # For unanswerable questions, only correct answer is empty string
      gold_answers = [""]
  if qid not in preds:
-     print("Missing prediction for %s" % qid)
+     print(f"Missing prediction for {qid}")
      continue
  a_pred = preds[qid]
  # Take max over all gold answers
@@ -156,7 +156,7 @@ def make_eval_dict(exact_scores, f1_scores, qid_list=None):

  def merge_eval(main_eval, new_eval, prefix):
      for k in new_eval:
-         main_eval["%s_%s" % (prefix, k)] = new_eval[k]
+         main_eval[f"{prefix}_{k}"] = new_eval[k]


  def plot_pr_curve(precisions, recalls, out_image, title):
@@ -238,8 +238,8 @@ def histogram_na_prob(na_probs, qid_list, image_dir, name):
  plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0))
  plt.xlabel("Model probability of no-answer")
  plt.ylabel("Proportion of dataset")
- plt.title("Histogram of no-answer probability: %s" % name)
- plt.savefig(os.path.join(image_dir, "na_prob_hist_%s.png" % name))
+ plt.title(f"Histogram of no-answer probability: {name}")
+ plt.savefig(os.path.join(image_dir, f"na_prob_hist_{name}.png"))
  plt.clf()

6 changes: 3 additions & 3 deletions metrics/super_glue/record_evaluation.py
@@ -63,7 +63,7 @@ def evaluate(dataset, predictions):
  for qa in passage["qas"]:
      total += 1
      if qa["id"] not in predictions:
-         message = "Unanswered question {} will receive score 0.".format(qa["id"])
+         message = f'Unanswered question {qa["id"]} will receive score 0.'
          print(message, file=sys.stderr)
          continue

@@ -95,7 +95,7 @@ def evaluate(dataset, predictions):
  dataset_json = json.load(data_file)
  if dataset_json["version"] != expected_version:
      print(
-         "Evaluation expects v-{}, but got dataset with v-{}".format(expected_version, dataset_json["version"]),
+         f'Evaluation expects v-{expected_version}, but got dataset with v-{dataset_json["version"]}',
          file=sys.stderr,
      )
  dataset = dataset_json["data"]
@@ -106,6 +106,6 @@ def evaluate(dataset, predictions):
  metrics, correct_ids = evaluate(dataset, predictions)

  if args.output_correct_ids:
-     print("Output {} correctly answered question IDs.".format(len(correct_ids)))
+     print(f"Output {len(correct_ids)} correctly answered question IDs.")
      with open("correct_ids.json", "w") as f:
          json.dump(correct_ids, f)
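A note on the quoting in the lines above, offered as a hedged aside rather than a claim about the project's style rules: before Python 3.12, the expression inside an f-string cannot reuse the quote character that delimits the string itself, so lines whose expressions contain double-quoted keys (such as `qa["id"]`) switch the outer quotes to single quotes. A minimal sketch with a hypothetical record:

record = {"id": "q42"}  # hypothetical example data

# Single-quoted f-string so the double-quoted key inside the expression is legal
# on Python versions before 3.12 (3.12+ also allows reusing the same quote).
message = f'Unanswered question {record["id"]} will receive score 0.'
print(message)  # Unanswered question q42 will receive score 0.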
2 changes: 1 addition & 1 deletion metrics/super_glue/super_glue.py
@@ -125,7 +125,7 @@ def evaluate_multirc(ids_preds, labels):
  """
  question_map = {}
  for id_pred, label in zip(ids_preds, labels):
-     question_id = "{}-{}".format(id_pred["idx"]["paragraph"], id_pred["idx"]["question"])
+     question_id = f'{id_pred["idx"]["paragraph"]}-{id_pred["idx"]["question"]}'
      pred = id_pred["prediction"]
      if question_id in question_map:
          question_map[question_id].append((pred, label))
68 changes: 21 additions & 47 deletions src/datasets/arrow_dataset.py
@@ -621,9 +621,7 @@ def __init__(
  assert self._fingerprint is not None, "Fingerprint can't be None in a Dataset object"
  if self.info.features.type != inferred_features.type:
      raise ValueError(
-         "External features info don't match the dataset:\nGot\n{}\nwith type\n{}\n\nbut expected something like\n{}\nwith type\n{}".format(
-             self.info.features, self.info.features.type, inferred_features, inferred_features.type
-         )
+         f"External features info don't match the dataset:\nGot\n{self.info.features}\nwith type\n{self.info.features.type}\n\nbut expected something like\n{inferred_features}\nwith type\n{inferred_features.type}"
      )

  if self._indices is not None:
@@ -735,9 +733,7 @@ def from_pandas(
  """
  if info is not None and features is not None and info.features != features:
      raise ValueError(
-         "Features specified in `features` and `info.features` can't be different:\n{}\n{}".format(
-             features, info.features
-         )
+         f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}"
      )
  features = features if features is not None else info.features if info is not None else None
  if info is None:
@@ -768,9 +764,7 @@ def from_dict(
  """
  if info is not None and features is not None and info.features != features:
      raise ValueError(
-         "Features specified in `features` and `info.features` can't be different:\n{}\n{}".format(
-             features, info.features
-         )
+         f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}"
      )
  features = features if features is not None else info.features if info is not None else None
  if info is None:
@@ -1008,7 +1002,7 @@ def save_to_disk(self, dataset_path: str, fs=None):
  # Sort only the first level of keys, or we might shuffle fields of nested features if we use sort_keys=True
  sorted_keys_dataset_info = {key: dataset_info[key] for key in sorted(dataset_info)}
  json.dump(sorted_keys_dataset_info, dataset_info_file, indent=2)
- logger.info("Dataset saved in {}".format(dataset_path))
+ logger.info(f"Dataset saved in {dataset_path}")

  @staticmethod
  def load_from_disk(dataset_path: str, fs=None, keep_in_memory: Optional[bool] = None) -> "Dataset":
@@ -1216,9 +1210,7 @@ def flatten_(self, max_depth=16):
      break
  self.info.features = Features.from_arrow_schema(self._data.schema)
  self._data = update_metadata_with_features(self._data, self.features)
- logger.info(
-     "Flattened dataset from depth {} to depth {}.".format(depth, 1 if depth + 1 < max_depth else "unknown")
- )
+ logger.info(f'Flattened dataset from depth {depth} to depth {1 if depth + 1 < max_depth else "unknown"}.')

  @fingerprint_transform(inplace=False)
  def flatten(self, new_fingerprint, max_depth=16) -> "Dataset":
@@ -1237,9 +1229,7 @@ def flatten(self, new_fingerprint, max_depth=16) -> "Dataset":
      break
  dataset.info.features = Features.from_arrow_schema(dataset._data.schema)
  dataset._data = update_metadata_with_features(dataset._data, dataset.features)
- logger.info(
-     "Flattened dataset from depth {} to depth {}.".format(depth, 1 if depth + 1 < max_depth else "unknown")
- )
+ logger.info(f'Flattened dataset from depth {depth} to depth {1 if depth + 1 < max_depth else "unknown"}.')
  dataset._fingerprint = new_fingerprint
  return dataset

@@ -1688,9 +1678,7 @@ def set_format(
      columns = list(columns)
  if columns is not None and any(col not in self._data.column_names for col in columns):
      raise ValueError(
-         "Columns {} not in the dataset. Current columns in the dataset: {}".format(
-             list(filter(lambda col: col not in self._data.column_names, columns)), self._data.column_names
-         )
+         f"Columns {list(filter(lambda col: col not in self._data.column_names, columns))} not in the dataset. Current columns in the dataset: {self._data.column_names}"
      )
  if columns is not None:
      columns = columns.copy() # Ensures modifications made to the list after this call don't cause bugs
@@ -2002,20 +1990,15 @@ def decorated(item, *args, **kwargs):
  for input_column in input_columns:
      if input_column not in self._data.column_names:
          raise ValueError(
-             "Input column {} not in the dataset. Current columns in the dataset: {}".format(
-                 input_column, self._data.column_names
-             )
+             f"Input column {input_column} not in the dataset. Current columns in the dataset: {self._data.column_names}"
          )

  if isinstance(remove_columns, str):
      remove_columns = [remove_columns]

  if remove_columns is not None and any(col not in self._data.column_names for col in remove_columns):
      raise ValueError(
-         "Column to remove {} not in the dataset. Current columns in the dataset: {}".format(
-             list(filter(lambda col: col not in self._data.column_names, remove_columns)),
-             self._data.column_names,
-         )
+         f"Column to remove {list(filter(lambda col: col not in self._data.column_names, remove_columns))} not in the dataset. Current columns in the dataset: {self._data.column_names}"
      )

  load_from_cache_file = load_from_cache_file if load_from_cache_file is not None else is_caching_enabled()
@@ -2057,7 +2040,7 @@ def format_cache_file_name(cache_file_name, rank):
  sep = cache_file_name.rindex(".")
  base_name, extension = cache_file_name[:sep], cache_file_name[sep:]
  cache_file_name = base_name + suffix_template.format(rank=rank, num_proc=num_proc) + extension
- logger.info("Process #{} will write at {}".format(rank, cache_file_name))
+ logger.info(f"Process #{rank} will write at {cache_file_name}")
  return cache_file_name

  prev_env = deepcopy(os.environ)
@@ -2126,7 +2109,7 @@ def catch_non_existent_error(func, kwargs):
  if nb_of_missing_shards > 0:
      with Pool(nb_of_missing_shards, initargs=initargs, initializer=initializer) as pool:
          os.environ = prev_env
-         logger.info("Spawning {} processes".format(num_proc))
+         logger.info(f"Spawning {num_proc} processes")
          results = {
              i: pool.apply_async(self.__class__._map_single, kwds=kwds)
              for i, (kwds, cached_shard) in enumerate(zip(kwds_per_shard, transformed_shards))
@@ -2143,7 +2126,7 @@ def catch_non_existent_error(func, kwargs):
      transformed_shards.count(None) == 0
  ), "All shards have to be defined Datasets, none should still be missing."

- logger.info("Concatenating {} shards".format(num_proc))
+ logger.info(f"Concatenating {num_proc} shards")
  result = concatenate_datasets(transformed_shards)
  if new_fingerprint is not None:
      result._fingerprint = new_fingerprint
@@ -2240,7 +2223,7 @@ def _map_single(
  # current dataset file and the mapping args
  cache_file_name = self._get_cache_file_path(new_fingerprint)
  if os.path.exists(cache_file_name) and load_from_cache_file:
-     logger.warning("Loading cached processed dataset at %s", cache_file_name)
+     logger.warning(f"Loading cached processed dataset at {cache_file_name}")
      info = self.info.copy()
      info.features = features
      info.task_templates = None
@@ -2262,9 +2245,7 @@ def validate_function_output(processed_inputs, indices):
  """Validate output of the map function."""
  if processed_inputs is not None and not isinstance(processed_inputs, (Mapping, pa.Table)):
      raise TypeError(
-         "Provided `function` which is applied to all elements of table returns a variable of type {}. Make sure provided `function` returns a variable of type `dict` (or a pyarrow table) to update the dataset or `None` if you are only interested in side effects.".format(
-             type(processed_inputs)
-         )
+         f"Provided `function` which is applied to all elements of table returns a variable of type {type(processed_inputs)}. Make sure provided `function` returns a variable of type `dict` (or a pyarrow table) to update the dataset or `None` if you are only interested in side effects."
      )
  elif isinstance(indices, list) and isinstance(processed_inputs, Mapping):
      allowed_batch_return_types = (list, np.ndarray)
@@ -2273,9 +2254,7 @@ def validate_function_output(processed_inputs, indices):
      )
      if all_dict_values_are_lists is False:
          raise TypeError(
-             "Provided `function` which is applied to all elements of table returns a `dict` of types {}. When using `batched=True`, make sure provided `function` returns a `dict` of types like `{}`.".format(
-                 [type(x) for x in processed_inputs.values()], allowed_batch_return_types
-             )
+             f"Provided `function` which is applied to all elements of table returns a `dict` of types {[type(x) for x in processed_inputs.values()]}. When using `batched=True`, make sure provided `function` returns a `dict` of types like `{allowed_batch_return_types}`."
          )

  def apply_function_on_filtered_inputs(inputs, indices, check_same_num_examples=False, offset=0):
@@ -2338,7 +2317,7 @@ def init_buffer_and_writer():
  )
  else:
      buf_writer = None
-     logger.info("Caching processed dataset at %s", cache_file_name)
+     logger.info(f"Caching processed dataset at {cache_file_name}")
      tmp_file = tempfile.NamedTemporaryFile("wb", dir=os.path.dirname(cache_file_name), delete=False)
      writer = ArrowWriter(
          features=writer_features,
@@ -2653,7 +2632,7 @@ def select(
  )
  else:
      buf_writer = None
-     logger.info("Caching indices mapping at %s", indices_cache_file_name)
+     logger.info(f"Caching indices mapping at {indices_cache_file_name}")
      tmp_file = tempfile.NamedTemporaryFile("wb", dir=os.path.dirname(indices_cache_file_name), delete=False)
      writer = ArrowWriter(
          path=tmp_file.name, writer_batch_size=writer_batch_size, fingerprint=new_fingerprint, unit="indices"
@@ -2738,10 +2717,7 @@ def sort(
  # Check the column name
  if not isinstance(column, str) or column not in self._data.column_names:
      raise ValueError(
-         "Column '{}' not found in the dataset. Please provide a column selected in: {}".format(
-             column,
-             self._data.column_names,
-         )
+         f"Column '{column}' not found in the dataset. Please provide a column selected in: {self._data.column_names}"
      )

  # Check if we've already cached this computation (indexed by a hash)
@@ -2750,7 +2726,7 @@ def sort(
  # we create a unique hash from the function, current dataset file and the mapping args
  indices_cache_file_name = self._get_cache_file_path(new_fingerprint)
  if os.path.exists(indices_cache_file_name) and load_from_cache_file:
-     logger.warning("Loading cached sorted indices for dataset at %s", indices_cache_file_name)
+     logger.warning(f"Loading cached sorted indices for dataset at {indices_cache_file_name}")
      return self._new_dataset_with_indices(
          fingerprint=new_fingerprint, indices_cache_file_name=indices_cache_file_name
      )
@@ -2833,7 +2809,7 @@ def shuffle(
  # we create a unique hash from the function, current dataset file and the mapping args
  indices_cache_file_name = self._get_cache_file_path(new_fingerprint)
  if os.path.exists(indices_cache_file_name) and load_from_cache_file:
-     logger.warning("Loading cached shuffled indices for dataset at %s", indices_cache_file_name)
+     logger.warning(f"Loading cached shuffled indices for dataset at {indices_cache_file_name}")
      return self._new_dataset_with_indices(
          fingerprint=new_fingerprint, indices_cache_file_name=indices_cache_file_name
      )
@@ -3008,9 +2984,7 @@ def train_test_split(
      and load_from_cache_file
  ):
      logger.warning(
-         "Loading cached split indices for dataset at %s and %s",
-         train_indices_cache_file_name,
-         test_indices_cache_file_name,
+         f"Loading cached split indices for dataset at {train_indices_cache_file_name} and {test_indices_cache_file_name}"
      )
      return DatasetDict(
          {
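One general trade-off worth noting, as a hedged aside rather than a statement about this PR's intent: the printf-style form `logger.warning("... %s", value)` defers string interpolation until the record is actually emitted, whereas an f-string argument is always built eagerly before the call. For cheap values such as file paths the difference is negligible; the sketch below, with made-up names, only contrasts the two calling conventions:

import logging

logger = logging.getLogger("example")   # hypothetical logger
cache_file_name = "/tmp/cache.arrow"    # illustrative path

# Lazy, printf-style: interpolation happens only if the record is emitted.
logger.warning("Loading cached processed dataset at %s", cache_file_name)

# Eager f-string: the message string is built before logger.warning() is called.
logger.warning(f"Loading cached processed dataset at {cache_file_name}")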