f-string formatting #3277

Merged (6 commits) on Nov 17, 2021
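All changes in this PR follow the same pattern: printf-style `%` interpolation and `str.format()` calls are rewritten as f-strings, carrying over any format spec (such as `:f`, `:d`, or `:.2f`) after the expression. As a minimal, hedged sketch with made-up values (not taken from the diff), the three forms below produce identical output:

recall = 0.873  # illustrative value only

old_percent = "Recall: %.2f" % (recall * 100)        # printf-style formatting
old_format = "Recall: {:.2f}".format(recall * 100)   # str.format()
new_fstring = f"Recall: {recall * 100:.2f}"          # f-string, the style adopted in this PR

assert old_percent == old_format == new_fstring == "Recall: 87.30"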
6 changes: 3 additions & 3 deletions benchmarks/format.py
@@ -24,12 +24,12 @@ def format_json_to_md(input_json_file, output_md_file):
  old_val = metric_vals.get("old", None)
  dif_val = metric_vals.get("diff", None)

- val_str = " {:f}".format(new_val) if isinstance(new_val, (int, float)) else "None"
+ val_str = f" {new_val:f}" if isinstance(new_val, (int, float)) else "None"

  if old_val is not None:
-     val_str += " / {:f}".format(old_val) if isinstance(old_val, (int, float)) else "None"
+     val_str += f" / {old_val:f}" if isinstance(old_val, (int, float)) else "None"
  if dif_val is not None:
-     val_str += " ({:f})".format(dif_val) if isinstance(dif_val, (int, float)) else "None"
+     val_str += f" ({dif_val:f})" if isinstance(dif_val, (int, float)) else "None"

  title += " " + metric_name + " |"
  lines += "---|"
16 changes: 8 additions & 8 deletions metrics/coval/coval.py
@@ -210,17 +210,17 @@ def get_coref_infos(
  if remove_nested:
      logger.info(
          "Number of removed nested coreferring mentions in the key "
-         "annotation: %s; and system annotation: %s" % (key_nested_coref_num, sys_nested_coref_num)
+         f"annotation: {key_nested_coref_num}; and system annotation: {sys_nested_coref_num}"
      )
      logger.info(
          "Number of resulting singleton clusters in the key "
-         "annotation: %s; and system annotation: %s" % (key_removed_nested_clusters, sys_removed_nested_clusters)
+         f"annotation: {key_removed_nested_clusters}; and system annotation: {sys_removed_nested_clusters}"
      )

  if not keep_singletons:
      logger.info(
-         "%d and %d singletons are removed from the key and system "
-         "files, respectively" % (key_singletons_num, sys_singletons_num)
+         f"{key_singletons_num:d} and {sys_singletons_num:d} singletons are removed from the key and system "
+         "files, respectively"
      )

  return doc_coref_infos
@@ -242,14 +242,14 @@ def evaluate(key_lines, sys_lines, metrics, NP_only, remove_nested, keep_singlet

  logger.info(
      name.ljust(10),
-     "Recall: %.2f" % (recall * 100),
-     " Precision: %.2f" % (precision * 100),
-     " F1: %.2f" % (f1 * 100),
+     f"Recall: {recall * 100:.2f}",
+     f" Precision: {precision * 100:.2f}",
+     f" F1: {f1 * 100:.2f}",
  )

  if conll_subparts_num == 3:
      conll = (conll / 3) * 100
-     logger.info("CoNLL score: %.2f" % conll)
+     logger.info(f"CoNLL score: {conll:.2f}")
      output_scores.update({"conll_score": conll})

  return output_scores
8 changes: 4 additions & 4 deletions metrics/squad_v2/evaluate.py
@@ -113,7 +113,7 @@ def get_raw_scores(dataset, preds):
      # For unanswerable questions, only correct answer is empty string
      gold_answers = [""]
  if qid not in preds:
-     print("Missing prediction for %s" % qid)
+     print(f"Missing prediction for {qid}")
      continue
  a_pred = preds[qid]
  # Take max over all gold answers
@@ -156,7 +156,7 @@ def make_eval_dict(exact_scores, f1_scores, qid_list=None):

  def merge_eval(main_eval, new_eval, prefix):
      for k in new_eval:
-         main_eval["%s_%s" % (prefix, k)] = new_eval[k]
+         main_eval[f"{prefix}_{k}"] = new_eval[k]


  def plot_pr_curve(precisions, recalls, out_image, title):
@@ -238,8 +238,8 @@ def histogram_na_prob(na_probs, qid_list, image_dir, name):
  plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0))
  plt.xlabel("Model probability of no-answer")
  plt.ylabel("Proportion of dataset")
- plt.title("Histogram of no-answer probability: %s" % name)
- plt.savefig(os.path.join(image_dir, "na_prob_hist_%s.png" % name))
+ plt.title(f"Histogram of no-answer probability: {name}")
+ plt.savefig(os.path.join(image_dir, f"na_prob_hist_{name}.png"))
  plt.clf()

6 changes: 3 additions & 3 deletions metrics/super_glue/record_evaluation.py
@@ -63,7 +63,7 @@ def evaluate(dataset, predictions):
  for qa in passage["qas"]:
      total += 1
      if qa["id"] not in predictions:
-         message = "Unanswered question {} will receive score 0.".format(qa["id"])
+         message = f'Unanswered question {qa["id"]} will receive score 0.'
          print(message, file=sys.stderr)
          continue

@@ -95,7 +95,7 @@ def evaluate(dataset, predictions):
  dataset_json = json.load(data_file)
  if dataset_json["version"] != expected_version:
      print(
-         "Evaluation expects v-{}, but got dataset with v-{}".format(expected_version, dataset_json["version"]),
+         f'Evaluation expects v-{expected_version}, but got dataset with v-{dataset_json["version"]}',
          file=sys.stderr,
      )
  dataset = dataset_json["data"]
@@ -106,6 +106,6 @@ def evaluate(dataset, predictions):
  metrics, correct_ids = evaluate(dataset, predictions)

  if args.output_correct_ids:
-     print("Output {} correctly answered question IDs.".format(len(correct_ids)))
+     print(f"Output {len(correct_ids)} correctly answered question IDs.")
      with open("correct_ids.json", "w") as f:
          json.dump(correct_ids, f)
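A note on the quoting in the lines above, offered as a hedged aside rather than a claim about the project's style rules: before Python 3.12, the expression inside an f-string cannot reuse the quote character that delimits the string itself, so lines whose expressions contain double-quoted keys (such as `qa["id"]`) switch the outer quotes to single quotes. A minimal sketch with a hypothetical record:

record = {"id": "q42"}  # hypothetical example data

# Single-quoted f-string so the double-quoted key inside the expression is legal
# on Python versions before 3.12 (3.12+ also allows reusing the same quote).
message = f'Unanswered question {record["id"]} will receive score 0.'
print(message)  # Unanswered question q42 will receive score 0.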
2 changes: 1 addition & 1 deletion metrics/super_glue/super_glue.py
@@ -125,7 +125,7 @@ def evaluate_multirc(ids_preds, labels):
  """
  question_map = {}
  for id_pred, label in zip(ids_preds, labels):
-     question_id = "{}-{}".format(id_pred["idx"]["paragraph"], id_pred["idx"]["question"])
+     question_id = f'{id_pred["idx"]["paragraph"]}-{id_pred["idx"]["question"]}'
      pred = id_pred["prediction"]
      if question_id in question_map:
          question_map[question_id].append((pred, label))
68 changes: 21 additions & 47 deletions src/datasets/arrow_dataset.py
@@ -621,9 +621,7 @@ def __init__(
  assert self._fingerprint is not None, "Fingerprint can't be None in a Dataset object"
  if self.info.features.type != inferred_features.type:
      raise ValueError(
-         "External features info don't match the dataset:\nGot\n{}\nwith type\n{}\n\nbut expected something like\n{}\nwith type\n{}".format(
-             self.info.features, self.info.features.type, inferred_features, inferred_features.type
-         )
+         f"External features info don't match the dataset:\nGot\n{self.info.features}\nwith type\n{self.info.features.type}\n\nbut expected something like\n{inferred_features}\nwith type\n{inferred_features.type}"
      )

  if self._indices is not None:
@@ -735,9 +733,7 @@ def from_pandas(
  """
  if info is not None and features is not None and info.features != features:
      raise ValueError(
-         "Features specified in `features` and `info.features` can't be different:\n{}\n{}".format(
-             features, info.features
-         )
+         f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}"
      )
  features = features if features is not None else info.features if info is not None else None
  if info is None:
@@ -768,9 +764,7 @@ def from_dict(
  """
  if info is not None and features is not None and info.features != features:
      raise ValueError(
-         "Features specified in `features` and `info.features` can't be different:\n{}\n{}".format(
-             features, info.features
-         )
+         f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}"
      )
  features = features if features is not None else info.features if info is not None else None
  if info is None:
@@ -1008,7 +1002,7 @@ def save_to_disk(self, dataset_path: str, fs=None):
  # Sort only the first level of keys, or we might shuffle fields of nested features if we use sort_keys=True
  sorted_keys_dataset_info = {key: dataset_info[key] for key in sorted(dataset_info)}
  json.dump(sorted_keys_dataset_info, dataset_info_file, indent=2)
- logger.info("Dataset saved in {}".format(dataset_path))
+ logger.info(f"Dataset saved in {dataset_path}")

  @staticmethod
  def load_from_disk(dataset_path: str, fs=None, keep_in_memory: Optional[bool] = None) -> "Dataset":
@@ -1216,9 +1210,7 @@ def flatten_(self, max_depth=16):
      break
  self.info.features = Features.from_arrow_schema(self._data.schema)
  self._data = update_metadata_with_features(self._data, self.features)
- logger.info(
-     "Flattened dataset from depth {} to depth {}.".format(depth, 1 if depth + 1 < max_depth else "unknown")
- )
+ logger.info(f'Flattened dataset from depth {depth} to depth {1 if depth + 1 < max_depth else "unknown"}.')

  @fingerprint_transform(inplace=False)
  def flatten(self, new_fingerprint, max_depth=16) -> "Dataset":
@@ -1237,9 +1229,7 @@ def flatten(self, new_fingerprint, max_depth=16) -> "Dataset":
      break
  dataset.info.features = Features.from_arrow_schema(dataset._data.schema)
  dataset._data = update_metadata_with_features(dataset._data, dataset.features)
- logger.info(
-     "Flattened dataset from depth {} to depth {}.".format(depth, 1 if depth + 1 < max_depth else "unknown")
- )
+ logger.info(f'Flattened dataset from depth {depth} to depth {1 if depth + 1 < max_depth else "unknown"}.')
  dataset._fingerprint = new_fingerprint
  return dataset

@@ -1688,9 +1678,7 @@ def set_format(
      columns = list(columns)
  if columns is not None and any(col not in self._data.column_names for col in columns):
      raise ValueError(
-         "Columns {} not in the dataset. Current columns in the dataset: {}".format(
-             list(filter(lambda col: col not in self._data.column_names, columns)), self._data.column_names
-         )
+         f"Columns {list(filter(lambda col: col not in self._data.column_names, columns))} not in the dataset. Current columns in the dataset: {self._data.column_names}"
      )
  if columns is not None:
      columns = columns.copy() # Ensures modifications made to the list after this call don't cause bugs
@@ -2002,20 +1990,15 @@ def decorated(item, *args, **kwargs):
  for input_column in input_columns:
      if input_column not in self._data.column_names:
          raise ValueError(
-             "Input column {} not in the dataset. Current columns in the dataset: {}".format(
-                 input_column, self._data.column_names
-             )
+             f"Input column {input_column} not in the dataset. Current columns in the dataset: {self._data.column_names}"
          )

  if isinstance(remove_columns, str):
      remove_columns = [remove_columns]

  if remove_columns is not None and any(col not in self._data.column_names for col in remove_columns):
      raise ValueError(
-         "Column to remove {} not in the dataset. Current columns in the dataset: {}".format(
-             list(filter(lambda col: col not in self._data.column_names, remove_columns)),
-             self._data.column_names,
-         )
+         f"Column to remove {list(filter(lambda col: col not in self._data.column_names, remove_columns))} not in the dataset. Current columns in the dataset: {self._data.column_names}"
      )

  load_from_cache_file = load_from_cache_file if load_from_cache_file is not None else is_caching_enabled()
@@ -2057,7 +2040,7 @@ def format_cache_file_name(cache_file_name, rank):
  sep = cache_file_name.rindex(".")
  base_name, extension = cache_file_name[:sep], cache_file_name[sep:]
  cache_file_name = base_name + suffix_template.format(rank=rank, num_proc=num_proc) + extension
- logger.info("Process #{} will write at {}".format(rank, cache_file_name))
+ logger.info(f"Process #{rank} will write at {cache_file_name}")
  return cache_file_name

  prev_env = deepcopy(os.environ)
@@ -2126,7 +2109,7 @@ def catch_non_existent_error(func, kwargs):
  if nb_of_missing_shards > 0:
      with Pool(nb_of_missing_shards, initargs=initargs, initializer=initializer) as pool:
          os.environ = prev_env
-         logger.info("Spawning {} processes".format(num_proc))
+         logger.info(f"Spawning {num_proc} processes")
          results = {
              i: pool.apply_async(self.__class__._map_single, kwds=kwds)
              for i, (kwds, cached_shard) in enumerate(zip(kwds_per_shard, transformed_shards))
@@ -2143,7 +2126,7 @@ def catch_non_existent_error(func, kwargs):
      transformed_shards.count(None) == 0
  ), "All shards have to be defined Datasets, none should still be missing."

- logger.info("Concatenating {} shards".format(num_proc))
+ logger.info(f"Concatenating {num_proc} shards")
  result = concatenate_datasets(transformed_shards)
  if new_fingerprint is not None:
      result._fingerprint = new_fingerprint
@@ -2240,7 +2223,7 @@ def _map_single(
  # current dataset file and the mapping args
  cache_file_name = self._get_cache_file_path(new_fingerprint)
  if os.path.exists(cache_file_name) and load_from_cache_file:
-     logger.warning("Loading cached processed dataset at %s", cache_file_name)
+     logger.warning(f"Loading cached processed dataset at {cache_file_name}")
      info = self.info.copy()
      info.features = features
      info.task_templates = None
@@ -2262,9 +2245,7 @@ def validate_function_output(processed_inputs, indices):
  """Validate output of the map function."""
  if processed_inputs is not None and not isinstance(processed_inputs, (Mapping, pa.Table)):
      raise TypeError(
-         "Provided `function` which is applied to all elements of table returns a variable of type {}. Make sure provided `function` returns a variable of type `dict` (or a pyarrow table) to update the dataset or `None` if you are only interested in side effects.".format(
-             type(processed_inputs)
-         )
+         f"Provided `function` which is applied to all elements of table returns a variable of type {type(processed_inputs)}. Make sure provided `function` returns a variable of type `dict` (or a pyarrow table) to update the dataset or `None` if you are only interested in side effects."
      )
  elif isinstance(indices, list) and isinstance(processed_inputs, Mapping):
      allowed_batch_return_types = (list, np.ndarray)
@@ -2273,9 +2254,7 @@ def validate_function_output(processed_inputs, indices):
      )
      if all_dict_values_are_lists is False:
          raise TypeError(
-             "Provided `function` which is applied to all elements of table returns a `dict` of types {}. When using `batched=True`, make sure provided `function` returns a `dict` of types like `{}`.".format(
-                 [type(x) for x in processed_inputs.values()], allowed_batch_return_types
-             )
+             f"Provided `function` which is applied to all elements of table returns a `dict` of types {[type(x) for x in processed_inputs.values()]}. When using `batched=True`, make sure provided `function` returns a `dict` of types like `{allowed_batch_return_types}`."
          )

  def apply_function_on_filtered_inputs(inputs, indices, check_same_num_examples=False, offset=0):
@@ -2338,7 +2317,7 @@ def init_buffer_and_writer():
  )
  else:
      buf_writer = None
-     logger.info("Caching processed dataset at %s", cache_file_name)
+     logger.info(f"Caching processed dataset at {cache_file_name}")
      tmp_file = tempfile.NamedTemporaryFile("wb", dir=os.path.dirname(cache_file_name), delete=False)
      writer = ArrowWriter(
          features=writer_features,
@@ -2653,7 +2632,7 @@ def select(
  )
  else:
      buf_writer = None
-     logger.info("Caching indices mapping at %s", indices_cache_file_name)
+     logger.info(f"Caching indices mapping at {indices_cache_file_name}")
      tmp_file = tempfile.NamedTemporaryFile("wb", dir=os.path.dirname(indices_cache_file_name), delete=False)
      writer = ArrowWriter(
          path=tmp_file.name, writer_batch_size=writer_batch_size, fingerprint=new_fingerprint, unit="indices"
@@ -2738,10 +2717,7 @@ def sort(
  # Check the column name
  if not isinstance(column, str) or column not in self._data.column_names:
      raise ValueError(
-         "Column '{}' not found in the dataset. Please provide a column selected in: {}".format(
-             column,
-             self._data.column_names,
-         )
+         f"Column '{column}' not found in the dataset. Please provide a column selected in: {self._data.column_names}"
      )

  # Check if we've already cached this computation (indexed by a hash)
@@ -2750,7 +2726,7 @@ def sort(
  # we create a unique hash from the function, current dataset file and the mapping args
  indices_cache_file_name = self._get_cache_file_path(new_fingerprint)
  if os.path.exists(indices_cache_file_name) and load_from_cache_file:
-     logger.warning("Loading cached sorted indices for dataset at %s", indices_cache_file_name)
+     logger.warning(f"Loading cached sorted indices for dataset at {indices_cache_file_name}")
      return self._new_dataset_with_indices(
          fingerprint=new_fingerprint, indices_cache_file_name=indices_cache_file_name
      )
@@ -2833,7 +2809,7 @@ def shuffle(
  # we create a unique hash from the function, current dataset file and the mapping args
  indices_cache_file_name = self._get_cache_file_path(new_fingerprint)
  if os.path.exists(indices_cache_file_name) and load_from_cache_file:
-     logger.warning("Loading cached shuffled indices for dataset at %s", indices_cache_file_name)
+     logger.warning(f"Loading cached shuffled indices for dataset at {indices_cache_file_name}")
      return self._new_dataset_with_indices(
          fingerprint=new_fingerprint, indices_cache_file_name=indices_cache_file_name
      )
@@ -3008,9 +2984,7 @@ def train_test_split(
      and load_from_cache_file
  ):
      logger.warning(
-         "Loading cached split indices for dataset at %s and %s",
-         train_indices_cache_file_name,
-         test_indices_cache_file_name,
+         f"Loading cached split indices for dataset at {train_indices_cache_file_name} and {test_indices_cache_file_name}"
      )
      return DatasetDict(
          {
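One general trade-off worth noting, as a hedged aside rather than a statement about this PR's intent: the printf-style form `logger.warning("... %s", value)` defers string interpolation until the record is actually emitted, whereas an f-string argument is always built eagerly before the call. For cheap values such as file paths the difference is negligible; the sketch below, with made-up names, only contrasts the two calling conventions:

import logging

logger = logging.getLogger("example")   # hypothetical logger
cache_file_name = "/tmp/cache.arrow"    # illustrative path

# Lazy, printf-style: interpolation happens only if the record is emitted.
logger.warning("Loading cached processed dataset at %s", cache_file_name)

# Eager f-string: the message string is built before logger.warning() is called.
logger.warning(f"Loading cached processed dataset at {cache_file_name}")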