diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py
index 5ef7324d743af..4ed7bd1bd5061 100755
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -337,14 +337,15 @@ def main():
     def tokenize_function(examples):
         return tokenizer(examples[text_column_name])
 
-    tokenized_datasets = raw_datasets.map(
-        tokenize_function,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on dataset",
-    )
+    with accelerator.main_process_first():
+        tokenized_datasets = raw_datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on dataset",
+        )
 
     if args.block_size is None:
         block_size = tokenizer.model_max_length
@@ -386,13 +387,14 @@ def group_texts(examples):
     # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
     # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
 
-    lm_datasets = tokenized_datasets.map(
-        group_texts,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        load_from_cache_file=not args.overwrite_cache,
-        desc=f"Grouping texts in chunks of {block_size}",
-    )
+    with accelerator.main_process_first():
+        lm_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            load_from_cache_file=not args.overwrite_cache,
+            desc=f"Grouping texts in chunks of {block_size}",
+        )
 
     train_dataset = lm_datasets["train"]
     eval_dataset = lm_datasets["validation"]
diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
index 14a140d666bab..3e2241495b71e 100755
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -374,14 +374,15 @@ def tokenize_function(examples):
                 return_special_tokens_mask=True,
             )
 
-        tokenized_datasets = raw_datasets.map(
-            tokenize_function,
-            batched=True,
-            num_proc=args.preprocessing_num_workers,
-            remove_columns=[text_column_name],
-            load_from_cache_file=not args.overwrite_cache,
-            desc="Running tokenizer on dataset line_by_line",
-        )
+        with accelerator.main_process_first():
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
+                num_proc=args.preprocessing_num_workers,
+                remove_columns=[text_column_name],
+                load_from_cache_file=not args.overwrite_cache,
+                desc="Running tokenizer on dataset line_by_line",
+            )
     else:
         # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
         # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
@@ -389,14 +390,15 @@ def tokenize_function(examples):
         def tokenize_function(examples):
             return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
 
-        tokenized_datasets = raw_datasets.map(
-            tokenize_function,
-            batched=True,
-            num_proc=args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not args.overwrite_cache,
-            desc="Running tokenizer on every text in dataset",
-        )
+        with accelerator.main_process_first():
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
+                num_proc=args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not args.overwrite_cache,
+                desc="Running tokenizer on every text in dataset",
+            )
 
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of
     # max_seq_length.
@@ -422,13 +424,14 @@ def group_texts(examples):
     # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
     # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
 
-    tokenized_datasets = tokenized_datasets.map(
-        group_texts,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        load_from_cache_file=not args.overwrite_cache,
-        desc=f"Grouping texts in chunks of {max_seq_length}",
-    )
+    with accelerator.main_process_first():
+        tokenized_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            load_from_cache_file=not args.overwrite_cache,
+            desc=f"Grouping texts in chunks of {max_seq_length}",
+        )
 
     train_dataset = tokenized_datasets["train"]
     eval_dataset = tokenized_datasets["validation"]
diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
index afb8ef25f50bc..de8094ee35185 100755
--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@@ -381,9 +381,10 @@ def preprocess_function(examples):
         tokenized_inputs["labels"] = labels
         return tokenized_inputs
 
-    processed_datasets = raw_datasets.map(
-        preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
-    )
+    with accelerator.main_process_first():
+        processed_datasets = raw_datasets.map(
+            preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
+        )
 
     train_dataset = processed_datasets["train"]
     eval_dataset = processed_datasets["validation"]
diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
index 17c324736b066..2210b01148286 100644
--- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
@@ -440,14 +440,15 @@ def prepare_train_features(examples):
         # We will select sample from whole data if agument is specified
         train_dataset = train_dataset.select(range(args.max_train_samples))
     # Create train feature from dataset
-    train_dataset = train_dataset.map(
-        prepare_train_features,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on train dataset",
-    )
+    with accelerator.main_process_first():
+        train_dataset = train_dataset.map(
+            prepare_train_features,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on train dataset",
+        )
     if args.max_train_samples is not None:
         # Number of samples might increase during Feature Creation, We select only specified max samples
         train_dataset = train_dataset.select(range(args.max_train_samples))
@@ -530,14 +531,15 @@ def prepare_validation_features(examples):
         # We will select sample from whole data
         eval_examples = eval_examples.select(range(args.max_eval_samples))
     # Validation Feature Creation
-    eval_dataset = eval_examples.map(
-        prepare_validation_features,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on validation dataset",
-    )
+    with accelerator.main_process_first():
+        eval_dataset = eval_examples.map(
+            prepare_validation_features,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on validation dataset",
+        )
 
     if args.max_eval_samples is not None:
         # During Feature creation dataset samples might increase, we will select required samples again
@@ -551,17 +553,18 @@ def prepare_validation_features(examples):
         # We will select sample from whole data
         predict_examples = predict_examples.select(range(args.max_predict_samples))
     # Predict Feature Creation
-    predict_dataset = predict_examples.map(
-        prepare_validation_features,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on prediction dataset",
-    )
-    if args.max_predict_samples is not None:
-        # During Feature creation dataset samples might increase, we will select required samples again
-        predict_dataset = predict_dataset.select(range(args.max_predict_samples))
+    with accelerator.main_process_first():
+        predict_dataset = predict_examples.map(
+            prepare_validation_features,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on prediction dataset",
+        )
+        if args.max_predict_samples is not None:
+            # During Feature creation dataset samples might increase, we will select required samples again
+            predict_dataset = predict_dataset.select(range(args.max_predict_samples))
 
     # Log a few random samples from the training set:
     for index in random.sample(range(len(train_dataset)), 3):
diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py
index 920001ae087b5..81b119c058f34 100755
--- a/examples/pytorch/question-answering/run_qa_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_no_trainer.py
@@ -468,18 +468,20 @@ def prepare_train_features(examples):
     if args.max_train_samples is not None:
         # We will select sample from whole data if agument is specified
        train_dataset = train_dataset.select(range(args.max_train_samples))
+
     # Create train feature from dataset
-    train_dataset = train_dataset.map(
-        prepare_train_features,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on train dataset",
-    )
-    if args.max_train_samples is not None:
-        # Number of samples might increase during Feature Creation, We select only specified max samples
-        train_dataset = train_dataset.select(range(args.max_train_samples))
+    with accelerator.main_process_first():
+        train_dataset = train_dataset.map(
+            prepare_train_features,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on train dataset",
+        )
+        if args.max_train_samples is not None:
+            # Number of samples might increase during Feature Creation, We select only specified max samples
+            train_dataset = train_dataset.select(range(args.max_train_samples))
 
     # Validation preprocessing
     def prepare_validation_features(examples):
@@ -535,14 +537,15 @@ def prepare_validation_features(examples):
         # We will select sample from whole data
         eval_examples = eval_examples.select(range(args.max_eval_samples))
     # Validation Feature Creation
-    eval_dataset = eval_examples.map(
-        prepare_validation_features,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on validation dataset",
-    )
+    with accelerator.main_process_first():
+        eval_dataset = eval_examples.map(
+            prepare_validation_features,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on validation dataset",
+        )
 
     if args.max_eval_samples is not None:
         # During Feature creation dataset samples might increase, we will select required samples again
@@ -556,17 +559,18 @@ def prepare_validation_features(examples):
         # We will select sample from whole data
         predict_examples = predict_examples.select(range(args.max_predict_samples))
     # Predict Feature Creation
-    predict_dataset = predict_examples.map(
-        prepare_validation_features,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on prediction dataset",
-    )
-    if args.max_predict_samples is not None:
-        # During Feature creation dataset samples might increase, we will select required samples again
-        predict_dataset = predict_dataset.select(range(args.max_predict_samples))
+    with accelerator.main_process_first():
+        predict_dataset = predict_examples.map(
+            prepare_validation_features,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on prediction dataset",
+        )
+        if args.max_predict_samples is not None:
+            # During Feature creation dataset samples might increase, we will select required samples again
+            predict_dataset = predict_dataset.select(range(args.max_predict_samples))
 
     # Log a few random samples from the training set:
     for index in random.sample(range(len(train_dataset)), 3):
diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py
index 08a6292521032..e9f08afc9560f 100644
--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@@ -439,13 +439,14 @@ def preprocess_function(examples):
         model_inputs["labels"] = labels["input_ids"]
         return model_inputs
 
-    processed_datasets = raw_datasets.map(
-        preprocess_function,
-        batched=True,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on dataset",
-    )
+    with accelerator.main_process_first():
+        processed_datasets = raw_datasets.map(
+            preprocess_function,
+            batched=True,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on dataset",
+        )
 
     train_dataset = processed_datasets["train"]
     eval_dataset = processed_datasets["validation"]
diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py
index fe9943b266a4f..ba2482e58a223 100644
--- a/examples/pytorch/text-classification/run_glue_no_trainer.py
+++ b/examples/pytorch/text-classification/run_glue_no_trainer.py
@@ -330,12 +330,13 @@ def preprocess_function(examples):
             result["labels"] = examples["label"]
         return result
 
-    processed_datasets = raw_datasets.map(
-        preprocess_function,
-        batched=True,
-        remove_columns=raw_datasets["train"].column_names,
-        desc="Running tokenizer on dataset",
-    )
+    with accelerator.main_process_first():
+        processed_datasets = raw_datasets.map(
+            preprocess_function,
+            batched=True,
+            remove_columns=raw_datasets["train"].column_names,
+            desc="Running tokenizer on dataset",
+        )
 
     train_dataset = processed_datasets["train"]
     eval_dataset = processed_datasets["validation_matched" if args.task_name == "mnli" else "validation"]
diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py
index b24f65ee987aa..756bdff55b5d7 100755
--- a/examples/pytorch/token-classification/run_ner_no_trainer.py
+++ b/examples/pytorch/token-classification/run_ner_no_trainer.py
@@ -403,12 +403,13 @@ def tokenize_and_align_labels(examples):
         tokenized_inputs["labels"] = labels
         return tokenized_inputs
 
-    processed_raw_datasets = raw_datasets.map(
-        tokenize_and_align_labels,
-        batched=True,
-        remove_columns=raw_datasets["train"].column_names,
-        desc="Running tokenizer on dataset",
-    )
+    with accelerator.main_process_first():
+        processed_raw_datasets = raw_datasets.map(
+            tokenize_and_align_labels,
+            batched=True,
+            remove_columns=raw_datasets["train"].column_names,
+            desc="Running tokenizer on dataset",
+        )
 
     train_dataset = processed_raw_datasets["train"]
     eval_dataset = processed_raw_datasets["validation"]
diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py
index d18b5cae8a361..4b5c0251314fa 100644
--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@@ -418,14 +418,15 @@ def preprocess_function(examples):
         model_inputs["labels"] = labels["input_ids"]
         return model_inputs
 
-    processed_datasets = raw_datasets.map(
-        preprocess_function,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on dataset",
-    )
+    with accelerator.main_process_first():
+        processed_datasets = raw_datasets.map(
+            preprocess_function,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on dataset",
+        )
 
     train_dataset = processed_datasets["train"]
     eval_dataset = processed_datasets["validation"]
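
For context, here is a minimal, standalone sketch (not part of the diff above) of the pattern every hunk applies: wrapping dataset preprocessing in `accelerator.main_process_first()` so that, under a distributed `accelerate launch`, the main process runs `datasets.map()` first and the other ranks wait and then reuse the cached result instead of redoing the tokenization. The dataset and tokenizer names are illustrative placeholders, and `accelerate`, `datasets`, and `transformers` are assumed to be installed.

# Minimal sketch of the main_process_first pattern (illustrative names, not taken from the PR).
from accelerate import Accelerator
from datasets import load_dataset
from transformers import AutoTokenizer

accelerator = Accelerator()
raw_datasets = load_dataset("wikitext", "wikitext-2-raw-v1")  # placeholder dataset
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer


def tokenize_function(examples):
    return tokenizer(examples["text"])


# The main process enters the context and runs the map first; the remaining ranks
# wait at the context manager and, once released, have `.map()` load the tokenized
# dataset from the cache the main process already wrote, rather than re-tokenizing.
with accelerator.main_process_first():
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"],
        desc="Running tokenizer on dataset",
    )

The same idea motivates the changes above: in multi-process runs, preprocessing is done once by the main process and shared through the datasets cache, rather than repeated (and possibly raced on) by every rank.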