Skip to content

Commit

Permalink
map only on one process (#13810)
Browse files Browse the repository at this point in the history
  • Loading branch information
patrickvonplaten committed Sep 30, 2021
1 parent 9a9805f commit 44eb8bd
Show file tree
Hide file tree
Showing 9 changed files with 142 additions and 125 deletions.
32 changes: 17 additions & 15 deletions examples/pytorch/language-modeling/run_clm_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,14 +337,15 @@ def main():
def tokenize_function(examples):
return tokenizer(examples[text_column_name])

tokenized_datasets = raw_datasets.map(
tokenize_function,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on dataset",
)
with accelerator.main_process_first():
tokenized_datasets = raw_datasets.map(
tokenize_function,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on dataset",
)

if args.block_size is None:
block_size = tokenizer.model_max_length
Expand Down Expand Up @@ -386,13 +387,14 @@ def group_texts(examples):
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

lm_datasets = tokenized_datasets.map(
group_texts,
batched=True,
num_proc=args.preprocessing_num_workers,
load_from_cache_file=not args.overwrite_cache,
desc=f"Grouping texts in chunks of {block_size}",
)
with accelerator.main_process_first():
lm_datasets = tokenized_datasets.map(
group_texts,
batched=True,
num_proc=args.preprocessing_num_workers,
load_from_cache_file=not args.overwrite_cache,
desc=f"Grouping texts in chunks of {block_size}",
)

train_dataset = lm_datasets["train"]
eval_dataset = lm_datasets["validation"]
Expand Down
49 changes: 26 additions & 23 deletions examples/pytorch/language-modeling/run_mlm_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,29 +374,31 @@ def tokenize_function(examples):
return_special_tokens_mask=True,
)

tokenized_datasets = raw_datasets.map(
tokenize_function,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=[text_column_name],
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on dataset line_by_line",
)
with accelerator.main_process_first():
tokenized_datasets = raw_datasets.map(
tokenize_function,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=[text_column_name],
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on dataset line_by_line",
)
else:
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
# We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
# efficient when it receives the `special_tokens_mask`.
def tokenize_function(examples):
return tokenizer(examples[text_column_name], return_special_tokens_mask=True)

tokenized_datasets = raw_datasets.map(
tokenize_function,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on every text in dataset",
)
with accelerator.main_process_first():
tokenized_datasets = raw_datasets.map(
tokenize_function,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on every text in dataset",
)

# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
Expand All @@ -422,13 +424,14 @@ def group_texts(examples):
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

tokenized_datasets = tokenized_datasets.map(
group_texts,
batched=True,
num_proc=args.preprocessing_num_workers,
load_from_cache_file=not args.overwrite_cache,
desc=f"Grouping texts in chunks of {max_seq_length}",
)
with accelerator.main_process_first():
tokenized_datasets = tokenized_datasets.map(
group_texts,
batched=True,
num_proc=args.preprocessing_num_workers,
load_from_cache_file=not args.overwrite_cache,
desc=f"Grouping texts in chunks of {max_seq_length}",
)

train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]
Expand Down
7 changes: 4 additions & 3 deletions examples/pytorch/multiple-choice/run_swag_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,9 +381,10 @@ def preprocess_function(examples):
tokenized_inputs["labels"] = labels
return tokenized_inputs

processed_datasets = raw_datasets.map(
preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
)
with accelerator.main_process_first():
processed_datasets = raw_datasets.map(
preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
)

train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -440,14 +440,15 @@ def prepare_train_features(examples):
# We will select sample from whole data if agument is specified
train_dataset = train_dataset.select(range(args.max_train_samples))
# Create train feature from dataset
train_dataset = train_dataset.map(
prepare_train_features,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on train dataset",
)
with accelerator.main_process_first():
train_dataset = train_dataset.map(
prepare_train_features,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on train dataset",
)
if args.max_train_samples is not None:
# Number of samples might increase during Feature Creation, We select only specified max samples
train_dataset = train_dataset.select(range(args.max_train_samples))
Expand Down Expand Up @@ -530,14 +531,15 @@ def prepare_validation_features(examples):
# We will select sample from whole data
eval_examples = eval_examples.select(range(args.max_eval_samples))
# Validation Feature Creation
eval_dataset = eval_examples.map(
prepare_validation_features,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on validation dataset",
)
with accelerator.main_process_first():
eval_dataset = eval_examples.map(
prepare_validation_features,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on validation dataset",
)

if args.max_eval_samples is not None:
# During Feature creation dataset samples might increase, we will select required samples again
Expand All @@ -551,17 +553,18 @@ def prepare_validation_features(examples):
# We will select sample from whole data
predict_examples = predict_examples.select(range(args.max_predict_samples))
# Predict Feature Creation
predict_dataset = predict_examples.map(
prepare_validation_features,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on prediction dataset",
)
if args.max_predict_samples is not None:
# During Feature creation dataset samples might increase, we will select required samples again
predict_dataset = predict_dataset.select(range(args.max_predict_samples))
with accelerator.main_process_first():
predict_dataset = predict_examples.map(
prepare_validation_features,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on prediction dataset",
)
if args.max_predict_samples is not None:
# During Feature creation dataset samples might increase, we will select required samples again
predict_dataset = predict_dataset.select(range(args.max_predict_samples))

# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
Expand Down
64 changes: 34 additions & 30 deletions examples/pytorch/question-answering/run_qa_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,18 +468,20 @@ def prepare_train_features(examples):
if args.max_train_samples is not None:
# We will select sample from whole data if agument is specified
train_dataset = train_dataset.select(range(args.max_train_samples))

# Create train feature from dataset
train_dataset = train_dataset.map(
prepare_train_features,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on train dataset",
)
if args.max_train_samples is not None:
# Number of samples might increase during Feature Creation, We select only specified max samples
train_dataset = train_dataset.select(range(args.max_train_samples))
with accelerator.main_process_first():
train_dataset = train_dataset.map(
prepare_train_features,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on train dataset",
)
if args.max_train_samples is not None:
# Number of samples might increase during Feature Creation, We select only specified max samples
train_dataset = train_dataset.select(range(args.max_train_samples))

# Validation preprocessing
def prepare_validation_features(examples):
Expand Down Expand Up @@ -535,14 +537,15 @@ def prepare_validation_features(examples):
# We will select sample from whole data
eval_examples = eval_examples.select(range(args.max_eval_samples))
# Validation Feature Creation
eval_dataset = eval_examples.map(
prepare_validation_features,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on validation dataset",
)
with accelerator.main_process_first():
eval_dataset = eval_examples.map(
prepare_validation_features,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on validation dataset",
)

if args.max_eval_samples is not None:
# During Feature creation dataset samples might increase, we will select required samples again
Expand All @@ -556,17 +559,18 @@ def prepare_validation_features(examples):
# We will select sample from whole data
predict_examples = predict_examples.select(range(args.max_predict_samples))
# Predict Feature Creation
predict_dataset = predict_examples.map(
prepare_validation_features,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on prediction dataset",
)
if args.max_predict_samples is not None:
# During Feature creation dataset samples might increase, we will select required samples again
predict_dataset = predict_dataset.select(range(args.max_predict_samples))
with accelerator.main_process_first():
predict_dataset = predict_examples.map(
prepare_validation_features,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on prediction dataset",
)
if args.max_predict_samples is not None:
# During Feature creation dataset samples might increase, we will select required samples again
predict_dataset = predict_dataset.select(range(args.max_predict_samples))

# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
Expand Down
15 changes: 8 additions & 7 deletions examples/pytorch/summarization/run_summarization_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,13 +439,14 @@ def preprocess_function(examples):
model_inputs["labels"] = labels["input_ids"]
return model_inputs

processed_datasets = raw_datasets.map(
preprocess_function,
batched=True,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on dataset",
)
with accelerator.main_process_first():
processed_datasets = raw_datasets.map(
preprocess_function,
batched=True,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on dataset",
)

train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation"]
Expand Down
13 changes: 7 additions & 6 deletions examples/pytorch/text-classification/run_glue_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,12 +330,13 @@ def preprocess_function(examples):
result["labels"] = examples["label"]
return result

processed_datasets = raw_datasets.map(
preprocess_function,
batched=True,
remove_columns=raw_datasets["train"].column_names,
desc="Running tokenizer on dataset",
)
with accelerator.main_process_first():
processed_datasets = raw_datasets.map(
preprocess_function,
batched=True,
remove_columns=raw_datasets["train"].column_names,
desc="Running tokenizer on dataset",
)

train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation_matched" if args.task_name == "mnli" else "validation"]
Expand Down
13 changes: 7 additions & 6 deletions examples/pytorch/token-classification/run_ner_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,12 +403,13 @@ def tokenize_and_align_labels(examples):
tokenized_inputs["labels"] = labels
return tokenized_inputs

processed_raw_datasets = raw_datasets.map(
tokenize_and_align_labels,
batched=True,
remove_columns=raw_datasets["train"].column_names,
desc="Running tokenizer on dataset",
)
with accelerator.main_process_first():
processed_raw_datasets = raw_datasets.map(
tokenize_and_align_labels,
batched=True,
remove_columns=raw_datasets["train"].column_names,
desc="Running tokenizer on dataset",
)

train_dataset = processed_raw_datasets["train"]
eval_dataset = processed_raw_datasets["validation"]
Expand Down
17 changes: 9 additions & 8 deletions examples/pytorch/translation/run_translation_no_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,14 +418,15 @@ def preprocess_function(examples):
model_inputs["labels"] = labels["input_ids"]
return model_inputs

processed_datasets = raw_datasets.map(
preprocess_function,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on dataset",
)
with accelerator.main_process_first():
processed_datasets = raw_datasets.map(
preprocess_function,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on dataset",
)

train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation"]
Expand Down

0 comments on commit 44eb8bd

Please sign in to comment.