From 6d14bd240cb41b6d11b2234fbf25d88ca2bf9094 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Wed, 30 Mar 2022 11:03:29 -0700
Subject: [PATCH 1/2] [examples] max samples can't be bigger than the len of
 dataset

---
 .../pytorch/contrastive-image-text/run_clip.py   |  9 ++++++---
 examples/pytorch/language-modeling/run_clm.py    |  6 ++++--
 examples/pytorch/language-modeling/run_mlm.py    |  6 ++++--
 examples/pytorch/language-modeling/run_plm.py    |  6 ++++--
 examples/pytorch/multiple-choice/run_swag.py     |  6 ++++--
 examples/pytorch/question-answering/run_qa.py    | 15 ++++++++++-----
 .../question-answering/run_qa_beam_search.py     | 15 ++++++++++-----
 .../pytorch/question-answering/run_seq2seq_qa.py | 15 ++++++++++-----
 .../pytorch/summarization/run_summarization.py   |  9 ++++++---
 examples/pytorch/text-classification/run_glue.py |  9 ++++++---
 examples/pytorch/text-classification/run_xnli.py |  9 ++++++---
 examples/pytorch/token-classification/run_ner.py |  9 ++++++---
 examples/pytorch/translation/run_translation.py  |  9 ++++++---
 13 files changed, 82 insertions(+), 41 deletions(-)

diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py
index 8a2ad8c55066..79fd123064a1 100644
--- a/examples/pytorch/contrastive-image-text/run_clip.py
+++ b/examples/pytorch/contrastive-image-text/run_clip.py
@@ -404,7 +404,8 @@ def filter_corrupt_images(examples):
             raise ValueError("--do_train requires a train dataset")
         train_dataset = dataset["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         train_dataset = train_dataset.filter(
             filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers
         )
@@ -426,7 +427,8 @@ def filter_corrupt_images(examples):
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = dataset["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
         eval_dataset = eval_dataset.filter(
             filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers
         )
@@ -448,7 +450,8 @@ def filter_corrupt_images(examples):
             raise ValueError("--do_predict requires a test dataset")
         test_dataset = dataset["test"]
         if data_args.max_eval_samples is not None:
-            test_dataset = test_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(test_dataset), data_args.max_eval_samples)
+            test_dataset = test_dataset.select(range(max_eval_samples))
         test_dataset = test_dataset.filter(
             filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers
         )
diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py
index 5534e6901fb6..a1cdcf9ee4a9 100755
--- a/examples/pytorch/language-modeling/run_clm.py
+++ b/examples/pytorch/language-modeling/run_clm.py
@@ -445,14 +445,16 @@ def group_texts(examples):
             raise ValueError("--do_train requires a train dataset")
         train_dataset = lm_datasets["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))

     if training_args.do_eval:
         if "validation" not in tokenized_datasets:
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = lm_datasets["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))

     def preprocess_logits_for_metrics(logits, labels):
         if isinstance(logits, tuple):
diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py
index 7ceae8b17a8c..6ea3c2c934d3 100755
--- a/examples/pytorch/language-modeling/run_mlm.py
+++ b/examples/pytorch/language-modeling/run_mlm.py
@@ -468,14 +468,16 @@ def group_texts(examples):
             raise ValueError("--do_train requires a train dataset")
         train_dataset = tokenized_datasets["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))

     if training_args.do_eval:
         if "validation" not in tokenized_datasets:
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = tokenized_datasets["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))

     def preprocess_logits_for_metrics(logits, labels):
         if isinstance(logits, tuple):
diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py
index 1d7c42ba9c06..d1c09896d8e7 100755
--- a/examples/pytorch/language-modeling/run_plm.py
+++ b/examples/pytorch/language-modeling/run_plm.py
@@ -438,14 +438,16 @@ def group_texts(examples):
             raise ValueError("--do_train requires a train dataset")
         train_dataset = tokenized_datasets["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))

     if training_args.do_eval:
         if "validation" not in tokenized_datasets:
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = tokenized_datasets["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))

     # Data collator
     data_collator = DataCollatorForPermutationLanguageModeling(
diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py
index eb9f52f4d54a..01c9e8bcf7d2 100755
--- a/examples/pytorch/multiple-choice/run_swag.py
+++ b/examples/pytorch/multiple-choice/run_swag.py
@@ -352,7 +352,8 @@ def preprocess_function(examples):
             raise ValueError("--do_train requires a train dataset")
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
                 preprocess_function,
@@ -366,7 +367,8 @@ def preprocess_function(examples):
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
         with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_dataset.map(
                 preprocess_function,
diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py
index 788cecae683f..67aaf1d84ff0 100755
--- a/examples/pytorch/question-answering/run_qa.py
+++ b/examples/pytorch/question-answering/run_qa.py
@@ -421,7 +421,8 @@ def prepare_train_features(examples):
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             # We will select sample from whole data if argument is specified
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         # Create train feature from dataset
         with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
@@ -434,7 +435,8 @@ def prepare_train_features(examples):
         )
         if data_args.max_train_samples is not None:
             # Number of samples might increase during Feature Creation, We select only specified max samples
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))

     # Validation preprocessing
     def prepare_validation_features(examples):
@@ -489,7 +491,8 @@ def prepare_validation_features(examples):
         eval_examples = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             # We will select sample from whole data
-            eval_examples = eval_examples.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_examples), data_args.max_eval_samples)
+            eval_examples = eval_examples.select(range(max_eval_samples))
         # Validation Feature Creation
         with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_examples.map(
@@ -502,7 +505,8 @@ def prepare_validation_features(examples):
         )
         if data_args.max_eval_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))

     if training_args.do_predict:
         if "test" not in raw_datasets:
@@ -523,7 +527,8 @@ def prepare_validation_features(examples):
         )
         if data_args.max_predict_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))

     # Data collator
     # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py
index 13582c86d1b2..4c79be08b91b 100755
--- a/examples/pytorch/question-answering/run_qa_beam_search.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search.py
@@ -432,7 +432,8 @@ def prepare_train_features(examples):
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             # Select samples from Dataset, This will help to decrease processing time
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         # Create Training Features
         with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
@@ -445,7 +446,8 @@ def prepare_train_features(examples):
         )
         if data_args.max_train_samples is not None:
             # Select samples from dataset again since Feature Creation might increase number of features
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))

     # Validation preprocessing
     def prepare_validation_features(examples):
@@ -519,7 +521,8 @@ def prepare_validation_features(examples):
         eval_examples = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             # Selecting Eval Samples from Dataset
-            eval_examples = eval_examples.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_examples), data_args.max_eval_samples)
+            eval_examples = eval_examples.select(range(max_eval_samples))
         # Create Features from Eval Dataset
         with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_examples.map(
@@ -532,7 +535,8 @@ def prepare_validation_features(examples):
         )
         if data_args.max_eval_samples is not None:
             # Selecting Samples from Dataset again since Feature Creation might increase samples size
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))

     if training_args.do_predict:
         if "test" not in raw_datasets:
@@ -553,7 +557,8 @@ def prepare_validation_features(examples):
         )
         if data_args.max_predict_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))

     # Data collator
     # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py
index 071902a2ab5c..b4434102599c 100644
--- a/examples/pytorch/question-answering/run_seq2seq_qa.py
+++ b/examples/pytorch/question-answering/run_seq2seq_qa.py
@@ -489,7 +489,8 @@ def preprocess_validation_function(examples):
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             # We will select sample from whole data if argument is specified
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         # Create train feature from dataset
         with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
@@ -502,7 +503,8 @@ def preprocess_validation_function(examples):
         )
         if data_args.max_train_samples is not None:
             # Number of samples might increase during Feature Creation, We select only specified max samples
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))

     if training_args.do_eval:
         if "validation" not in raw_datasets:
@@ -510,7 +512,8 @@ def preprocess_validation_function(examples):
         eval_examples = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             # We will select sample from whole data
-            eval_examples = eval_examples.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_examples), data_args.max_eval_samples)
+            eval_examples = eval_examples.select(range(max_eval_samples))
         # Validation Feature Creation
         with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_examples.map(
@@ -523,7 +526,8 @@ def preprocess_validation_function(examples):
         )
         if data_args.max_eval_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))

     if training_args.do_predict:
         if "test" not in raw_datasets:
@@ -544,7 +548,8 @@ def preprocess_validation_function(examples):
         )
         if data_args.max_predict_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))

     # Data collator
     label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py
index 8d8e355e4f45..66aeb981bdf4 100755
--- a/examples/pytorch/summarization/run_summarization.py
+++ b/examples/pytorch/summarization/run_summarization.py
@@ -504,7 +504,8 @@ def preprocess_function(examples):
             raise ValueError("--do_train requires a train dataset")
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
                 preprocess_function,
@@ -521,7 +522,8 @@ def preprocess_function(examples):
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
         with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_dataset.map(
                 preprocess_function,
@@ -538,7 +540,8 @@ def preprocess_function(examples):
             raise ValueError("--do_predict requires a test dataset")
         predict_dataset = raw_datasets["test"]
         if data_args.max_predict_samples is not None:
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))
         with training_args.main_process_first(desc="prediction dataset map pre-processing"):
             predict_dataset = predict_dataset.map(
                 preprocess_function,
diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py
index 5f7ba4ca48c3..88be878faea2 100755
--- a/examples/pytorch/text-classification/run_glue.py
+++ b/examples/pytorch/text-classification/run_glue.py
@@ -415,21 +415,24 @@ def preprocess_function(examples):
             raise ValueError("--do_train requires a train dataset")
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))

     if training_args.do_eval:
         if "validation" not in raw_datasets and "validation_matched" not in raw_datasets:
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = raw_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))

     if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None:
         if "test" not in raw_datasets and "test_matched" not in raw_datasets:
             raise ValueError("--do_predict requires a test dataset")
         predict_dataset = raw_datasets["test_matched" if data_args.task_name == "mnli" else "test"]
         if data_args.max_predict_samples is not None:
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))

     # Log a few random samples from the training set:
     if training_args.do_train:
diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py
index 866d69687175..f54b1ec2aa60 100755
--- a/examples/pytorch/text-classification/run_xnli.py
+++ b/examples/pytorch/text-classification/run_xnli.py
@@ -279,7 +279,8 @@ def preprocess_function(examples):

     if training_args.do_train:
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
                 preprocess_function,
@@ -293,7 +294,8 @@ def preprocess_function(examples):

     if training_args.do_eval:
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
         with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_dataset.map(
                 preprocess_function,
@@ -304,7 +306,8 @@ def preprocess_function(examples):

     if training_args.do_predict:
         if data_args.max_predict_samples is not None:
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))
         with training_args.main_process_first(desc="prediction dataset map pre-processing"):
             predict_dataset = predict_dataset.map(
                 preprocess_function,
diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py
index ca08efceb5c7..9ff64b37978c 100755
--- a/examples/pytorch/token-classification/run_ner.py
+++ b/examples/pytorch/token-classification/run_ner.py
@@ -431,7 +431,8 @@ def tokenize_and_align_labels(examples):
             raise ValueError("--do_train requires a train dataset")
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
                 tokenize_and_align_labels,
@@ -446,7 +447,8 @@ def tokenize_and_align_labels(examples):
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
         with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_dataset.map(
                 tokenize_and_align_labels,
@@ -461,7 +463,8 @@ def tokenize_and_align_labels(examples):
             raise ValueError("--do_predict requires a test dataset")
         predict_dataset = raw_datasets["test"]
         if data_args.max_predict_samples is not None:
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))
         with training_args.main_process_first(desc="prediction dataset map pre-processing"):
             predict_dataset = predict_dataset.map(
                 tokenize_and_align_labels,
diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py
index ef10ff1890b9..b458a3f0cd65 100755
--- a/examples/pytorch/translation/run_translation.py
+++ b/examples/pytorch/translation/run_translation.py
@@ -433,7 +433,8 @@ def preprocess_function(examples):
             raise ValueError("--do_train requires a train dataset")
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
                 preprocess_function,
@@ -450,7 +451,8 @@ def preprocess_function(examples):
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
         with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_dataset.map(
                 preprocess_function,
@@ -467,7 +469,8 @@ def preprocess_function(examples):
             raise ValueError("--do_predict requires a test dataset")
         predict_dataset = raw_datasets["test"]
         if data_args.max_predict_samples is not None:
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))
         with training_args.main_process_first(desc="prediction dataset map pre-processing"):
             predict_dataset = predict_dataset.map(
                 preprocess_function,

From 5f279b4f108c92aebfc4ece820ff6f9c93910e46 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Wed, 30 Mar 2022 12:02:19 -0700
Subject: [PATCH 2/2] do tf and flax

---
 .../image-captioning/run_image_captioning_flax.py |  9 ++++++---
 examples/flax/language-modeling/run_clm_flax.py   |  6 ++++--
 examples/flax/question-answering/run_qa.py        | 15 ++++++++++-----
 .../flax/summarization/run_summarization_flax.py  |  9 ++++++---
 .../jax-projects/model_parallel/run_clm_mp.py     |  6 ++++--
 .../quantization-qdqbert/run_quant_qa.py          | 15 ++++++++++-----
 .../wav2vec2/run_common_voice.py                  |  3 ++-
 examples/tensorflow/language-modeling/run_clm.py  |  6 ++++--
 examples/tensorflow/language-modeling/run_mlm.py  |  6 ++++--
 examples/tensorflow/multiple-choice/run_swag.py   |  6 ++++--
 examples/tensorflow/question-answering/run_qa.py  | 15 ++++++++++-----
 .../tensorflow/summarization/run_summarization.py |  6 ++++--
 .../tensorflow/translation/run_translation.py     |  6 ++++--
 13 files changed, 72 insertions(+), 36 deletions(-)

diff --git a/examples/flax/image-captioning/run_image_captioning_flax.py b/examples/flax/image-captioning/run_image_captioning_flax.py
index 4e31da6b9ea9..b4b9afe0d305 100644
--- a/examples/flax/image-captioning/run_image_captioning_flax.py
+++ b/examples/flax/image-captioning/run_image_captioning_flax.py
@@ -613,7 +613,8 @@ def preprocess_fn(examples, max_target_length, check_image=True):
             raise ValueError("--do_train requires a train dataset")
         train_dataset = dataset["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         # remove problematic examples
         # (if feature extraction is performed at the beginning, the filtering is done during preprocessing below
         # instead here.)
@@ -646,7 +647,8 @@ def preprocess_fn(examples, max_target_length, check_image=True):
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = dataset["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
         # remove problematic examples
         # (if feature extraction is performed at the beginning, the filtering is done during preprocessing below
         # instead here.)
@@ -675,7 +677,8 @@ def preprocess_fn(examples, max_target_length, check_image=True):
             raise ValueError("--do_predict requires a test dataset")
         predict_dataset = dataset["test"]
         if data_args.max_predict_samples is not None:
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))
         # remove problematic examples
         # (if feature extraction is performed at the beginning, the filtering is done during preprocessing below
         # instead here.)
diff --git a/examples/flax/language-modeling/run_clm_flax.py b/examples/flax/language-modeling/run_clm_flax.py
index bbcbc8268bd7..82a9757d5c26 100755
--- a/examples/flax/language-modeling/run_clm_flax.py
+++ b/examples/flax/language-modeling/run_clm_flax.py
@@ -527,14 +527,16 @@ def group_texts(examples):
             raise ValueError("--do_train requires a train dataset")
         train_dataset = lm_datasets["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))

     if training_args.do_eval:
         if "validation" not in tokenized_datasets:
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = lm_datasets["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))

     # Enable tensorboard only on the master node
     has_tensorboard = is_tensorboard_available()
diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py
index 5d21e882e598..a15cca6607cc 100644
--- a/examples/flax/question-answering/run_qa.py
+++ b/examples/flax/question-answering/run_qa.py
@@ -602,7 +602,8 @@ def prepare_train_features(examples):
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             # We will select sample from whole data if argument is specified
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         # Create train feature from dataset
         train_dataset = train_dataset.map(
             prepare_train_features,
@@ -613,7 +614,8 @@ def prepare_train_features(examples):
         )
         if data_args.max_train_samples is not None:
             # Number of samples might increase during Feature Creation, We select only specified max samples
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         processed_raw_datasets["train"] = train_dataset

     # Validation preprocessing
@@ -669,7 +671,8 @@ def prepare_validation_features(examples):
         eval_examples = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             # We will select sample from whole data
-            eval_examples = eval_examples.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_examples), data_args.max_eval_samples)
+            eval_examples = eval_examples.select(range(max_eval_samples))
         # Validation Feature Creation
         eval_dataset = eval_examples.map(
             prepare_validation_features,
@@ -680,7 +683,8 @@ def prepare_validation_features(examples):
         )
         if data_args.max_eval_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
         processed_raw_datasets["validation"] = eval_dataset

     if training_args.do_predict:
@@ -700,7 +704,8 @@ def prepare_validation_features(examples):
         )
         if data_args.max_predict_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))
         processed_raw_datasets["test"] = predict_dataset

     # endregion
diff --git a/examples/flax/summarization/run_summarization_flax.py b/examples/flax/summarization/run_summarization_flax.py
index cfcf2e63dfce..effe3b58839f 100644
--- a/examples/flax/summarization/run_summarization_flax.py
+++ b/examples/flax/summarization/run_summarization_flax.py
@@ -547,7 +547,8 @@ def preprocess_function(examples):
             raise ValueError("--do_train requires a train dataset")
         train_dataset = dataset["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         train_dataset = train_dataset.map(
             preprocess_function,
             batched=True,
@@ -563,7 +564,8 @@ def preprocess_function(examples):
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = dataset["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
         eval_dataset = eval_dataset.map(
             preprocess_function,
             batched=True,
@@ -579,7 +581,8 @@ def preprocess_function(examples):
             raise ValueError("--do_predict requires a test dataset")
         predict_dataset = dataset["test"]
         if data_args.max_predict_samples is not None:
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))
         predict_dataset = predict_dataset.map(
             preprocess_function,
             batched=True,
diff --git a/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py b/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py
index c56f10478f56..3371dc3bd4df 100644
--- a/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py
+++ b/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py
@@ -398,14 +398,16 @@ def group_texts(examples):
             raise ValueError("--do_train requires a train dataset")
         train_dataset = lm_datasets["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))

     if training_args.do_eval:
         if "validation" not in tokenized_datasets:
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = lm_datasets["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))

     # Enable tensorboard only on the master node
     has_tensorboard = is_tensorboard_available()
diff --git a/examples/research_projects/quantization-qdqbert/run_quant_qa.py b/examples/research_projects/quantization-qdqbert/run_quant_qa.py
index 01791681eff9..36bfb45c8ffc 100755
--- a/examples/research_projects/quantization-qdqbert/run_quant_qa.py
+++ b/examples/research_projects/quantization-qdqbert/run_quant_qa.py
@@ -434,7 +434,8 @@ def prepare_train_features(examples):
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
             # We will select sample from whole data if argument is specified
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         # Create train feature from dataset
         with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
@@ -447,7 +448,8 @@ def prepare_train_features(examples):
         )
         if data_args.max_train_samples is not None:
             # Number of samples might increase during Feature Creation, We select only specified max samples
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))

     # Validation preprocessing
     def prepare_validation_features(examples):
@@ -497,7 +499,8 @@ def prepare_validation_features(examples):
         eval_examples = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
             # We will select sample from whole data
-            eval_examples = eval_examples.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_examples), data_args.max_eval_samples)
+            eval_examples = eval_examples.select(range(max_eval_samples))
         # Validation Feature Creation
         with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_examples.map(
@@ -510,7 +513,8 @@ def prepare_validation_features(examples):
         )
         if data_args.max_eval_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))

     if training_args.do_predict:
         if "test" not in raw_datasets:
@@ -531,7 +535,8 @@ def prepare_validation_features(examples):
         )
         if data_args.max_predict_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))

     # Data collator
     # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
diff --git a/examples/research_projects/wav2vec2/run_common_voice.py b/examples/research_projects/wav2vec2/run_common_voice.py
index edae86641e0b..5825c1feb10b 100644
--- a/examples/research_projects/wav2vec2/run_common_voice.py
+++ b/examples/research_projects/wav2vec2/run_common_voice.py
@@ -375,7 +375,8 @@ def extract_all_chars(batch):
     )

     if data_args.max_train_samples is not None:
-        train_dataset = train_dataset.select(range(data_args.max_train_samples))
+        max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+        train_dataset = train_dataset.select(range(max_train_samples))

     if data_args.max_val_samples is not None:
         eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
diff --git a/examples/tensorflow/language-modeling/run_clm.py b/examples/tensorflow/language-modeling/run_clm.py
index 0aad9949542f..4cbc00b3cdc9 100755
--- a/examples/tensorflow/language-modeling/run_clm.py
+++ b/examples/tensorflow/language-modeling/run_clm.py
@@ -415,9 +415,11 @@ def group_texts(examples):
         train_dataset = train_dataset.select(train_indices)

     if data_args.max_train_samples is not None:
-        train_dataset = train_dataset.select(range(data_args.max_train_samples))
+        max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+        train_dataset = train_dataset.select(range(max_train_samples))
     if data_args.max_eval_samples is not None:
-        eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+        max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+        eval_dataset = eval_dataset.select(range(max_eval_samples))

     # Log a few random samples from the training set:
     for index in random.sample(range(len(train_dataset)), 3):
diff --git a/examples/tensorflow/language-modeling/run_mlm.py b/examples/tensorflow/language-modeling/run_mlm.py
index b2e6a487e93e..44c5d230318b 100755
--- a/examples/tensorflow/language-modeling/run_mlm.py
+++ b/examples/tensorflow/language-modeling/run_mlm.py
@@ -456,9 +456,11 @@ def group_texts(examples):
         train_dataset = train_dataset.select(train_indices)

     if data_args.max_train_samples is not None:
-        train_dataset = train_dataset.select(range(data_args.max_train_samples))
+        max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+        train_dataset = train_dataset.select(range(max_train_samples))
     if data_args.max_eval_samples is not None:
-        eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+        max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+        eval_dataset = eval_dataset.select(range(max_eval_samples))

     # Log a few random samples from the training set:
     for index in random.sample(range(len(train_dataset)), 3):
diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py
index 0c1c62de26fa..e14815cf81f3 100644
--- a/examples/tensorflow/multiple-choice/run_swag.py
+++ b/examples/tensorflow/multiple-choice/run_swag.py
@@ -369,7 +369,8 @@ def preprocess_function(examples):
         train_dataset = raw_datasets["train"]
         non_label_columns = [feature for feature in train_dataset.features if feature not in ("label", "labels")]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
                 preprocess_function,
@@ -385,7 +386,8 @@ def preprocess_function(examples):
         if not training_args.do_train:
             non_label_columns = [feature for feature in eval_dataset.features if feature not in ("label", "labels")]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
         with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_dataset.map(
                 preprocess_function,
diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py
index 39437da46788..50e8c7f50d96 100755
--- a/examples/tensorflow/question-answering/run_qa.py
+++ b/examples/tensorflow/question-answering/run_qa.py
@@ -438,7 +438,8 @@ def prepare_train_features(examples):
         train_dataset = datasets["train"]
         if data_args.max_train_samples is not None:
             # We will select sample from whole data if argument is specified
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         # Create train feature from dataset
         train_dataset = train_dataset.map(
             prepare_train_features,
@@ -449,7 +450,8 @@ def prepare_train_features(examples):
         )
         if data_args.max_train_samples is not None:
             # Number of samples might increase during Feature Creation, We select only specified max samples
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         processed_datasets["train"] = train_dataset

     # Validation preprocessing
@@ -505,7 +507,8 @@ def prepare_validation_features(examples):
         eval_examples = datasets["validation"]
         if data_args.max_eval_samples is not None:
             # We will select sample from whole data
-            eval_examples = eval_examples.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_examples), data_args.max_eval_samples)
+            eval_examples = eval_examples.select(range(max_eval_samples))
         # Validation Feature Creation
         eval_dataset = eval_examples.map(
             prepare_validation_features,
@@ -516,7 +519,8 @@ def prepare_validation_features(examples):
         )
         if data_args.max_eval_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
         processed_datasets["validation"] = eval_dataset

     if training_args.do_predict:
@@ -536,7 +540,8 @@ def prepare_validation_features(examples):
         )
         if data_args.max_predict_samples is not None:
             # During Feature creation dataset samples might increase, we will select required samples again
-            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))
         processed_datasets["test"] = predict_dataset

     # endregion
diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py
index 004f25157f06..e40c763530c0 100644
--- a/examples/tensorflow/summarization/run_summarization.py
+++ b/examples/tensorflow/summarization/run_summarization.py
@@ -490,7 +490,8 @@ def preprocess_function(examples):
             raise ValueError("--do_train requires a train dataset")
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
                 preprocess_function,
@@ -509,7 +510,8 @@ def preprocess_function(examples):
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
         with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_dataset.map(
                 preprocess_function,
diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py
index ef997a6ed25f..fce150b712ad 100644
--- a/examples/tensorflow/translation/run_translation.py
+++ b/examples/tensorflow/translation/run_translation.py
@@ -445,7 +445,8 @@ def preprocess_function(examples):
             raise ValueError("--do_train requires a train dataset")
         train_dataset = raw_datasets["train"]
         if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
         with training_args.main_process_first(desc="train dataset map pre-processing"):
             train_dataset = train_dataset.map(
                 preprocess_function,
@@ -464,7 +465,8 @@ def preprocess_function(examples):
             raise ValueError("--do_eval requires a validation dataset")
         eval_dataset = raw_datasets["validation"]
         if data_args.max_eval_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            eval_dataset = eval_dataset.select(range(max_eval_samples))
         with training_args.main_process_first(desc="validation dataset map pre-processing"):
             eval_dataset = eval_dataset.map(
                 preprocess_function,
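
Both patches apply the same one-line guard everywhere: instead of passing the raw `--max_*_samples` value straight into `Dataset.select(range(...))`, the scripts first cap it at the dataset length with `min(len(dataset), max_samples)`, since selecting indices past the end of a Hugging Face `datasets.Dataset` fails with an index-out-of-range error rather than silently falling back to the whole split. A minimal sketch of the behaviour, assuming the `datasets` library is installed; the helper name `cap_and_select` and the toy data are illustrative, not part of the patch:

    from datasets import Dataset

    def cap_and_select(dataset: Dataset, max_samples: int) -> Dataset:
        # Cap the requested sample count at the dataset size, then keep
        # the first max_samples rows, mirroring the patched scripts.
        max_samples = min(len(dataset), max_samples)
        return dataset.select(range(max_samples))

    ds = Dataset.from_dict({"text": ["a", "b", "c"]})  # 3 rows
    assert len(cap_and_select(ds, 2)) == 2    # normal truncation
    assert len(cap_and_select(ds, 10)) == 3   # oversized request is capped
    # Without the cap, ds.select(range(10)) would fail with an
    # index-out-of-range error instead of returning the full split.

Note that the QA scripts apply the cap twice on purpose, as their comments explain: once before feature creation to cut preprocessing time, and once after, because a single example can expand into several features when long contexts are split into overlapping spans.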