From 7ae10f3d9889767a1adafddb148a509bdd299353 Mon Sep 17 00:00:00 2001
From: Niels Horn
Date: Tue, 16 Jan 2024 11:53:55 +0100
Subject: [PATCH] Fix `block_size` picking in megatron_lm_gpt_pretraining.py

Only cap `block_size` to 1024 if `tokenizer.model_max_length` is actually
greater than 1024.
---
 examples/by_feature/megatron_lm_gpt_pretraining.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/by_feature/megatron_lm_gpt_pretraining.py b/examples/by_feature/megatron_lm_gpt_pretraining.py
index b0e1b33700f..eb7efb3a79e 100644
--- a/examples/by_feature/megatron_lm_gpt_pretraining.py
+++ b/examples/by_feature/megatron_lm_gpt_pretraining.py
@@ -405,7 +405,7 @@ def tokenize_function(examples):
                 f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                 "Picking 1024 instead. You can change that default value by passing --block_size xxx."
             )
-        block_size = 1024
+            block_size = 1024
     else:
         if args.block_size > tokenizer.model_max_length:
             logger.warning(
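
For readers comparing the before/after behavior, below is a minimal standalone sketch of the
selection logic this patch produces. The helper name `pick_block_size` and its parameters are
invented for illustration and are not part of the script; in the real example the equivalent
logic runs inline on `args.block_size` and `tokenizer.model_max_length`.

def pick_block_size(requested_block_size, model_max_length, cap=1024):
    """Return the block size the patched script would end up using."""
    if requested_block_size is None:
        block_size = model_max_length
        if block_size > cap:
            # With this patch, the fallback to 1024 only happens inside the
            # `> 1024` branch; a smaller model_max_length is kept as-is.
            block_size = cap
        return block_size
    # Unchanged branch: a user-supplied --block_size is still clamped to the
    # tokenizer's model_max_length.
    return min(requested_block_size, model_max_length)

# Before the patch, the first case forced 1024 even when model_max_length was
# smaller (e.g. 512); after the patch, the smaller value is kept.
assert pick_block_size(None, 512) == 512
assert pick_block_size(None, 100000) == 1024
assert pick_block_size(2048, 100000) == 2048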