Fix block_size picking in megatron_lm_gpt_pretraining.py (#2342)
Only cap `block_size` to 1024 if `tokenizer.model_max_length` is actually greater than 1024.
nilq committed Jan 18, 2024
1 parent c7d11d7 commit 14d7c3f
Showing 1 changed file with 1 addition and 1 deletion.
examples/by_feature/megatron_lm_gpt_pretraining.py (2 changes: 1 addition, 1 deletion)
@@ -405,7 +405,7 @@ def tokenize_function(examples):
                 f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                 "Picking 1024 instead. You can change that default value by passing --block_size xxx."
             )
-        block_size = 1024
+            block_size = 1024
     else:
         if args.block_size > tokenizer.model_max_length:
             logger.warning(
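
For context, here is a minimal sketch of the block-size selection after this fix. It assumes `args`, `tokenizer`, and `logger` are already set up as in the example script, and the `else` branch is abridged since the diff does not show it in full; the commit only changes the indentation of the final assignment in the `if` branch.

```python
# Minimal sketch of the corrected block_size selection (assumes `args`,
# `tokenizer`, and `logger` exist as in megatron_lm_gpt_pretraining.py).
if args.block_size is None:
    block_size = tokenizer.model_max_length
    if block_size > 1024:
        logger.warning(
            f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
            "Picking 1024 instead. You can change that default value by passing --block_size xxx."
        )
        # Before this commit, the next line sat one indentation level out, so
        # block_size was unconditionally reset to 1024 even when
        # model_max_length was 1024 or smaller.
        block_size = 1024
else:
    # Abridged (assumption): the script clamps a user-provided block size to
    # the tokenizer's limit; the full else branch is not shown in this diff.
    block_size = min(args.block_size, tokenizer.model_max_length)
```

The change matters when `tokenizer.model_max_length` is smaller than 1024: previously `block_size` was still forced up to 1024, exceeding the model's limit, whereas now the 1024 cap is applied only when the tokenizer's limit actually exceeds it.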
