diff --git a/src/transformers/data/datasets/language_modeling.py b/src/transformers/data/datasets/language_modeling.py
index 10afcaf6e72a0..15d792ff3c9c1 100644
--- a/src/transformers/data/datasets/language_modeling.py
+++ b/src/transformers/data/datasets/language_modeling.py
@@ -354,7 +354,6 @@ def __init__(
         )
 
         assert os.path.isfile(file_path), f"Input file path {file_path} not found"
 
-        self.block_size = block_size - tokenizer.num_special_tokens_to_add(pair=True)
         self.short_seq_probability = short_seq_probability
         self.nsp_probability = nsp_probability
@@ -413,7 +412,7 @@ def __init__(
                 logger.info(f"Creating examples from {len(self.documents)} documents.")
                 self.examples = []
                 for doc_index, document in enumerate(self.documents):
-                    self.create_examples_from_document(document, doc_index)
+                    self.create_examples_from_document(document, doc_index, block_size)
 
                 start = time.time()
                 with open(cached_features_file, "wb") as handle:
@@ -422,10 +421,10 @@ def __init__(
                     f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]"
                 )
 
-    def create_examples_from_document(self, document: List[List[int]], doc_index: int):
+    def create_examples_from_document(self, document: List[List[int]], doc_index: int, block_size: int):
         """Creates examples for a single document."""
 
-        max_num_tokens = self.block_size - self.tokenizer.num_special_tokens_to_add(pair=True)
+        max_num_tokens = block_size - self.tokenizer.num_special_tokens_to_add(pair=True)
 
         # We *usually* want to fill up the entire sequence since we are padding
         # to `block_size` anyways, so short sequences are generally wasted
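
For context, here is a minimal sketch (not part of the patch, assuming a BERT tokenizer where `num_special_tokens_to_add(pair=True)` is 3) of the arithmetic this change corrects: before the fix, `__init__` pre-subtracted the special-token budget from `block_size`, and `create_examples_from_document` subtracted it again, so `max_num_tokens` ended up smaller than intended.

```python
from transformers import BertTokenizer

# Hypothetical walkthrough of the double-subtraction bug fixed by this patch.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
block_size = 128
special = tokenizer.num_special_tokens_to_add(pair=True)  # 3 for BERT: [CLS], [SEP], [SEP]

# Before: __init__ stored self.block_size = block_size - special,
# and create_examples_from_document subtracted special again.
old_max_num_tokens = (block_size - special) - special  # 122 -- too small

# After: the raw block_size is passed to create_examples_from_document,
# so the special tokens are subtracted only once.
new_max_num_tokens = block_size - special  # 125

print(old_max_num_tokens, new_max_num_tokens)
```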