modify double considering special tokens in language_modeling.py #11275

Merged 3 commits on Apr 19, 2021
7 changes: 3 additions & 4 deletions in src/transformers/data/datasets/language_modeling.py
@@ -354,7 +354,6 @@ def __init__(
         )
         assert os.path.isfile(file_path), f"Input file path {file_path} not found"

-        self.block_size = block_size - tokenizer.num_special_tokens_to_add(pair=True)
         self.short_seq_probability = short_seq_probability
         self.nsp_probability = nsp_probability

@@ -413,7 +412,7 @@ def __init__(
                 logger.info(f"Creating examples from {len(self.documents)} documents.")
                 self.examples = []
                 for doc_index, document in enumerate(self.documents):
-                    self.create_examples_from_document(document, doc_index)
+                    self.create_examples_from_document(document, doc_index, block_size)

                 start = time.time()
                 with open(cached_features_file, "wb") as handle:
@@ -422,10 +421,10 @@ def __init__(
                     f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]"
                 )

-    def create_examples_from_document(self, document: List[List[int]], doc_index: int):
+    def create_examples_from_document(self, document: List[List[int]], doc_index: int, block_size: int):
         """Creates examples for a single document."""

-        max_num_tokens = self.block_size - self.tokenizer.num_special_tokens_to_add(pair=True)
+        max_num_tokens = block_size - self.tokenizer.num_special_tokens_to_add(pair=True)

         # We *usually* want to fill up the entire sequence since we are padding
         # to `block_size` anyways, so short sequences are generally wasted
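
The bug this patch fixes: block_size was reduced by the special-token count twice, once in __init__ (the deleted self.block_size line) and again in create_examples_from_document when computing max_num_tokens, so next-sentence-prediction examples came out shorter than requested. The patch forwards the raw block_size and subtracts the special tokens exactly once. A minimal sketch of the arithmetic, with illustrative numbers only (assuming a BERT-style sentence-pair input that adds 3 special tokens):

    block_size = 512  # sequence length requested by the user
    num_special = 3   # e.g. [CLS] A [SEP] B [SEP] for a BERT-style sentence pair

    # Before the patch: subtracted once in __init__ ...
    stored_block_size = block_size - num_special           # 509
    # ... and subtracted again inside create_examples_from_document:
    old_max_num_tokens = stored_block_size - num_special   # 506, special tokens counted twice

    # After the patch: the raw block_size is passed through and reduced exactly once:
    new_max_num_tokens = block_size - num_special          # 509

    print(old_max_num_tokens, new_max_num_tokens)  # 506 509

The value 3 is only an assumption for illustration; in the real code the count comes from tokenizer.num_special_tokens_to_add(pair=True) for whatever tokenizer is in use.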