
Commit

Fix double counting of special tokens in language_modeling.py (#11275)

* Update language_modeling.py

In "class TextDatasetForNextSentencePrediction(Dataset)", "self.tokenizer.num_special_tokens_to_add(pair=True)" was subtracted twice: once when computing "self.block_size" in "__init__" and again inside "create_examples_from_document".

So, remove "self.block_size" and instead pass "block_size" as a parameter to "def create_examples_from_document", as "class LineByLineWithSOPTextDataset" already does.

* Update language_modeling.py
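For illustration, a minimal sketch of the arithmetic behind the bug (not part of the commit; the numbers assume a BERT-style tokenizer, where num_special_tokens_to_add(pair=True) returns 3 for [CLS], [SEP], [SEP]):

block_size = 512
num_special = 3  # tokenizer.num_special_tokens_to_add(pair=True) for a BERT-style tokenizer

# Before the fix: __init__ subtracted the special tokens once ...
old_self_block_size = block_size - num_special           # 509
# ... and create_examples_from_document subtracted them again.
old_max_num_tokens = old_self_block_size - num_special   # 506 (special tokens counted twice)

# After the fix: block_size is passed through unchanged, so the
# subtraction happens exactly once, in create_examples_from_document.
new_max_num_tokens = block_size - num_special             # 509

print(old_max_num_tokens, new_max_num_tokens)  # 506 509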
taepd committed Apr 19, 2021
1 parent 5a34d8d commit 3981ce3
Showing 1 changed file with 3 additions and 4 deletions.
7 changes: 3 additions & 4 deletions src/transformers/data/datasets/language_modeling.py
@@ -354,7 +354,6 @@ def __init__(
         )
         assert os.path.isfile(file_path), f"Input file path {file_path} not found"

-        self.block_size = block_size - tokenizer.num_special_tokens_to_add(pair=True)
         self.short_seq_probability = short_seq_probability
         self.nsp_probability = nsp_probability

@@ -413,7 +412,7 @@ def __init__(
                 logger.info(f"Creating examples from {len(self.documents)} documents.")
                 self.examples = []
                 for doc_index, document in enumerate(self.documents):
-                    self.create_examples_from_document(document, doc_index)
+                    self.create_examples_from_document(document, doc_index, block_size)

                 start = time.time()
                 with open(cached_features_file, "wb") as handle:
@@ -422,10 +421,10 @@ def __init__(
                         f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]"
                     )

-    def create_examples_from_document(self, document: List[List[int]], doc_index: int):
+    def create_examples_from_document(self, document: List[List[int]], doc_index: int, block_size: int):
         """Creates examples for a single document."""

-        max_num_tokens = self.block_size - self.tokenizer.num_special_tokens_to_add(pair=True)
+        max_num_tokens = block_size - self.tokenizer.num_special_tokens_to_add(pair=True)

         # We *usually* want to fill up the entire sequence since we are padding
         # to `block_size` anyways, so short sequences are generally wasted
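For context, a hedged usage sketch of the dataset after this change (the corpus path and checkpoint name are placeholders, not from the commit). With block_size=512 and a BERT tokenizer, each example is now built against 509 usable tokens rather than 506:

from transformers import BertTokenizer, TextDatasetForNextSentencePrediction

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Placeholder corpus file: one sentence per line, blank lines separating documents.
dataset = TextDatasetForNextSentencePrediction(
    tokenizer=tokenizer,
    file_path="corpus.txt",
    block_size=512,
    short_seq_probability=0.1,
    nsp_probability=0.5,
)

print(len(dataset))  # number of sentence-pair examples created from the corpus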
