From 66640c1bfad4d17e2398ecfcdcd9d7b6c7b2f254 Mon Sep 17 00:00:00 2001 From: hlz Date: Fri, 21 Aug 2020 22:43:06 +0800 Subject: [PATCH 1/6] add datacollator and dataset for next sentence prediction task --- src/transformers/__init__.py | 2 + src/transformers/data/data_collator.py | 174 ++++++++++++++++++ src/transformers/data/datasets/__init__.py | 2 +- .../data/datasets/language_modeling.py | 80 ++++++++ 4 files changed, 257 insertions(+), 1 deletion(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index f2f4d7c2d11212..261350705245c2 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -445,11 +445,13 @@ DataCollatorForLanguageModeling, DataCollatorForPermutationLanguageModeling, DataCollatorWithPadding, + DataCollatorForNextSentencePrediction, ) from .data.datasets import ( GlueDataset, TextDataset, LineByLineTextDataset, + TextDatasetForNextSentencePrediction, GlueDataTrainingArguments, SquadDataset, SquadDataTrainingArguments, diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index 11b8535096af35..27bd279321be3f 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -1,3 +1,4 @@ +import random from dataclasses import dataclass from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union @@ -313,3 +314,176 @@ def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, ) & masked_indices[i] return inputs, perm_mask, target_mapping, labels + + +@dataclass +class DataCollatorForNextSentencePrediction: + """ + Data collator used for language modeling. + - collates batches of tensors, honoring their tokenizer's pad_token + - preprocesses batches for masked language modeling + """ + + tokenizer: PreTrainedTokenizer + mlm: bool = True + block_size: int = 128 + short_seq_probability: float = 0.1 + nsp_probability: float = 0.5 + mlm_probability: float = 0.15 + + def __call__(self, examples: List[List[List[int]]]) -> Dict[str, torch.Tensor]: + input_ids = [] + segment_ids = [] + nsp_labels = [] + + for i, doc in enumerate(examples): + input_id, segment_id, label = self.create_examples_from_document(doc, i, examples) + input_ids.extend(input_id) + segment_ids.extend(segment_id) + nsp_labels.extend(label) + if self.mlm: + input_ids, mlm_labels = self.mask_tokens(self._tensorize_batch(input_ids)) + else: + input_ids = self._tensorize_batch(input_ids) + + return { + "input_ids": input_ids, + "token_type_ids": self._tensorize_batch(segment_ids), + "masked_lm_labels": mlm_labels if self.mlm else None, + "next_sentence_label": torch.tensor(nsp_labels), + } + + def _tensorize_batch(self, examples: List[torch.Tensor]) -> torch.Tensor: + length_of_first = examples[0].size(0) + are_tensors_same_length = all(x.size(0) == length_of_first for x in examples) + if are_tensors_same_length: + return torch.stack(examples, dim=0) + else: + if self.tokenizer._pad_token is None: + raise ValueError( + "You are attempting to pad samples but the tokenizer you are using" + f" ({self.tokenizer.__class__.__name__}) does not have one." 
+ ) + return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id) + + def create_examples_from_document( + self, document: List[List[int]], doc_index: int, examples: List[List[List[int]]] + ): + """Creates examples for a single document.""" + + max_num_tokens = self.block_size - self.tokenizer.num_special_tokens_to_add(pair=False) + + # We *usually* want to fill up the entire sequence since we are padding + # to `block_size` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `block_size` is a hard limit. + target_seq_length = max_num_tokens + if random.random() < self.short_seq_probability: + target_seq_length = random.randint(2, max_num_tokens) + + current_chunk = [] # a buffer stored current working segments + current_length = 0 + i = 0 + input_ids = [] + segment_ids = [] + labels = [] + while i < len(document): + segment = document[i] + current_chunk.append(segment) + current_length += len(segment) + if i == len(document) - 1 or current_length >= target_seq_length: + if current_chunk: + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. + a_end = 1 + if len(current_chunk) >= 2: + a_end = random.randint(1, len(current_chunk) - 1) + + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) + + tokens_b = [] + + if len(current_chunk) == 1 or random.random() < self.nsp_probability: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + + # This should rarely go for more than one iteration for large + # corpora. However, just to be careful, we try to make sure that + # the random document is not the same as the document + # we're processing. + for _ in range(10): + random_document_index = random.randint(0, len(examples) - 1) + if random_document_index != doc_index: + break + + random_document = examples[random_document_index] + random_start = random.randint(0, len(random_document) - 1) + for j in range(random_start, len(random_document)): + tokens_b.extend(random_document[j]) + if len(tokens_b) >= target_b_length: + break + # We didn't actually use these segments so we "put them back" so + # they don't go to waste. + num_unused_segments = len(current_chunk) - a_end + i -= num_unused_segments + # Actual next + else: + is_random_next = False + for j in range(a_end, len(current_chunk)): + tokens_b.extend(current_chunk[j]) + + assert len(tokens_a) >= 1 + assert len(tokens_b) >= 1 + + input_ids.append(torch.tensor(self.tokenizer.build_inputs_with_special_tokens(tokens_a, tokens_b))) + segment_ids.append( + torch.tensor(self.tokenizer.create_token_type_ids_from_sequences(tokens_a, tokens_b)) + ) + labels.append(torch.tensor(1 if is_random_next else 0)) + + current_chunk = [] + current_length = 0 + + i += 1 + + return input_ids, segment_ids, labels + + def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. + """ + + if self.tokenizer.mask_token is None: + raise ValueError( + "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." 
+ ) + + labels = inputs.clone() + # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) + probability_matrix = torch.full(labels.shape, self.mlm_probability) + special_tokens_mask = [ + self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() + ] + probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) + if self.tokenizer._pad_token is not None: + padding_mask = labels.eq(self.tokenizer.pad_token_id) + probability_matrix.masked_fill_(padding_mask, value=0.0) + masked_indices = torch.bernoulli(probability_matrix).bool() + labels[~masked_indices] = -100 # We only compute loss on masked tokens + + # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) + indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices + inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) + + # 10% of the time, we replace masked input tokens with random word + indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced + random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long) + inputs[indices_random] = random_words[indices_random] + + # The rest of the time (10% of the time) we keep the masked input tokens unchanged + return inputs, labels diff --git a/src/transformers/data/datasets/__init__.py b/src/transformers/data/datasets/__init__.py index ca2ab15e43fbeb..f4e2aac5e968c1 100644 --- a/src/transformers/data/datasets/__init__.py +++ b/src/transformers/data/datasets/__init__.py @@ -3,5 +3,5 @@ # module, but to preserve other warnings. So, don't check this module at all. from .glue import GlueDataset, GlueDataTrainingArguments -from .language_modeling import LineByLineTextDataset, TextDataset +from .language_modeling import LineByLineTextDataset, TextDataset, TextDatasetForNextSentencePrediction from .squad import SquadDataset, SquadDataTrainingArguments diff --git a/src/transformers/data/datasets/language_modeling.py b/src/transformers/data/datasets/language_modeling.py index 5a9aeb2225b56f..7daa430a88dbe4 100644 --- a/src/transformers/data/datasets/language_modeling.py +++ b/src/transformers/data/datasets/language_modeling.py @@ -99,3 +99,83 @@ def __len__(self): def __getitem__(self, i) -> torch.Tensor: return torch.tensor(self.examples[i], dtype=torch.long) + + +class TextDatasetForNextSentencePrediction(Dataset): + """ + This will be superseded by a framework-agnostic approach + soon. + """ + + def __init__( + self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, overwrite_cache=False, + ): + assert os.path.isfile(file_path), f"Input file path {file_path} not found" + + block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False) + + directory, filename = os.path.split(file_path) + cached_features_file = os.path.join( + directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,), + ) + + self.block_size = block_size + self.tokenizer = tokenizer + self.examples = [] + + # Make sure only the first process in distributed training processes the dataset, + # and the others will use the cache. + lock_path = cached_features_file + ".lock" + + # Input file format: + # (1) One sentence per line. These should ideally be actual sentences, not + # entire paragraphs or arbitrary spans of text. 
(Because we use the + # sentence boundaries for the "next sentence prediction" task). + # (2) Blank lines between documents. Document boundaries are needed so + # that the "next sentence prediction" task doesn't span between documents. + # + # Example: + # I am very happy. + # Here is the second sentence. + # + # A new document. + + with FileLock(lock_path): + if os.path.exists(cached_features_file) and not overwrite_cache: + start = time.time() + with open(cached_features_file, "rb") as handle: + self.examples = pickle.load(handle) + logger.info( + f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start + ) + else: + logger.info(f"Creating features from dataset file at {directory}") + + self.examples = [[]] + with open(file_path, encoding="utf-8") as f: + while True: + line = f.readline() + if not line: + break + line = line.strip() + + # Empty lines are used as document delimiters + if not line: + self.examples.append([]) + tokens = tokenizer.tokenize(line) + tokens = tokenizer.convert_tokens_to_ids(tokens) + if tokens: + self.examples[-1].append(tokens) + + start = time.time() + with open(cached_features_file, "wb") as handle: + pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL) + logger.info( + "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start + ) + + def __len__(self): + return len(self.examples) + + def __getitem__(self, i): + return self.examples[i] From 9ab360fc73e4d2132f211d4749e399f16b1dac78 Mon Sep 17 00:00:00 2001 From: hlz Date: Mon, 24 Aug 2020 21:48:21 +0800 Subject: [PATCH 2/6] bug fix (numbers of special tokens & truncate sequences) --- src/transformers/data/data_collator.py | 9 ++++++++- src/transformers/data/datasets/language_modeling.py | 5 ++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index 27bd279321be3f..d5b7b6522c4411 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -371,7 +371,7 @@ def create_examples_from_document( ): """Creates examples for a single document.""" - max_num_tokens = self.block_size - self.tokenizer.num_special_tokens_to_add(pair=False) + max_num_tokens = self.block_size - self.tokenizer.num_special_tokens_to_add(pair=True) # We *usually* want to fill up the entire sequence since we are padding # to `block_size` anyways, so short sequences are generally wasted @@ -440,6 +440,13 @@ def create_examples_from_document( assert len(tokens_a) >= 1 assert len(tokens_b) >= 1 + tokens_a, tokens_b, _ = self.tokenizer.truncate_sequences( + tokens_a, + tokens_b, + num_tokens_to_remove=len(tokens_a) + len(tokens_b) - max_num_tokens, + truncation_strategy="longest_first", + ) + input_ids.append(torch.tensor(self.tokenizer.build_inputs_with_special_tokens(tokens_a, tokens_b))) segment_ids.append( torch.tensor(self.tokenizer.create_token_type_ids_from_sequences(tokens_a, tokens_b)) diff --git a/src/transformers/data/datasets/language_modeling.py b/src/transformers/data/datasets/language_modeling.py index 7daa430a88dbe4..c609a1a766cdd8 100644 --- a/src/transformers/data/datasets/language_modeling.py +++ b/src/transformers/data/datasets/language_modeling.py @@ -112,14 +112,13 @@ def __init__( ): assert os.path.isfile(file_path), f"Input file path {file_path} not found" - block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False) + block_size = block_size - tokenizer.num_special_tokens_to_add(pair=True) 
directory, filename = os.path.split(file_path) cached_features_file = os.path.join( directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,), ) - self.block_size = block_size self.tokenizer = tokenizer self.examples = [] @@ -160,7 +159,7 @@ def __init__( line = line.strip() # Empty lines are used as document delimiters - if not line: + if not line and len(self.examples[-1]) != 0: self.examples.append([]) tokens = tokenizer.tokenize(line) tokens = tokenizer.convert_tokens_to_ids(tokens) From df92ad885ecd54570bf3a5493f0c06e72d5a1b6c Mon Sep 17 00:00:00 2001 From: hlz Date: Mon, 24 Aug 2020 22:22:11 +0800 Subject: [PATCH 3/6] bug fix (+ dict inputs support for data collator) --- src/transformers/data/data_collator.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index d5b7b6522c4411..ab001b331a0a93 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -331,7 +331,10 @@ class DataCollatorForNextSentencePrediction: nsp_probability: float = 0.5 mlm_probability: float = 0.15 - def __call__(self, examples: List[List[List[int]]]) -> Dict[str, torch.Tensor]: + def __call__(self, examples: List[Union[List[List[int]], Dict[str, torch.Tensor]]]) -> Dict[str, torch.Tensor]: + if isinstance(examples[0], (dict, BatchEncoding)): + examples = [e["input_ids"] for e in examples] + input_ids = [] segment_ids = [] nsp_labels = [] From c4e242df4c28ffb0cf56938c553066b4af6b9cd1 Mon Sep 17 00:00:00 2001 From: hlz Date: Fri, 28 Aug 2020 11:48:25 +0800 Subject: [PATCH 4/6] add padding for nsp data collator; renamed cached files to avoid conflict. --- src/transformers/data/data_collator.py | 28 ++++++++++++++----- .../data/datasets/language_modeling.py | 2 +- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index d1ed22ed1520cf..ceb36ed74f6e10 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -340,7 +340,7 @@ class DataCollatorForNextSentencePrediction: tokenizer: PreTrainedTokenizer mlm: bool = True - block_size: int = 128 + block_size: int = 512 short_seq_probability: float = 0.1 nsp_probability: float = 0.5 mlm_probability: float = 0.15 @@ -351,12 +351,14 @@ def __call__(self, examples: List[Union[List[List[int]], Dict[str, torch.Tensor] input_ids = [] segment_ids = [] + attention_masks = [] nsp_labels = [] for i, doc in enumerate(examples): - input_id, segment_id, label = self.create_examples_from_document(doc, i, examples) + input_id, segment_id, attention_mask, label = self.create_examples_from_document(doc, i, examples) input_ids.extend(input_id) segment_ids.extend(segment_id) + attention_masks.extend(attention_mask) nsp_labels.extend(label) if self.mlm: input_ids, mlm_labels = self.mask_tokens(self._tensorize_batch(input_ids)) @@ -365,6 +367,7 @@ def __call__(self, examples: List[Union[List[List[int]], Dict[str, torch.Tensor] return { "input_ids": input_ids, + "attention_mask": self._tensorize_batch(attention_masks), "token_type_ids": self._tensorize_batch(segment_ids), "masked_lm_labels": mlm_labels if self.mlm else None, "next_sentence_label": torch.tensor(nsp_labels), @@ -406,6 +409,7 @@ def create_examples_from_document( i = 0 input_ids = [] segment_ids = [] + attention_masks = [] labels = [] while i < len(document): segment = document[i] @@ -464,10 +468,20 @@ def 
create_examples_from_document( truncation_strategy="longest_first", ) - input_ids.append(torch.tensor(self.tokenizer.build_inputs_with_special_tokens(tokens_a, tokens_b))) - segment_ids.append( - torch.tensor(self.tokenizer.create_token_type_ids_from_sequences(tokens_a, tokens_b)) - ) + input_id = self.tokenizer.build_inputs_with_special_tokens(tokens_a, tokens_b) + attention_mask = [1] * len(input_id) + segment_id = self.tokenizer.create_token_type_ids_from_sequences(tokens_a, tokens_b) + assert len(input_id) <= self.block_size + + # pad + while len(input_id) < self.block_size: + input_id.append(0) + attention_mask.append(0) + segment_id.append(0) + + input_ids.append(torch.tensor(input_id)) + segment_ids.append(torch.tensor(segment_id)) + attention_masks.append(torch.tensor(attention_mask)) labels.append(torch.tensor(1 if is_random_next else 0)) current_chunk = [] @@ -475,7 +489,7 @@ def create_examples_from_document( i += 1 - return input_ids, segment_ids, labels + return input_ids, segment_ids, attention_masks, labels def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ diff --git a/src/transformers/data/datasets/language_modeling.py b/src/transformers/data/datasets/language_modeling.py index 6c57843bdbd410..d4519aa4a6c1ba 100644 --- a/src/transformers/data/datasets/language_modeling.py +++ b/src/transformers/data/datasets/language_modeling.py @@ -126,7 +126,7 @@ def __init__( directory, filename = os.path.split(file_path) cached_features_file = os.path.join( - directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,), + directory, "cached_nsp_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,), ) self.tokenizer = tokenizer From e053bf563f623dfc5aa1dfa90a78efde46a6e6a8 Mon Sep 17 00:00:00 2001 From: hlz Date: Fri, 28 Aug 2020 11:48:37 +0800 Subject: [PATCH 5/6] add test for nsp data collator --- tests/test_data_collator.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/test_data_collator.py b/tests/test_data_collator.py index 41b3b371b944e9..08f29e36d88150 100644 --- a/tests/test_data_collator.py +++ b/tests/test_data_collator.py @@ -10,10 +10,12 @@ from transformers import ( DataCollatorForLanguageModeling, DataCollatorForPermutationLanguageModeling, + DataCollatorForNextSentencePrediction, GlueDataset, GlueDataTrainingArguments, LineByLineTextDataset, TextDataset, + TextDatasetForNextSentencePrediction, default_data_collator, ) @@ -150,3 +152,19 @@ def test_plm(self): with self.assertRaises(ValueError): # Expect error due to odd sequence length data_collator(example) + + def test_nsp(self): + tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + data_collator = DataCollatorForNextSentencePrediction(tokenizer) + + dataset = TextDatasetForNextSentencePrediction(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512) + examples = [dataset[i] for i in range(len(dataset))] + batch = data_collator(examples) + self.assertIsInstance(batch, dict) + + # Since there are randomly generated false samples, the total number of samples is not fixed. 
+ total_samples = batch["input_ids"].shape[0] + self.assertEqual(batch["input_ids"].shape, torch.Size((total_samples, 512))) + self.assertEqual(batch["token_type_ids"].shape, torch.Size((total_samples, 512))) + self.assertEqual(batch["masked_lm_labels"].shape, torch.Size((total_samples, 512))) + self.assertEqual(batch["next_sentence_label"].shape, torch.Size((total_samples,))) From d04da12d847fb106aae1d43995170c9aa2a8bb6f Mon Sep 17 00:00:00 2001 From: Lysandre Date: Mon, 31 Aug 2020 14:24:34 +0200 Subject: [PATCH 6/6] Style --- src/transformers/__init__.py | 10 +++++----- src/transformers/data/datasets/language_modeling.py | 13 +++++++++++-- tests/test_data_collator.py | 2 +- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 7f8b6194eb7efb..502da785557688 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -198,21 +198,21 @@ from .benchmark.benchmark import PyTorchBenchmark from .benchmark.benchmark_args import PyTorchBenchmarkArguments from .data.data_collator import ( - default_data_collator, DataCollator, DataCollatorForLanguageModeling, + DataCollatorForNextSentencePrediction, DataCollatorForPermutationLanguageModeling, DataCollatorWithPadding, - DataCollatorForNextSentencePrediction, + default_data_collator, ) from .data.datasets import ( GlueDataset, - TextDataset, - LineByLineTextDataset, - TextDatasetForNextSentencePrediction, GlueDataTrainingArguments, + LineByLineTextDataset, SquadDataset, SquadDataTrainingArguments, + TextDataset, + TextDatasetForNextSentencePrediction, ) from .generation_utils import top_k_top_p_filtering from .modeling_albert import ( diff --git a/src/transformers/data/datasets/language_modeling.py b/src/transformers/data/datasets/language_modeling.py index d4519aa4a6c1ba..1a377a60b155b0 100644 --- a/src/transformers/data/datasets/language_modeling.py +++ b/src/transformers/data/datasets/language_modeling.py @@ -118,7 +118,11 @@ class TextDatasetForNextSentencePrediction(Dataset): """ def __init__( - self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, overwrite_cache=False, + self, + tokenizer: PreTrainedTokenizer, + file_path: str, + block_size: int, + overwrite_cache=False, ): assert os.path.isfile(file_path), f"Input file path {file_path} not found" @@ -126,7 +130,12 @@ def __init__( directory, filename = os.path.split(file_path) cached_features_file = os.path.join( - directory, "cached_nsp_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,), + directory, + "cached_nsp_{}_{}_{}".format( + tokenizer.__class__.__name__, + str(block_size), + filename, + ), ) self.tokenizer = tokenizer diff --git a/tests/test_data_collator.py b/tests/test_data_collator.py index 08f29e36d88150..2ec65e573807d0 100644 --- a/tests/test_data_collator.py +++ b/tests/test_data_collator.py @@ -9,8 +9,8 @@ from transformers import ( DataCollatorForLanguageModeling, - DataCollatorForPermutationLanguageModeling, DataCollatorForNextSentencePrediction, + DataCollatorForPermutationLanguageModeling, GlueDataset, GlueDataTrainingArguments, LineByLineTextDataset,
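
The patches above contain only the implementation and its unit test. As a minimal usage sketch (not part of the patch series), the new dataset and collator can be wired together the same way the test does. The corpus file name below is hypothetical; its contents follow the input format documented in TextDatasetForNextSentencePrediction: one sentence per line, with a blank line between documents.

from transformers import (
    AutoTokenizer,
    DataCollatorForNextSentencePrediction,
    TextDatasetForNextSentencePrediction,
)

# Hypothetical corpus file in the documented format:
# one sentence per line, documents separated by a blank line.
corpus_path = "nsp_corpus.txt"
with open(corpus_path, "w", encoding="utf-8") as f:
    f.write(
        "I am very happy.\n"
        "Here is the second sentence.\n"
        "\n"
        "A new document.\n"
        "It also contains more than one sentence.\n"
    )

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Each dataset item is one document: a list of token-id lists, one list per sentence.
dataset = TextDatasetForNextSentencePrediction(tokenizer, file_path=corpus_path, block_size=512)

# The collator packs sentence pairs up to its block_size (512 by default after PATCH 4/6),
# applies MLM masking, and pads input_ids, token_type_ids and attention_mask to block_size.
data_collator = DataCollatorForNextSentencePrediction(tokenizer)

examples = [dataset[i] for i in range(len(dataset))]
batch = data_collator(examples)

# The number of pairs varies from run to run because random-next pairs are sampled.
print(batch["input_ids"].shape)            # (num_pairs, 512)
print(batch["token_type_ids"].shape)       # (num_pairs, 512)
print(batch["attention_mask"].shape)       # (num_pairs, 512)
print(batch["masked_lm_labels"].shape)     # (num_pairs, 512)
print(batch["next_sentence_label"].shape)  # (num_pairs,)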
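
Continuing the sketch above, decoding one collated pair shows the [CLS] A [SEP] B [SEP] layout produced by build_inputs_with_special_tokens for a BERT tokenizer; since mlm defaults to True, some tokens appear as [MASK]. The label convention comes straight from create_examples_from_document: 0 means sentence B actually followed sentence A in the document, 1 means B was drawn from a random document.

# Inspect the first collated pair from the batch built above.
real_tokens = batch["attention_mask"][0] == 1                        # drop the zero padding
decoded = tokenizer.decode(batch["input_ids"][0][real_tokens].tolist())
print(decoded)                                                       # "[CLS] ... [SEP] ... [SEP]" with some [MASK]s

# 0 -> B really followed A; 1 -> B was sampled from another document.
print(batch["next_sentence_label"][0].item())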