From 66640c1bfad4d17e2398ecfcdcd9d7b6c7b2f254 Mon Sep 17 00:00:00 2001 From: hlz Date: Fri, 21 Aug 2020 22:43:06 +0800 Subject: [PATCH 1/6] add datacollator and dataset for next sentence prediction task --- src/transformers/__init__.py | 2 + src/transformers/data/data_collator.py | 174 ++++++++++++++++++ src/transformers/data/datasets/__init__.py | 2 +- .../data/datasets/language_modeling.py | 80 ++++++++ 4 files changed, 257 insertions(+), 1 deletion(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index f2f4d7c2d11212..261350705245c2 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -445,11 +445,13 @@ DataCollatorForLanguageModeling, DataCollatorForPermutationLanguageModeling, DataCollatorWithPadding, + DataCollatorForNextSentencePrediction, ) from .data.datasets import ( GlueDataset, TextDataset, LineByLineTextDataset, + TextDatasetForNextSentencePrediction, GlueDataTrainingArguments, SquadDataset, SquadDataTrainingArguments, diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index 11b8535096af35..27bd279321be3f 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -1,3 +1,4 @@ +import random from dataclasses import dataclass from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union @@ -313,3 +314,176 @@ def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, ) & masked_indices[i] return inputs, perm_mask, target_mapping, labels + + +@dataclass +class DataCollatorForNextSentencePrediction: + """ + Data collator used for language modeling. + - collates batches of tensors, honoring their tokenizer's pad_token + - preprocesses batches for masked language modeling + """ + + tokenizer: PreTrainedTokenizer + mlm: bool = True + block_size: int = 128 + short_seq_probability: float = 0.1 + nsp_probability: float = 0.5 + mlm_probability: float = 0.15 + + def __call__(self, examples: List[List[List[int]]]) -> Dict[str, torch.Tensor]: + input_ids = [] + segment_ids = [] + nsp_labels = [] + + for i, doc in enumerate(examples): + input_id, segment_id, label = self.create_examples_from_document(doc, i, examples) + input_ids.extend(input_id) + segment_ids.extend(segment_id) + nsp_labels.extend(label) + if self.mlm: + input_ids, mlm_labels = self.mask_tokens(self._tensorize_batch(input_ids)) + else: + input_ids = self._tensorize_batch(input_ids) + + return { + "input_ids": input_ids, + "token_type_ids": self._tensorize_batch(segment_ids), + "masked_lm_labels": mlm_labels if self.mlm else None, + "next_sentence_label": torch.tensor(nsp_labels), + } + + def _tensorize_batch(self, examples: List[torch.Tensor]) -> torch.Tensor: + length_of_first = examples[0].size(0) + are_tensors_same_length = all(x.size(0) == length_of_first for x in examples) + if are_tensors_same_length: + return torch.stack(examples, dim=0) + else: + if self.tokenizer._pad_token is None: + raise ValueError( + "You are attempting to pad samples but the tokenizer you are using" + f" ({self.tokenizer.__class__.__name__}) does not have one." 
+ ) + return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id) + + def create_examples_from_document( + self, document: List[List[int]], doc_index: int, examples: List[List[List[int]]] + ): + """Creates examples for a single document.""" + + max_num_tokens = self.block_size - self.tokenizer.num_special_tokens_to_add(pair=False) + + # We *usually* want to fill up the entire sequence since we are padding + # to `block_size` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `block_size` is a hard limit. + target_seq_length = max_num_tokens + if random.random() < self.short_seq_probability: + target_seq_length = random.randint(2, max_num_tokens) + + current_chunk = [] # a buffer stored current working segments + current_length = 0 + i = 0 + input_ids = [] + segment_ids = [] + labels = [] + while i < len(document): + segment = document[i] + current_chunk.append(segment) + current_length += len(segment) + if i == len(document) - 1 or current_length >= target_seq_length: + if current_chunk: + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. + a_end = 1 + if len(current_chunk) >= 2: + a_end = random.randint(1, len(current_chunk) - 1) + + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) + + tokens_b = [] + + if len(current_chunk) == 1 or random.random() < self.nsp_probability: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + + # This should rarely go for more than one iteration for large + # corpora. However, just to be careful, we try to make sure that + # the random document is not the same as the document + # we're processing. + for _ in range(10): + random_document_index = random.randint(0, len(examples) - 1) + if random_document_index != doc_index: + break + + random_document = examples[random_document_index] + random_start = random.randint(0, len(random_document) - 1) + for j in range(random_start, len(random_document)): + tokens_b.extend(random_document[j]) + if len(tokens_b) >= target_b_length: + break + # We didn't actually use these segments so we "put them back" so + # they don't go to waste. + num_unused_segments = len(current_chunk) - a_end + i -= num_unused_segments + # Actual next + else: + is_random_next = False + for j in range(a_end, len(current_chunk)): + tokens_b.extend(current_chunk[j]) + + assert len(tokens_a) >= 1 + assert len(tokens_b) >= 1 + + input_ids.append(torch.tensor(self.tokenizer.build_inputs_with_special_tokens(tokens_a, tokens_b))) + segment_ids.append( + torch.tensor(self.tokenizer.create_token_type_ids_from_sequences(tokens_a, tokens_b)) + ) + labels.append(torch.tensor(1 if is_random_next else 0)) + + current_chunk = [] + current_length = 0 + + i += 1 + + return input_ids, segment_ids, labels + + def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. + """ + + if self.tokenizer.mask_token is None: + raise ValueError( + "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer." 
+ ) + + labels = inputs.clone() + # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) + probability_matrix = torch.full(labels.shape, self.mlm_probability) + special_tokens_mask = [ + self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() + ] + probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) + if self.tokenizer._pad_token is not None: + padding_mask = labels.eq(self.tokenizer.pad_token_id) + probability_matrix.masked_fill_(padding_mask, value=0.0) + masked_indices = torch.bernoulli(probability_matrix).bool() + labels[~masked_indices] = -100 # We only compute loss on masked tokens + + # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) + indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices + inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) + + # 10% of the time, we replace masked input tokens with random word + indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced + random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long) + inputs[indices_random] = random_words[indices_random] + + # The rest of the time (10% of the time) we keep the masked input tokens unchanged + return inputs, labels diff --git a/src/transformers/data/datasets/__init__.py b/src/transformers/data/datasets/__init__.py index ca2ab15e43fbeb..f4e2aac5e968c1 100644 --- a/src/transformers/data/datasets/__init__.py +++ b/src/transformers/data/datasets/__init__.py @@ -3,5 +3,5 @@ # module, but to preserve other warnings. So, don't check this module at all. from .glue import GlueDataset, GlueDataTrainingArguments -from .language_modeling import LineByLineTextDataset, TextDataset +from .language_modeling import LineByLineTextDataset, TextDataset, TextDatasetForNextSentencePrediction from .squad import SquadDataset, SquadDataTrainingArguments diff --git a/src/transformers/data/datasets/language_modeling.py b/src/transformers/data/datasets/language_modeling.py index 5a9aeb2225b56f..7daa430a88dbe4 100644 --- a/src/transformers/data/datasets/language_modeling.py +++ b/src/transformers/data/datasets/language_modeling.py @@ -99,3 +99,83 @@ def __len__(self): def __getitem__(self, i) -> torch.Tensor: return torch.tensor(self.examples[i], dtype=torch.long) + + +class TextDatasetForNextSentencePrediction(Dataset): + """ + This will be superseded by a framework-agnostic approach + soon. + """ + + def __init__( + self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, overwrite_cache=False, + ): + assert os.path.isfile(file_path), f"Input file path {file_path} not found" + + block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False) + + directory, filename = os.path.split(file_path) + cached_features_file = os.path.join( + directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,), + ) + + self.block_size = block_size + self.tokenizer = tokenizer + self.examples = [] + + # Make sure only the first process in distributed training processes the dataset, + # and the others will use the cache. + lock_path = cached_features_file + ".lock" + + # Input file format: + # (1) One sentence per line. These should ideally be actual sentences, not + # entire paragraphs or arbitrary spans of text. 
(Because we use the + # sentence boundaries for the "next sentence prediction" task). + # (2) Blank lines between documents. Document boundaries are needed so + # that the "next sentence prediction" task doesn't span between documents. + # + # Example: + # I am very happy. + # Here is the second sentence. + # + # A new document. + + with FileLock(lock_path): + if os.path.exists(cached_features_file) and not overwrite_cache: + start = time.time() + with open(cached_features_file, "rb") as handle: + self.examples = pickle.load(handle) + logger.info( + f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start + ) + else: + logger.info(f"Creating features from dataset file at {directory}") + + self.examples = [[]] + with open(file_path, encoding="utf-8") as f: + while True: + line = f.readline() + if not line: + break + line = line.strip() + + # Empty lines are used as document delimiters + if not line: + self.examples.append([]) + tokens = tokenizer.tokenize(line) + tokens = tokenizer.convert_tokens_to_ids(tokens) + if tokens: + self.examples[-1].append(tokens) + + start = time.time() + with open(cached_features_file, "wb") as handle: + pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL) + logger.info( + "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start + ) + + def __len__(self): + return len(self.examples) + + def __getitem__(self, i): + return self.examples[i] From 9ab360fc73e4d2132f211d4749e399f16b1dac78 Mon Sep 17 00:00:00 2001 From: hlz Date: Mon, 24 Aug 2020 21:48:21 +0800 Subject: [PATCH 2/6] bug fix (numbers of special tokens & truncate sequences) --- src/transformers/data/data_collator.py | 9 ++++++++- src/transformers/data/datasets/language_modeling.py | 5 ++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index 27bd279321be3f..d5b7b6522c4411 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -371,7 +371,7 @@ def create_examples_from_document( ): """Creates examples for a single document.""" - max_num_tokens = self.block_size - self.tokenizer.num_special_tokens_to_add(pair=False) + max_num_tokens = self.block_size - self.tokenizer.num_special_tokens_to_add(pair=True) # We *usually* want to fill up the entire sequence since we are padding # to `block_size` anyways, so short sequences are generally wasted @@ -440,6 +440,13 @@ def create_examples_from_document( assert len(tokens_a) >= 1 assert len(tokens_b) >= 1 + tokens_a, tokens_b, _ = self.tokenizer.truncate_sequences( + tokens_a, + tokens_b, + num_tokens_to_remove=len(tokens_a) + len(tokens_b) - max_num_tokens, + truncation_strategy="longest_first", + ) + input_ids.append(torch.tensor(self.tokenizer.build_inputs_with_special_tokens(tokens_a, tokens_b))) segment_ids.append( torch.tensor(self.tokenizer.create_token_type_ids_from_sequences(tokens_a, tokens_b)) diff --git a/src/transformers/data/datasets/language_modeling.py b/src/transformers/data/datasets/language_modeling.py index 7daa430a88dbe4..c609a1a766cdd8 100644 --- a/src/transformers/data/datasets/language_modeling.py +++ b/src/transformers/data/datasets/language_modeling.py @@ -112,14 +112,13 @@ def __init__( ): assert os.path.isfile(file_path), f"Input file path {file_path} not found" - block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False) + block_size = block_size - tokenizer.num_special_tokens_to_add(pair=True) 
directory, filename = os.path.split(file_path) cached_features_file = os.path.join( directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,), ) - self.block_size = block_size self.tokenizer = tokenizer self.examples = [] @@ -160,7 +159,7 @@ def __init__( line = line.strip() # Empty lines are used as document delimiters - if not line: + if not line and len(self.examples[-1]) != 0: self.examples.append([]) tokens = tokenizer.tokenize(line) tokens = tokenizer.convert_tokens_to_ids(tokens) From df92ad885ecd54570bf3a5493f0c06e72d5a1b6c Mon Sep 17 00:00:00 2001 From: hlz Date: Mon, 24 Aug 2020 22:22:11 +0800 Subject: [PATCH 3/6] bug fix (+ dict inputs support for data collator) --- src/transformers/data/data_collator.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index d5b7b6522c4411..ab001b331a0a93 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -331,7 +331,10 @@ class DataCollatorForNextSentencePrediction: nsp_probability: float = 0.5 mlm_probability: float = 0.15 - def __call__(self, examples: List[List[List[int]]]) -> Dict[str, torch.Tensor]: + def __call__(self, examples: List[Union[List[List[int]], Dict[str, torch.Tensor]]]) -> Dict[str, torch.Tensor]: + if isinstance(examples[0], (dict, BatchEncoding)): + examples = [e["input_ids"] for e in examples] + input_ids = [] segment_ids = [] nsp_labels = [] From c4e242df4c28ffb0cf56938c553066b4af6b9cd1 Mon Sep 17 00:00:00 2001 From: hlz Date: Fri, 28 Aug 2020 11:48:25 +0800 Subject: [PATCH 4/6] add padding for nsp data collator; renamed cached files to avoid conflict. --- src/transformers/data/data_collator.py | 28 ++++++++++++++----- .../data/datasets/language_modeling.py | 2 +- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index d1ed22ed1520cf..ceb36ed74f6e10 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -340,7 +340,7 @@ class DataCollatorForNextSentencePrediction: tokenizer: PreTrainedTokenizer mlm: bool = True - block_size: int = 128 + block_size: int = 512 short_seq_probability: float = 0.1 nsp_probability: float = 0.5 mlm_probability: float = 0.15 @@ -351,12 +351,14 @@ def __call__(self, examples: List[Union[List[List[int]], Dict[str, torch.Tensor] input_ids = [] segment_ids = [] + attention_masks = [] nsp_labels = [] for i, doc in enumerate(examples): - input_id, segment_id, label = self.create_examples_from_document(doc, i, examples) + input_id, segment_id, attention_mask, label = self.create_examples_from_document(doc, i, examples) input_ids.extend(input_id) segment_ids.extend(segment_id) + attention_masks.extend(attention_mask) nsp_labels.extend(label) if self.mlm: input_ids, mlm_labels = self.mask_tokens(self._tensorize_batch(input_ids)) @@ -365,6 +367,7 @@ def __call__(self, examples: List[Union[List[List[int]], Dict[str, torch.Tensor] return { "input_ids": input_ids, + "attention_mask": self._tensorize_batch(attention_masks), "token_type_ids": self._tensorize_batch(segment_ids), "masked_lm_labels": mlm_labels if self.mlm else None, "next_sentence_label": torch.tensor(nsp_labels), @@ -406,6 +409,7 @@ def create_examples_from_document( i = 0 input_ids = [] segment_ids = [] + attention_masks = [] labels = [] while i < len(document): segment = document[i] @@ -464,10 +468,20 @@ def 
create_examples_from_document( truncation_strategy="longest_first", ) - input_ids.append(torch.tensor(self.tokenizer.build_inputs_with_special_tokens(tokens_a, tokens_b))) - segment_ids.append( - torch.tensor(self.tokenizer.create_token_type_ids_from_sequences(tokens_a, tokens_b)) - ) + input_id = self.tokenizer.build_inputs_with_special_tokens(tokens_a, tokens_b) + attention_mask = [1] * len(input_id) + segment_id = self.tokenizer.create_token_type_ids_from_sequences(tokens_a, tokens_b) + assert len(input_id) <= self.block_size + + # pad + while len(input_id) < self.block_size: + input_id.append(0) + attention_mask.append(0) + segment_id.append(0) + + input_ids.append(torch.tensor(input_id)) + segment_ids.append(torch.tensor(segment_id)) + attention_masks.append(torch.tensor(attention_mask)) labels.append(torch.tensor(1 if is_random_next else 0)) current_chunk = [] @@ -475,7 +489,7 @@ def create_examples_from_document( i += 1 - return input_ids, segment_ids, labels + return input_ids, segment_ids, attention_masks, labels def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ diff --git a/src/transformers/data/datasets/language_modeling.py b/src/transformers/data/datasets/language_modeling.py index 6c57843bdbd410..d4519aa4a6c1ba 100644 --- a/src/transformers/data/datasets/language_modeling.py +++ b/src/transformers/data/datasets/language_modeling.py @@ -126,7 +126,7 @@ def __init__( directory, filename = os.path.split(file_path) cached_features_file = os.path.join( - directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,), + directory, "cached_nsp_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,), ) self.tokenizer = tokenizer From e053bf563f623dfc5aa1dfa90a78efde46a6e6a8 Mon Sep 17 00:00:00 2001 From: hlz Date: Fri, 28 Aug 2020 11:48:37 +0800 Subject: [PATCH 5/6] add test for nsp data collator --- tests/test_data_collator.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/test_data_collator.py b/tests/test_data_collator.py index 41b3b371b944e9..08f29e36d88150 100644 --- a/tests/test_data_collator.py +++ b/tests/test_data_collator.py @@ -10,10 +10,12 @@ from transformers import ( DataCollatorForLanguageModeling, DataCollatorForPermutationLanguageModeling, + DataCollatorForNextSentencePrediction, GlueDataset, GlueDataTrainingArguments, LineByLineTextDataset, TextDataset, + TextDatasetForNextSentencePrediction, default_data_collator, ) @@ -150,3 +152,19 @@ def test_plm(self): with self.assertRaises(ValueError): # Expect error due to odd sequence length data_collator(example) + + def test_nsp(self): + tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + data_collator = DataCollatorForNextSentencePrediction(tokenizer) + + dataset = TextDatasetForNextSentencePrediction(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512) + examples = [dataset[i] for i in range(len(dataset))] + batch = data_collator(examples) + self.assertIsInstance(batch, dict) + + # Since there are randomly generated false samples, the total number of samples is not fixed. 
+ total_samples = batch["input_ids"].shape[0] + self.assertEqual(batch["input_ids"].shape, torch.Size((total_samples, 512))) + self.assertEqual(batch["token_type_ids"].shape, torch.Size((total_samples, 512))) + self.assertEqual(batch["masked_lm_labels"].shape, torch.Size((total_samples, 512))) + self.assertEqual(batch["next_sentence_label"].shape, torch.Size((total_samples,))) From d04da12d847fb106aae1d43995170c9aa2a8bb6f Mon Sep 17 00:00:00 2001 From: Lysandre Date: Mon, 31 Aug 2020 14:24:34 +0200 Subject: [PATCH 6/6] Style --- src/transformers/__init__.py | 10 +++++----- src/transformers/data/datasets/language_modeling.py | 13 +++++++++++-- tests/test_data_collator.py | 2 +- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 7f8b6194eb7efb..502da785557688 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -198,21 +198,21 @@ from .benchmark.benchmark import PyTorchBenchmark from .benchmark.benchmark_args import PyTorchBenchmarkArguments from .data.data_collator import ( - default_data_collator, DataCollator, DataCollatorForLanguageModeling, + DataCollatorForNextSentencePrediction, DataCollatorForPermutationLanguageModeling, DataCollatorWithPadding, - DataCollatorForNextSentencePrediction, + default_data_collator, ) from .data.datasets import ( GlueDataset, - TextDataset, - LineByLineTextDataset, - TextDatasetForNextSentencePrediction, GlueDataTrainingArguments, + LineByLineTextDataset, SquadDataset, SquadDataTrainingArguments, + TextDataset, + TextDatasetForNextSentencePrediction, ) from .generation_utils import top_k_top_p_filtering from .modeling_albert import ( diff --git a/src/transformers/data/datasets/language_modeling.py b/src/transformers/data/datasets/language_modeling.py index d4519aa4a6c1ba..1a377a60b155b0 100644 --- a/src/transformers/data/datasets/language_modeling.py +++ b/src/transformers/data/datasets/language_modeling.py @@ -118,7 +118,11 @@ class TextDatasetForNextSentencePrediction(Dataset): """ def __init__( - self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, overwrite_cache=False, + self, + tokenizer: PreTrainedTokenizer, + file_path: str, + block_size: int, + overwrite_cache=False, ): assert os.path.isfile(file_path), f"Input file path {file_path} not found" @@ -126,7 +130,12 @@ def __init__( directory, filename = os.path.split(file_path) cached_features_file = os.path.join( - directory, "cached_nsp_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,), + directory, + "cached_nsp_{}_{}_{}".format( + tokenizer.__class__.__name__, + str(block_size), + filename, + ), ) self.tokenizer = tokenizer diff --git a/tests/test_data_collator.py b/tests/test_data_collator.py index 08f29e36d88150..2ec65e573807d0 100644 --- a/tests/test_data_collator.py +++ b/tests/test_data_collator.py @@ -9,8 +9,8 @@ from transformers import ( DataCollatorForLanguageModeling, - DataCollatorForPermutationLanguageModeling, DataCollatorForNextSentencePrediction, + DataCollatorForPermutationLanguageModeling, GlueDataset, GlueDataTrainingArguments, LineByLineTextDataset,
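
The patches above contain only the implementation and its unit test. As a minimal usage sketch (not part of the patch series), the new dataset and collator can be wired together the same way the test does. The corpus file name below is hypothetical; its contents follow the input format documented in TextDatasetForNextSentencePrediction: one sentence per line, with a blank line between documents.

from transformers import (
    AutoTokenizer,
    DataCollatorForNextSentencePrediction,
    TextDatasetForNextSentencePrediction,
)

# Hypothetical corpus file in the documented format:
# one sentence per line, documents separated by a blank line.
corpus_path = "nsp_corpus.txt"
with open(corpus_path, "w", encoding="utf-8") as f:
    f.write(
        "I am very happy.\n"
        "Here is the second sentence.\n"
        "\n"
        "A new document.\n"
        "It also contains more than one sentence.\n"
    )

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Each dataset item is one document: a list of token-id lists, one list per sentence.
dataset = TextDatasetForNextSentencePrediction(tokenizer, file_path=corpus_path, block_size=512)

# The collator packs sentence pairs up to its block_size (512 by default after PATCH 4/6),
# applies MLM masking, and pads input_ids, token_type_ids and attention_mask to block_size.
data_collator = DataCollatorForNextSentencePrediction(tokenizer)

examples = [dataset[i] for i in range(len(dataset))]
batch = data_collator(examples)

# The number of pairs varies from run to run because random-next pairs are sampled.
print(batch["input_ids"].shape)            # (num_pairs, 512)
print(batch["token_type_ids"].shape)       # (num_pairs, 512)
print(batch["attention_mask"].shape)       # (num_pairs, 512)
print(batch["masked_lm_labels"].shape)     # (num_pairs, 512)
print(batch["next_sentence_label"].shape)  # (num_pairs,)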
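
Continuing the sketch above, decoding one collated pair shows the [CLS] A [SEP] B [SEP] layout produced by build_inputs_with_special_tokens for a BERT tokenizer; since mlm defaults to True, some tokens appear as [MASK]. The label convention comes straight from create_examples_from_document: 0 means sentence B actually followed sentence A in the document, 1 means B was drawn from a random document.

# Inspect the first collated pair from the batch built above.
real_tokens = batch["attention_mask"][0] == 1                        # drop the zero padding
decoded = tokenizer.decode(batch["input_ids"][0][real_tokens].tolist())
print(decoded)                                                       # "[CLS] ... [SEP] ... [SEP]" with some [MASK]s

# 0 -> B really followed A; 1 -> B was sampled from another document.
print(batch["next_sentence_label"][0].item())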