
Only access loss tensor every logging_steps #6802

Merged: 23 commits, Aug 31, 2020
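The change described by the PR title lives in the `Trainer` training loop, which is not part of the file list below. As a hedged sketch of the pattern (keep the running loss as a device tensor and call `.item()`, a host sync that is expensive on TPU, only once every `logging_steps`), it might look like this; `train_loop` and its arguments are illustrative, not code from this PR:

```python
import torch

def train_loop(model, optimizer, dataloader, logging_steps: int = 500):
    device = next(model.parameters()).device
    tr_loss = torch.tensor(0.0, device=device)  # running loss stays on-device
    logging_loss_scalar = 0.0
    for step, batch in enumerate(dataloader, start=1):
        loss = model(**batch)[0]     # assume the model returns the loss first
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        tr_loss += loss.detach()     # no .item() here, so no per-step device sync
        if step % logging_steps == 0:
            tr_loss_scalar = tr_loss.item()  # one host sync per logging window
            print({"loss": (tr_loss_scalar - logging_loss_scalar) / logging_steps})
            logging_loss_scalar = tr_loss_scalar
```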
Commits (23)
18fc69a
Only access loss tensor every logging_steps
jysohn23 Aug 28, 2020
20f7786
Fix style (#6803)
sshleifer Aug 28, 2020
3cac867
t5 model should make decoder_attention_mask (#6800)
sshleifer Aug 28, 2020
5ab21b0
[s2s] Test hub configs in self-scheduled CI (#6809)
sshleifer Aug 28, 2020
ac47458
[s2s] round runtime in run_eval (#6798)
sshleifer Aug 29, 2020
0f58903
Pegasus finetune script: add --adafactor (#6811)
sshleifer Aug 29, 2020
22933e6
[bart] rename self-attention -> attention (#6708)
sshleifer Aug 29, 2020
563485b
[tests] fix typos in inputs (#6818)
stas00 Aug 30, 2020
a584761
Fixed open in colab link (#6825)
PandaWhoCodes Aug 30, 2020
d176aaa
Add model card for singbert lite. Update widget for singbert and sing…
zyuanlim Aug 30, 2020
0eecace
BR_BERTo model card (#6793)
rdenadai Aug 30, 2020
32fe440
clearly indicate shuffle=False (#6312)
xujiaze13 Aug 30, 2020
dfa10a4
[s2s README] Add more dataset download instructions (#6737)
sshleifer Aug 30, 2020
0e83769
Style
LysandreJik Aug 31, 2020
05c3214
Patch logging issue
LysandreJik Aug 31, 2020
4561f05
Set default logging level to `WARNING` instead of `INFO`
LysandreJik Aug 31, 2020
895d394
TF Flaubert w/ pre-norm (#6841)
LysandreJik Aug 31, 2020
2de7ee0
Dataset and DataCollator for BERT Next Sentence Prediction (NSP) task…
mojave-pku Aug 31, 2020
d2f9cb8
Fix in Adafactor docstrings (#6845)
sgugger Aug 31, 2020
c48546c
Fix resuming training for Windows (#6847)
sgugger Aug 31, 2020
ac03af4
Only access loss tensor every logging_steps
jysohn23 Aug 28, 2020
db74df3
Merge branch 'tpu-mlm' of https://github.com/jysohn23/transformers in…
jysohn23 Aug 31, 2020
2b981cd
comments
jysohn23 Aug 31, 2020
4 changes: 2 additions & 2 deletions examples/lightning_base.py
@@ -178,10 +178,10 @@ def train_dataloader(self):
return self.train_loader

def val_dataloader(self):
return self.get_dataloader("dev", self.hparams.eval_batch_size)
return self.get_dataloader("dev", self.hparams.eval_batch_size, shuffle=False)

def test_dataloader(self):
return self.get_dataloader("test", self.hparams.eval_batch_size)
return self.get_dataloader("test", self.hparams.eval_batch_size, shuffle=False)

def _feature_file(self, mode):
return os.path.join(
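For context, the calls above assume a `get_dataloader` hook that accepts a `shuffle` flag. A minimal sketch of such a hook follows; the class name and `get_dataset` helper are assumptions for illustration, not part of this diff:

```python
from torch.utils.data import DataLoader, Dataset

class BaseModuleSketch:
    """Hedged sketch of the hook the calls above rely on, not the real class."""

    def get_dataset(self, mode: str) -> Dataset:  # assumed helper
        raise NotImplementedError

    def get_dataloader(self, mode: str, batch_size: int, shuffle: bool = False) -> DataLoader:
        # Passing shuffle=False explicitly for "dev" and "test" makes it clear
        # that evaluation order is never randomized.
        return DataLoader(self.get_dataset(mode), batch_size=batch_size, shuffle=shuffle)
```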
32 changes: 23 additions & 9 deletions examples/seq2seq/README.md
@@ -6,8 +6,9 @@ Please tag @sshleifer with any issues/unexpected behaviors, or send a PR!
For `bertabs` instructions, see [`bertabs/README.md`](bertabs/README.md).


### Data
XSUM Data:
## Datasets

#### XSUM:
```bash
cd examples/seq2seq
wget https://s3.amazonaws.com/datasets.huggingface.co/summarization/xsum.tar.gz
@@ -17,23 +18,33 @@ export XSUM_DIR=${PWD}/xsum
this should make a directory called `xsum/` with files like `test.source`.
To use your own data, copy that files format. Each article to be summarized is on its own line.

CNN/DailyMail data
#### CNN/DailyMail
```bash
cd examples/seq2seq
wget https://s3.amazonaws.com/datasets.huggingface.co/summarization/cnn_dm.tgz
tar -xzvf cnn_dm.tgz
wget https://s3.amazonaws.com/datasets.huggingface.co/summarization/cnn_dm_v2.tgz
tar -xzvf cnn_dm_v2.tgz # empty lines removed
mv cnn_cln cnn_dm
export CNN_DIR=${PWD}/cnn_dm
this should make a directory called `cnn_dm/` with files like `test.source`.
```
this should make a directory called `cnn_dm/` with 6 files.

WMT16 English-Romanian Translation Data:
#### WMT16 English-Romanian Translation Data:
download with this command:
```bash
wget https://s3.amazonaws.com/datasets.huggingface.co/translation/wmt_en_ro.tar.gz
tar -xzvf wmt_en_ro.tar.gz
export ENRO_DIR=${PWD}/wmt_en_ro
this should make a directory called `wmt_en_ro/` with files like `test.source`.
```
this should make a directory called `wmt_en_ro/` with 6 files.

#### WMT English-German:
```bash
wget https://s3.amazonaws.com/datasets.huggingface.co/translation/wmt_en_de.tgz
tar -xzvf wmt_en_de.tar.gz
export DATA_DIR=${PWD}/wmt_en_de
```

#### Private Data

If you are using your own data, it must be formatted as one directory with 6 files:
```
train.source
train.target
val.source
val.target
test.source
test.target
```

@@ -75,7 +86,8 @@ Datasets: `LegacySeq2SeqDataset` will be used for all tokenizers without a `prep
Future work/help wanted: A new dataset to support multilingual tasks.


### Command Line Options
### Finetuning Scripts
All finetuning bash scripts call finetune.py (or distillation.py) with reasonable command line arguments. They usually require extra command line arguments to work.

To see all the possible command line options, run:

Expand Down Expand Up @@ -110,6 +122,8 @@ The following command should work on a 16GB GPU:
--model_name_or_path facebook/bart-large
```

There is a starter finetuning script for pegasus at `finetune_pegasus_xsum.sh`.

### Translation Finetuning

First, follow the wmt_en_ro download instructions.
2 changes: 1 addition & 1 deletion examples/seq2seq/finetune_pegasus_xsum.sh
@@ -10,5 +10,5 @@ python finetune.py \
--n_val 1000 \
--val_check_interval 0.25 \
--max_source_length 512 --max_target_length 56 \
--freeze_embeds --max_target_length 56 --label_smoothing 0.1 \
--freeze_embeds --label_smoothing 0.1 --adafactor --task summarization_xsum \
"$@"
2 changes: 1 addition & 1 deletion examples/seq2seq/run_eval.py
@@ -67,7 +67,7 @@ def generate_summaries_or_translations(
fout.write(hypothesis + "\n")
fout.flush()
fout.close()
runtime = time.time() - start_time
runtime = int(time.time() - start_time) # seconds
n_obs = len(examples)
return dict(n_obs=n_obs, runtime=runtime, seconds_per_sample=round(runtime / n_obs, 4))

24 changes: 22 additions & 2 deletions examples/seq2seq/test_seq2seq_examples.py
@@ -13,9 +13,10 @@
from torch.utils.data import DataLoader

import lightning_base
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer
from transformers.hf_api import HfApi
from transformers.modeling_bart import shift_tokens_right
from transformers.testing_utils import CaptureStderr, CaptureStdout, require_multigpu
from transformers.testing_utils import CaptureStderr, CaptureStdout, require_multigpu, require_torch_and_cuda, slow

from .distillation import distill_main, evaluate_checkpoint
from .finetune import SummarizationModule, main
@@ -116,6 +117,25 @@ def setUpClass(cls):
logging.disable(logging.CRITICAL) # remove noisy download output from tracebacks
return cls

@slow
@require_torch_and_cuda
def test_hub_configs(self):
"""I put require_torch_and_cuda cause I only want this to run with self-scheduled."""

model_list = HfApi().model_list()
org = "sshleifer"
model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
allowed_to_be_broken = ["sshleifer/blenderbot-3B", "sshleifer/blenderbot-90M"]
failures = []
for m in model_ids:
if m in allowed_to_be_broken:
continue
try:
AutoConfig.from_pretrained(m)
except Exception:
failures.append(m)
assert not failures, f"The following models could not be loaded through AutoConfig: {failures}"

@require_multigpu
def test_multigpu(self):
updates = dict(
14 changes: 9 additions & 5 deletions model_cards/rdenadai/BR_BERTo/README.md
@@ -14,13 +14,17 @@ Portuguese (Brazil) model for text inference.

## Params

Trained on a corpus of 5_258_624 sentences, with 132_807_374 non unique tokens (992_418 unique tokens).
Trained on a corpus of 6_993_330 sentences.

- Vocab size: 220_000
- RobertaForMaskedLM size : 32
- Num train epochs: 2
- Time to train: ~23hs (on GCP with a Nvidia T4)
- Vocab size: 150_000
- RobertaForMaskedLM size : 512
- Num train epochs: 3
- Time to train: ~10days (on GCP with a Nvidia T4)

I follow the great tutorial from HuggingFace team:

[How to train a new language model from scratch using Transformers and Tokenizers](https://huggingface.co/blog/how-to-train)

More info here:

[BR_BERTo](https://github.com/rdenadai/BR-BERTo)
6 changes: 3 additions & 3 deletions model_cards/zanelim/singbert-large-sg/README.md
@@ -13,17 +13,17 @@ datasets:
- reddit singapore, malaysia
- hardwarezone
widget:
- text: "die [MASK] must try"
- text: "kopi c siew [MASK]"
- text: "die [MASK] must try"
---

# Model name

SingBert - Bert for Singlish (SG) and Manglish (MY).
SingBert Large - Bert for Singlish (SG) and Manglish (MY).

## Model description

Similar to [SingBert](https://huggingface.co/zanelim/singbert) but initialized from [BERT large uncased (whole word masking)](https://github.com/google-research/bert#pre-trained-models), with pre-training finetuned on
Similar to [SingBert](https://huggingface.co/zanelim/singbert) but the large version, which was initialized from [BERT large uncased (whole word masking)](https://github.com/google-research/bert#pre-trained-models), with pre-training finetuned on
[singlish](https://en.wikipedia.org/wiki/Singlish) and [manglish](https://en.wikipedia.org/wiki/Manglish) data.

## Intended uses & limitations
168 changes: 168 additions & 0 deletions model_cards/zanelim/singbert-lite-sg/README.md
@@ -0,0 +1,168 @@
---
language: en
tags:
- singapore
- sg
- singlish
- malaysia
- ms
- manglish
- albert-base-v2
license: mit
datasets:
- reddit singapore, malaysia
- hardwarezone
widget:
- text: "dont play [MASK] leh"
- text: "die [MASK] must try"
---

# Model name

SingBert Lite - Bert for Singlish (SG) and Manglish (MY).

## Model description

Similar to [SingBert](https://huggingface.co/zanelim/singbert) but the lite-version, which was initialized from [Albert base v2](https://github.com/google-research/albert#albert), with pre-training finetuned on
[singlish](https://en.wikipedia.org/wiki/Singlish) and [manglish](https://en.wikipedia.org/wiki/Manglish) data.

## Intended uses & limitations

#### How to use

```python
>>> from transformers import pipeline
>>> nlp = pipeline('fill-mask', model='zanelim/singbert-lite-sg')
>>> nlp("die [MASK] must try")

[{'sequence': '[CLS] die die must try[SEP]',
'score': 0.7731555700302124,
'token': 1327,
'token_str': '▁die'},
{'sequence': '[CLS] die also must try[SEP]',
'score': 0.04763784259557724,
'token': 67,
'token_str': '▁also'},
{'sequence': '[CLS] die still must try[SEP]',
'score': 0.01859409362077713,
'token': 174,
'token_str': '▁still'},
{'sequence': '[CLS] die u must try[SEP]',
'score': 0.015824034810066223,
'token': 287,
'token_str': '▁u'},
{'sequence': '[CLS] die is must try[SEP]',
'score': 0.011271446943283081,
'token': 25,
'token_str': '▁is'}]

>>> nlp("dont play [MASK] leh")

[{'sequence': '[CLS] dont play play leh[SEP]',
'score': 0.4365769624710083,
'token': 418,
'token_str': '▁play'},
{'sequence': '[CLS] dont play punk leh[SEP]',
'score': 0.06880936771631241,
'token': 6769,
'token_str': '▁punk'},
{'sequence': '[CLS] dont play game leh[SEP]',
'score': 0.051739856600761414,
'token': 250,
'token_str': '▁game'},
{'sequence': '[CLS] dont play games leh[SEP]',
'score': 0.045703962445259094,
'token': 466,
'token_str': '▁games'},
{'sequence': '[CLS] dont play around leh[SEP]',
'score': 0.013458190485835075,
'token': 140,
'token_str': '▁around'}]

>>> nlp("catch no [MASK]")

[{'sequence': '[CLS] catch no ball[SEP]',
'score': 0.6197211146354675,
'token': 1592,
'token_str': '▁ball'},
{'sequence': '[CLS] catch no balls[SEP]',
'score': 0.08441998809576035,
'token': 7152,
'token_str': '▁balls'},
{'sequence': '[CLS] catch no joke[SEP]',
'score': 0.0676785409450531,
'token': 8186,
'token_str': '▁joke'},
{'sequence': '[CLS] catch no?[SEP]',
'score': 0.040638409554958344,
'token': 60,
'token_str': '?'},
{'sequence': '[CLS] catch no one[SEP]',
'score': 0.03546864539384842,
'token': 53,
'token_str': '▁one'}]

>>> nlp("confirm plus [MASK]")

[{'sequence': '[CLS] confirm plus chop[SEP]',
'score': 0.9608421921730042,
'token': 17144,
'token_str': '▁chop'},
{'sequence': '[CLS] confirm plus guarantee[SEP]',
'score': 0.011784233152866364,
'token': 9120,
'token_str': '▁guarantee'},
{'sequence': '[CLS] confirm plus confirm[SEP]',
'score': 0.010571340098977089,
'token': 10265,
'token_str': '▁confirm'},
{'sequence': '[CLS] confirm plus egg[SEP]',
'score': 0.0033525123726576567,
'token': 6387,
'token_str': '▁egg'},
{'sequence': '[CLS] confirm plus bet[SEP]',
'score': 0.0008760977652855217,
'token': 5676,
'token_str': '▁bet'}]

```

Here is how to use this model to get the features of a given text in PyTorch:
```python
from transformers import AlbertTokenizer, AlbertModel
tokenizer = AlbertTokenizer.from_pretrained('zanelim/singbert-lite-sg')
model = AlbertModel.from_pretrained("zanelim/singbert-lite-sg")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
```

and in TensorFlow:
```python
from transformers import AlbertTokenizer, TFAlbertModel
tokenizer = AlbertTokenizer.from_pretrained("zanelim/singbert-lite-sg")
model = TFAlbertModel.from_pretrained("zanelim/singbert-lite-sg")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='tf')
output = model(encoded_input)
```

#### Limitations and bias
This model was finetuned on colloquial Singlish and Manglish corpus, hence it is best applied on downstream tasks involving the main
constituent languages- english, mandarin, malay. Also, as the training data is mainly from forums, beware of existing inherent bias.

## Training data
Colloquial singlish and manglish (both are a mixture of English, Mandarin, Tamil, Malay, and other local dialects like Hokkien, Cantonese or Teochew)
corpus. The corpus is collected from subreddits- `r/singapore` and `r/malaysia`, and forums such as `hardwarezone`.

## Training procedure

Initialized with [albert base v2](https://github.com/google-research/albert#albert) vocab and checkpoints (pre-trained weights).

Pre-training was further finetuned on training data with the following hyperparameters
* train_batch_size: 4096
* max_seq_length: 128
* num_train_steps: 125000
* num_warmup_steps: 5000
* learning_rate: 0.00176
* hardware: TPU v3-8
2 changes: 1 addition & 1 deletion model_cards/zanelim/singbert/README.md
@@ -13,8 +13,8 @@ datasets:
- reddit singapore, malaysia
- hardwarezone
widget:
- text: "die [MASK] must try"
- text: "kopi c siew [MASK]"
- text: "die [MASK] must try"
---

# Model name
4 changes: 2 additions & 2 deletions notebooks/03-pipelines.ipynb
@@ -2358,7 +2358,7 @@
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/huggingface/transformers/blob/generation_pipeline_docs/notebooks/03-pipelines.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
"<a href=\"https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/03-pipelines.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
@@ -3402,4 +3402,4 @@
]
}
]
}
}
2 changes: 2 additions & 0 deletions src/transformers/__init__.py
@@ -200,6 +200,7 @@
from .data.data_collator import (
DataCollator,
DataCollatorForLanguageModeling,
DataCollatorForNextSentencePrediction,
DataCollatorForPermutationLanguageModeling,
DataCollatorWithPadding,
default_data_collator,
@@ -211,6 +212,7 @@
SquadDataset,
SquadDataTrainingArguments,
TextDataset,
TextDatasetForNextSentencePrediction,
)
from .generation_utils import top_k_top_p_filtering
from .modeling_albert import (
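The two new exports (`TextDatasetForNextSentencePrediction` and `DataCollatorForNextSentencePrediction`) suggest a BERT-style MLM plus NSP pretraining setup roughly like the hedged sketch below. The input file format (one sentence per line, blank lines between documents), the `corpus.txt` path, and the argument choices are assumptions, not code from this PR:

```python
from transformers import (
    BertForPreTraining,
    BertTokenizer,
    DataCollatorForNextSentencePrediction,
    TextDatasetForNextSentencePrediction,
    Trainer,
    TrainingArguments,
)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForPreTraining.from_pretrained("bert-base-uncased")

# Assumed input: a plain-text file with one sentence per line and blank lines
# separating documents (the convention of BERT's original pretraining data).
dataset = TextDatasetForNextSentencePrediction(
    tokenizer=tokenizer,
    file_path="corpus.txt",   # hypothetical path
    block_size=128,
)
collator = DataCollatorForNextSentencePrediction(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="nsp-pretraining", logging_steps=500),
    data_collator=collator,
    train_dataset=dataset,
)
trainer.train()
```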