diff --git a/.coveragerc b/.coveragerc
index 9b8c40ecf153d..e0d5674aa0e9e 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,5 +1,8 @@
 [run]
 source=pytorch_transformers
+omit =
+    # skip convertion scripts from testing for now
+    */convert_*
 [report]
 exclude_lines =
     pragma: no cover
diff --git a/.gitignore b/.gitignore
index 05129fc40276a..6bbe32df6c875 100644
--- a/.gitignore
+++ b/.gitignore
@@ -126,4 +126,5 @@ models
 proc_data
 
 # examples
+runs
 examples/runs
\ No newline at end of file
diff --git a/examples/run_glue.py b/examples/run_glue.py
index 59583ed712c8e..62d655ecc98fb 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -60,25 +60,14 @@
     'xlm': XLMTokenizer,
 }
 
-def train(args, train_features, model):
+def train(args, train_dataset, model):
     """ Train the model """
     if args.local_rank in [-1, 0]:
         tb_writer = SummaryWriter()
 
-    # Convert in tensors and build dataloader
-    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
-    if args.output_mode == "classification":
-        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
-    elif args.output_mode == "regression":
-        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
-
     args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
-
-    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-    train_sampler = RandomSampler(train_data) if args.local_rank == -1 else DistributedSampler(train_data)
-    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
 
     num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
 
@@ -109,19 +98,24 @@ def train(args, train_features, model):
 
     # Train!
     logger.info("***** Running training *****")
-    logger.info("  Num examples = %d", len(train_features))
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
     logger.info("  Batch size = %d", args.train_batch_size)
-    logger.info("  Num steps = %d", num_train_optimization_steps)
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", num_train_optimization_steps)
 
     global_step = 0
     tr_loss = 0
     model.train()
+    optimizer.zero_grad()
     for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
         for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
             batch = tuple(t.to(args.device) for t in batch)
-            input_ids, input_mask, segment_ids, label_ids = batch
-
-            ouputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
+            inputs = {'input_ids': batch[0],
+                      'attention_mask': batch[1],
+                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
+                      'labels': batch[3]}
+            ouputs = model(**inputs)
             loss = ouputs[0]
 
             if args.n_gpu > 1:
@@ -150,30 +144,20 @@ def train(args, train_features, model):
     return global_step, tr_loss / global_step
 
 
-def evalutate(args, eval_task, eval_output_dir, eval_features, model):
+def evalutate(args, eval_task, eval_output_dir, dataset, model):
     """ Evaluate the model """
     if os.path.exists(eval_output_dir) and os.listdir(eval_output_dir) and args.do_train and not args.overwrite_output_dir:
         raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(eval_output_dir))
     if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
         os.makedirs(eval_output_dir)
 
-    # Convert in tensors and build dataloader
-    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-    if args.output_mode == "classification":
-        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
-    elif args.output_mode == "regression":
-        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
-
-    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
     # Note that DistributedSampler samples randomly
-    eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
-    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+    eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
+    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
 
     # Eval!
     logger.info("***** Running evaluation *****")
-    logger.info("  Num examples = %d", len(eval_features))
+    logger.info("  Num examples = %d", len(dataset))
     logger.info("  Batch size = %d", args.eval_batch_size)
     model.eval()
     eval_loss = 0
@@ -214,36 +198,47 @@ def evalutate(args, eval_task, eval_output_dir, eval_features, model):
             logger.info("  %s = %s", key, str(result[key]))
             writer.write("%s = %s\n" % (key, str(result[key])))
 
+    return result
 
 
-def load_and_cache_examples(args, task, tokenizer, eval=False):
-    processor = processors[task]()
-    output_mode = output_modes[task]
-    label_list = processor.get_labels()
-    # Load and cache data
+def load_and_cache_examples(args, task, tokenizer, evaluate=False):
     processor = processors[task]()
-    examples = processor.get_dev_examples(args.data_dir)
-    cached_features_file = os.path.join(args.data_dir, '{}_{}_{}_{}'.format(
-        'dev' if eval else 'train',
+    output_mode = output_modes[task]
+    # Load data features from cache or dataset file
+    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
+        'dev' if evaluate else 'train',
         list(filter(None, args.model_name.split('/'))).pop(),
        str(args.max_seq_length),
        str(task)))
-
     if os.path.exists(cached_features_file):
         logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
     else:
-        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode)
+        logger.info("Creating features from dataset file at %s", args.data_dir)
+        label_list = processor.get_labels()
+        examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
         features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
-            cls_token_at_end=bool(args.model_type not in ['bert', 'xlm']),
+            cls_token_at_end=bool(args.model_type in ['xlnet']),  # xlnet has a cls token at the end
             cls_token=tokenizer.cls_token,
-            sep_token=tokenizer.sep_token, cls_token_segment_id=2,
-            pad_on_left=True, pad_token_segment_id=4)
-        if args.local_rank == -1 or torch.distributed.get_rank() == 0:
+            sep_token=tokenizer.sep_token,
+            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 1,
+            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
+            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
+        if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)
 
-    return features
+    # Convert to Tensors and build dataset
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    if output_mode == "classification":
+        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
+    elif output_mode == "regression":
+        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)
+
+    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+    return dataset
 
 
 def main():
@@ -350,10 +345,10 @@ def main():
         torch.distributed.barrier()
 
     args.model_type = args.model_name.lower().split('-')[0]
-    args.tokenizer_class = TOKENIZER_CLASSES[args.model_type]
-    args.model_class = MODEL_CLASSES[args.model_type]
-    tokenizer = args.tokenizer_class.from_pretrained(args.model_name, do_lower_case=args.do_lower_case)
-    model = args.model_class.from_pretrained(args.model_name, num_labels=num_labels)
+    tokenizer_class = TOKENIZER_CLASSES[args.model_type]
+    model_class = MODEL_CLASSES[args.model_type]
+    tokenizer = tokenizer_class.from_pretrained(args.model_name, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name, num_labels=num_labels)
 
     if args.local_rank == 0:
         torch.distributed.barrier()
@@ -372,23 +367,30 @@ def main():
 
     # Training
    if args.do_train:
-        train_features = load_and_cache_examples(args, args.task_name, tokenizer, eval=False)
-        global_step, tr_loss = train(args, train_features, model)
+        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
+        global_step, tr_loss = train(args, train_dataset, model)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
 
     # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Save a trained model, configuration and tokenizer
+        # Create output directory if needed
+        if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+            raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(args.output_dir)
+
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
         model.save_pretrained(args.output_dir)
-        tokenizer.save_vocabulary(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
 
         # Good practice: save your training arguments together with the trained model
         torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
 
         # Load a trained model and vocabulary that you have fine-tuned
-        model = args.model_class.from_pretrained(args.output_dir)
-        tokenizer = args.tokenizer_class.from_pretrained(args.output_dir)
+        model = model_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
         model.to(args.device)
 
     # Evaluation
@@ -398,9 +400,11 @@ def main():
         eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
 
         for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
-            eval_features = load_and_cache_examples(args, eval_task, tokenizer, eval=True)
+            eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
+
+            result = evalutate(args, eval_task, eval_output_dir, eval_dataset, model)
 
-            evalutate(args, eval_task, eval_output_dir, eval_features, model)
+    return result
 
 
 if __name__ == "__main__":
diff --git a/examples/test_examples.py b/examples/test_examples.py
index fada43dae2c7c..8284858a121c3 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -19,6 +19,7 @@
 import sys
 import unittest
 import argparse
+import logging
 
 try:
     # python 3.4+ can use builtin unittest.mock instead of mock package
@@ -26,7 +27,11 @@
 except ImportError:
     from mock import patch
 
-import run_bert_squad as rbs
+import run_glue
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
 
 def get_setup_file():
     parser = argparse.ArgumentParser()
@@ -36,12 +41,18 @@ def get_setup_file():
 
 class ExamplesTests(unittest.TestCase):
 
-    def test_run_squad(self):
-        testargs = ["prog", "-f", "/home/test/setup.py"]
-        with patch.object(sys, 'argv', testargs):
-            setup = get_setup_file()
-            assert setup == "/home/test/setup.py"
-        # rbs.main()
+    def test_run_glue(self):
+        stream_handler = logging.StreamHandler(sys.stdout)
+        logger.addHandler(stream_handler)
+
+        testargs = ["run_glue.py", "--data_dir=./examples/tests_samples/MRPC/",
+                    "--task_name=mrpc", "--do_train", "--do_eval", "--output_dir=./examples/tests_samples/temp_dir",
+                    "--train_batch_size=4", "--eval_batch_size=2", "--num_train_epochs=2.0", "--overwrite_output_dir"]
+        model_name = "--model_name=xlnet-large-cased"
+        with patch.object(sys, 'argv', testargs + [model_name]):
+            result = run_glue.main()
+            for value in result.values():
+                self.assertGreaterEqual(value, 0.75)
 
 
 if __name__ == "__main__":
diff --git a/examples/tests_samples/.gitignore b/examples/tests_samples/.gitignore
new file mode 100644
index 0000000000000..1ac7520522849
--- /dev/null
+++ b/examples/tests_samples/.gitignore
@@ -0,0 +1,5 @@
+*.*
+cache*
+temp*
+!*.tsv
+!.gitignore
\ No newline at end of file
diff --git a/examples/tests_samples/MRPC/dev.tsv b/examples/tests_samples/MRPC/dev.tsv
new file mode 100644
index 0000000000000..5b814856c63f4
--- /dev/null
+++ b/examples/tests_samples/MRPC/dev.tsv
@@ -0,0 +1,7 @@
+Quality	#1 ID	#2 ID	#1 String	#2 String
+1	1355540	1355592	He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .	" The foodservice pie business does not fit our long-term growth strategy .
+0	2029631	2029565	Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .	His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
+0	487993	487952	The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .	The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
+1	1989515	1989458	The AFL-CIO is waiting until October to decide if it will endorse a candidate .	The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
+0	1783137	1782659	No dates have been set for the civil or the criminal trial .	No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
+1	3039165	3039036	Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .	It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
diff --git a/examples/tests_samples/MRPC/train.tsv b/examples/tests_samples/MRPC/train.tsv
new file mode 100644
index 0000000000000..5b814856c63f4
--- /dev/null
+++ b/examples/tests_samples/MRPC/train.tsv
@@ -0,0 +1,7 @@
+Quality	#1 ID	#2 ID	#1 String	#2 String
+1	1355540	1355592	He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .	" The foodservice pie business does not fit our long-term growth strategy .
+0	2029631	2029565	Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .	His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
+0	487993	487952	The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .	The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
+1	1989515	1989458	The AFL-CIO is waiting until October to decide if it will endorse a candidate .	The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
+0	1783137	1782659	No dates have been set for the civil or the criminal trial .	No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
+1	3039165	3039036	Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .	It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index 0dd72b2969672..ea9502d2efd32 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -28,7 +28,6 @@
 from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .file_utils import cached_path
 from .modeling_utils import WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig, PreTrainedModel, prune_linear_layer
 
 logger = logging.getLogger(__name__)
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 9340ce84895ea..7fefbefeaef28 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -30,7 +30,6 @@
 from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
-from .file_utils import cached_path
 from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
                              PreTrainedModel, prune_conv1d_layer, SequenceSummary)
 from .modeling_bert import BertLayerNorm as LayerNorm
@@ -122,9 +121,8 @@ def __init__(
         predict_special_tokens=True,
         summary_type='token_ids',
         summary_use_proj=True,
-        summary_num_classes=1,
         summary_activation=None,
-        summary_dropout=0.1,
+        summary_first_dropout=0.1,
         **kwargs
     ):
         """Constructs GPT2Config.
@@ -172,9 +170,8 @@ def __init__(
             self.predict_special_tokens = predict_special_tokens
             self.summary_type = summary_type
             self.summary_use_proj = summary_use_proj
-            self.summary_num_classes = summary_num_classes
             self.summary_activation = summary_activation
-            self.summary_dropout = summary_dropout
+            self.summary_first_dropout = summary_first_dropout
         else:
             raise ValueError(
                 "First argument must be either a vocabulary size (int)"
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index 4a3ff732f6c46..c99df420356e8 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -30,9 +30,8 @@
 from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
-from .file_utils import cached_path
 from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
-                              PreTrainedModel, prune_conv1d_layer, SequenceSummary)
+                             PreTrainedModel, prune_conv1d_layer, SequenceSummary)
 from .modeling_bert import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
@@ -150,9 +149,8 @@ def __init__(
         predict_special_tokens=True,
         summary_type='token_ids',
         summary_use_proj=True,
-        summary_num_classes=1,
         summary_activation=None,
-        summary_dropout=0.1,
+        summary_first_dropout=0.1,
         **kwargs
     ):
         """Constructs OpenAIGPTConfig.
@@ -203,9 +201,8 @@ def __init__(
             self.predict_special_tokens = predict_special_tokens
             self.summary_type = summary_type
             self.summary_use_proj = summary_use_proj
-            self.summary_num_classes = summary_num_classes
             self.summary_activation = summary_activation
-            self.summary_dropout = summary_dropout
+            self.summary_first_dropout = summary_first_dropout
         else:
             raise ValueError(
                 "First argument must be either a vocabulary size (int)"
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index 35a1b635f9127..0c5d127d62d26 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -36,7 +36,6 @@
 
 from .modeling_bert import BertLayerNorm as LayerNorm
 from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
-from .file_utils import cached_path
 from .modeling_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel
 
 logger = logging.getLogger(__name__)
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index b9be1a3813323..36b506da3b3a3 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -25,7 +25,7 @@
 
 import torch
 from torch import nn
-from torch.nn import CrossEntropyLoss, MSELoss, functional as F
+from torch.nn import CrossEntropyLoss, functional as F
 
 from .file_utils import cached_path
 
@@ -514,10 +514,10 @@ class SequenceSummary(nn.Module):
            - 'token_ids' => supply a Tensor of classification token indices (GPT/GPT-2)
            - 'attn' => Not implemented now, use multi-head attention
         summary_use_proj: Add a projection after the vector extraction
-        summary_num_classes: If > 0: the projection outputs to n classes (otherwise to hidden_size)
-        summary_activation:
-            'tanh' => add a tanh activation to the output
-            None => no activation
+        summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
+        summary_activation: 'tanh' => add a tanh activation to the output, Other => no activation. Default
+        summary_first_dropout: Add a dropout before the projection and activation
+        summary_last_dropout: Add a dropout after the projection and activation
     """
 
     def __init__(self, config):
@@ -531,8 +531,8 @@ def __init__(self, config):
 
         self.summary = nn.Identity()
         if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
-            if hasattr(config, 'summary_num_classes') and config.summary_num_classes > 0:
-                num_classes = config.summary_num_classes
+            if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0:
+                num_classes = config.num_labels
             else:
                 num_classes = config.hidden_size
             self.summary = nn.Linear(config.hidden_size, num_classes)
@@ -541,7 +541,13 @@ def __init__(self, config):
         if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
             self.activation = nn.Tanh()
 
-        self.dropout = nn.Dropout(config.summary_dropout)
+        self.first_dropout = nn.Identity()
+        if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0:
+            self.first_dropout = nn.Dropout(config.summary_first_dropout)
+
+        self.last_dropout = nn.Identity()
+        if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
+            self.last_dropout = nn.Dropout(config.summary_last_dropout)
 
     def forward(self, hidden_states, token_ids=None):
         """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
@@ -567,9 +573,10 @@ def forward(self, hidden_states, token_ids=None):
         elif self.summary_type == 'attn':
             raise NotImplementedError
 
+        output = self.first_dropout(output)
         output = self.summary(output)
         output = self.activation(output)
-        output = self.dropout(output)
+        output = self.last_dropout(output)
 
         return output
 
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
index c7ea294dbd419..65db9e7159134 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -14,18 +14,14 @@
 # limitations under the License.
 """ PyTorch XLM model.
 """
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 import json
 import logging
 import math
-import os
 import sys
 from io import open
 
-import math
 import itertools
 import numpy as np
 
@@ -34,9 +30,8 @@
 from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .file_utils import cached_path
-from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel,
-                             prune_linear_layer, SequenceSummary, SQuADHead)
+from .modeling_utils import (PretrainedConfig, PreTrainedModel,
+                             prune_linear_layer, SequenceSummary, SQuADHead)
 
 logger = logging.getLogger(__name__)
 
@@ -79,10 +74,11 @@ def __init__(self,
 
                  finetuning_task=None,
                  num_labels=2,
-                 summary_type='last',
+                 summary_type='first',
                  summary_use_proj=True,
-                 summary_activation='tanh',
-                 summary_dropout=0.1,
+                 summary_activation=None,
+                 summary_proj_to_labels=True,
+                 summary_first_dropout=0.1,
                  start_n_top=5,
                  end_n_top=5,
                  **kwargs):
@@ -164,7 +160,8 @@ def __init__(self,
             self.summary_type = summary_type
             self.summary_use_proj = summary_use_proj
             self.summary_activation = summary_activation
-            self.summary_dropout = summary_dropout
+            self.summary_proj_to_labels = summary_proj_to_labels
+            self.summary_first_dropout = summary_first_dropout
             self.start_n_top = start_n_top
             self.end_n_top = end_n_top
         else:
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index 628dbe7450863..e0b3fb066164c 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -31,9 +31,8 @@
 from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .file_utils import cached_path
 from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel,
-                              SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits)
+                             SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits)
 
 logger = logging.getLogger(__name__)
 
@@ -227,7 +226,7 @@ def __init__(self,
                  summary_type='last',
                  summary_use_proj=True,
                  summary_activation='tanh',
-                 summary_dropout=0.1,
+                 summary_last_dropout=0.1,
                  start_n_top=5,
                  end_n_top=5,
                  **kwargs):
@@ -314,7 +313,7 @@ def __init__(self,
             self.summary_type = summary_type
             self.summary_use_proj = summary_use_proj
             self.summary_activation = summary_activation
-            self.summary_dropout = summary_dropout
+            self.summary_last_dropout = summary_last_dropout
             self.start_n_top = start_n_top
             self.end_n_top = end_n_top
         else:
diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index 3e14673f4645c..1235d6f3cf82b 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -113,8 +113,6 @@ def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never
             raise ValueError(
                 "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                 "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
-        if never_split is None:
-            never_split = self.all_special_tokens
         self.vocab = load_vocab(vocab_file)
         self.ids_to_tokens = collections.OrderedDict(
             [(ids, tok) for tok, ids in self.vocab.items()])
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 60081893c8f67..a84b8d6f44c03 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -142,11 +142,7 @@ def __init__(self, max_len=None, **kwargs):
         self.added_tokens_decoder = {}
 
         for key, value in kwargs.items():
-            if key not in self.SPECIAL_TOKENS_ATTRIBUTES:
-                raise ValueError(
-                    "PreTrainedTokenizer.__init__() argument {} should be in {}".format(
-                        key, ', '.join(self.SPECIAL_TOKENS_ATTRIBUTES)))
-            else:
+            if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                 setattr(self, key, value)
 
 
diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py
index 8a11a84f8c69c..885145582941f 100644
--- a/pytorch_transformers/tokenization_xlm.py
+++ b/pytorch_transformers/tokenization_xlm.py
@@ -20,13 +20,9 @@
 import logging
 import os
 import re
-import sys
 from io import open
 
-from tqdm import tqdm
-
-from .file_utils import cached_path
-from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer
 from .tokenization_bert import BasicTokenizer
 
 logger = logging.getLogger(__name__)