diff --git a/src/transformers/configuration_xlnet.py b/src/transformers/configuration_xlnet.py
index 05eda6010cc3ff..4a39d130c36b92 100644
--- a/src/transformers/configuration_xlnet.py
+++ b/src/transformers/configuration_xlnet.py
@@ -15,8 +15,6 @@
 # limitations under the License.
 """ XLNet configuration """
 
-import warnings
-
 from .configuration_utils import PretrainedConfig
 from .utils import logging
 
@@ -144,7 +142,7 @@ def __init__(
         initializer_range=0.02,
         layer_norm_eps=1e-12,
         dropout=0.1,
-        mem_len=None,
+        mem_len=512,
         reuse_len=None,
         bi_data=False,
         clamp_len=-1,
@@ -198,17 +196,6 @@ def __init__(
         self.pad_token_id = pad_token_id
         self.eos_token_id = eos_token_id
 
-        if mem_len is None or mem_len == 0:
-            warnings.warn(
-                "This config doesn't use attention memories, a core feature of XLNet."
-                " Consider setting `mem_len` to a non-zero value, for example "
-                "`xlnet = XLNetLMHeadModel.from_pretrained('xlnet-base-cased'', mem_len=1024)`,"
-                " for accurate training performance as well as an order of magnitude faster inference."
-                " Starting from version 3.5.0, the default parameter will be 1024, following"
-                " the implementation in https://arxiv.org/abs/1906.08237",
-                FutureWarning,
-            )
-
     @property
     def max_position_embeddings(self):
         return -1
diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py
index fd3113fa263ffe..6405cd13532fb0 100755
--- a/src/transformers/modeling_xlnet.py
+++ b/src/transformers/modeling_xlnet.py
@@ -16,8 +16,6 @@
 """ PyTorch XLNet model.
 """
 
-
-
 from dataclasses import dataclass
 from typing import List, Optional, Tuple
 
@@ -1087,6 +1085,7 @@ def forward(
         output_hidden_states=None,
         return_dict=None,
     ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
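
Note, not part of the patch: a minimal sketch of the user-facing effect, assuming the top-level transformers package exports XLNetConfig and XLNetLMHeadModel as usual. With this change a freshly built config carries mem_len=512 and the removed FutureWarning no longer fires; the modeling hunk mirrors the existing output_hidden_states fallback so output_attentions now also falls back to the config value when left as None.

from transformers import XLNetConfig, XLNetLMHeadModel

config = XLNetConfig()
print(config.mem_len)  # 512 with this patch; previously None, which triggered the FutureWarning

# Overriding the default still works as before; 1024 is only an illustrative
# value taken from the old warning text, not something the patch requires.
model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased", mem_len=1024)
print(model.config.mem_len)  # 1024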