============================================================ test session starts ============================================================
platform linux -- Python 3.8.10, pytest-7.2.0, pluggy-1.0.0
rootdir: /home/adit299/optimum, configfile: setup.cfg
plugins: xdist-3.1.0
collected 55 items

tests/bettertransformer/test_bettertransformer_encoder.py ssss...F............................................... [100%]

================================================================= FAILURES ==================================================================
_________________________________________________ BetterTransformersEncoderTest.test_logits _________________________________________________

self =
models_to_test = ['hf-internal-testing/tiny-random-AlbertModel', 'hf-internal-testing/tiny-random-BertModel', 'hf-internal-testing/tiny...2VecTextModel', 'hf-internal-testing/tiny-random-DistilBertModel', 'hf-internal-testing/tiny-random-ElectraModel', ...]
preprocessor_kwargs = {}, model_id = 'hf-internal-testing/tiny-random-ProphetNetModel'
inputs = {'attention_mask': tensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0]]), 'input_ids': tensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]])}
hf_random_model = ProphetNetModel( (word_embeddings): Embedding(30522, 16, padding_idx=0) (encoder): ProphetNetEncoder( (word_em...wise_affine=True) ) ) (embeddings_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) ) )
random_config = ProphetNetConfig { "_name_or_path": "hf-internal-testing/tiny-random-ProphetNetModel", "activation_dropout": 0.1, ...ce": 128, "torch_dtype": "float32", "transformers_version": "4.25.1", "use_cache": true, "vocab_size": 30522 }
converted_model = ProphetNetModel( (word_embeddings): Embedding(30522, 16, padding_idx=0) (encoder): ProphetNetEncoder( (word_em...wise_affine=True) ) ) (embeddings_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) ) )
hf_hidden_states = tensor([[[-1.1544, 1.5824, 0.2771, 1.1016, -1.8081, -0.2683, -2.3076, 1.2889, -1.8794, 0.5597, 0.0750,... 0.5560, 1.2087, 0.1141, -0.6267, 0.2556, -1.1783, 0.9775, 0.3378, -0.4853, 0.1481, -0.5173]]])
bt_hidden_states = tensor([[[-1.1544, 1.5824, 0.2771, 1.1016, -1.8081, -0.2683, -2.3076, 1.2889, -1.8794, 0.5597, 0.0750,... 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]])

    def test_logits(self, models_to_test: Optional[List] = None, **preprocessor_kwargs):
        r"""
        This tests if the converted model produces the same logits than the original model.
        """
        # The first row of the attention mask needs to be all ones -> check: https://github.com/pytorch/pytorch/blob/19171a21ee8a9cc1a811ac46d3abd975f0b6fc3b/test/test_nn.py#L5283
        if models_to_test is None:
            models_to_test = self.all_models_to_test

        for model_id in models_to_test:
            inputs = self.prepare_inputs_for_class(model_id=model_id, **preprocessor_kwargs)

            torch.manual_seed(0)
            hf_random_model = AutoModel.from_pretrained(model_id).eval()
            random_config = hf_random_model.config

            torch.manual_seed(0)
            converted_model = BetterTransformer.transform(hf_random_model, keep_original_model=True)

            self.assertFalse(
                hasattr(hf_random_model, "use_bettertransformer"),
                f"The model {hf_random_model.__class__.__name__} has been converted to a `fast` model by mistake.",
            )

            with torch.no_grad():
                r"""
                Make sure the models are in eval mode! Make also sure that the original model
                has not been converted to a fast model. The check is done above.
                """
                torch.manual_seed(0)
>               hf_hidden_states = hf_random_model(**inputs)[0]

tests/bettertransformer/testing_bettertransformer_utils.py:73:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
.env/lib/python3.8/site-packages/torch/nn/modules/module.py:1194: in _call_impl
    return forward_call(*input, **kwargs)
.env/lib/python3.8/site-packages/transformers/models/prophetnet/modeling_prophetnet.py:1865: in forward
    decoder_outputs = self.decoder(
.env/lib/python3.8/site-packages/torch/nn/modules/module.py:1194: in _call_impl
    return forward_call(*input, **kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = ProphetNetDecoder( (word_embeddings): Embedding(30522, 16, padding_idx=0) (position_embeddings): ProphetNetPositio...5, elementwise_affine=True) ) ) (embeddings_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True) )
input_ids = None, attention_mask = None
encoder_hidden_states = tensor([[[-0.1677, 1.3368, -1.4649, -0.1629, 0.6566, 1.0974, -1.1565, 1.7724, 1.1156, -0.2349, -0.3339,....7326, 1.1123, 0.8817, -1.4288, -0.0795, 0.9194, 1.3962, 1.1239, -1.3174, -0.9091, -0.5158]]])
encoder_attention_mask = tensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0]]), head_mask = None, cross_attn_head_mask = None
past_key_values = None, inputs_embeds = None, use_cache = True, output_attentions = False, output_hidden_states = False, return_dict = True

    @add_start_docstrings_to_model_forward(PROPHETNET_STANDALONE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=ProphetNetDecoderModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ProphetNetDecoderModelOutput]:
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`).

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

        Returns:

        Example:

        ```python
        >>> from transformers import ProphetNetTokenizer, ProphetNetDecoder
        >>> import torch

        >>> tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetDecoder.from_pretrained("microsoft/prophetnet-large-uncased", add_cross_attention=False)
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        ```"""
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is None and inputs_embeds is None:
>           raise ValueError("Either `decoder_input_ids` or `decoder_inputs_embeds` has to be passed.")
E           ValueError: Either `decoder_input_ids` or `decoder_inputs_embeds` has to be passed.

.env/lib/python3.8/site-packages/transformers/models/prophetnet/modeling_prophetnet.py:1504: ValueError
============================================================= warnings summary ==============================================================
tests/bettertransformer/test_bettertransformer_encoder.py::BetterTransformersEncoderTest::test_inference_speed
  /home/adit299/optimum/.env/src/optimum/optimum/bettertransformer/models/encoder_models.py:207: UserWarning: The PyTorch API of nested tensors is in prototype stage and will change in the near future. (Triggered internally at ../aten/src/ATen/NestedTensorImpl.cpp:175.)
    hidden_states = torch._nested_tensor_from_mask(hidden_states, ~attention_mask)

tests/bettertransformer/test_bettertransformer_encoder.py::BetterTransformersEncoderTest::test_logits
  /home/adit299/optimum/.env/src/optimum/optimum/utils/testing_utils.py:22: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated since Python 3.3, and in 3.10 it will stop working
    if isinstance(v, collections.MutableMapping):

tests/bettertransformer/test_bettertransformer_encoder.py::BetterTransformersEncoderTest::test_raise_autocast
  /home/adit299/optimum/.env/lib/python3.8/site-packages/transformers/models/tapas/modeling_tapas.py:1788: UserWarning: scatter_reduce() is in beta and the API may change at any time. (Triggered internally at ../aten/src/ATen/native/TensorAdvancedIndexing.cpp:1615.)
    segment_means = out.scatter_reduce(

tests/bettertransformer/test_bettertransformer_encoder.py::BetterTransformersEncoderDecoderTest::test_logits_1_hf_internal_testing_tiny_random_bart_True
tests/bettertransformer/test_bettertransformer_encoder.py::BetterTransformersEncoderDecoderTest::test_logits_3_hf_internal_testing_tiny_random_FSMTModel_True
tests/bettertransformer/test_bettertransformer_encoder.py::BetterTransformersEncoderDecoderTest::test_logits_5_hf_internal_testing_tiny_random_mbart_True
tests/bettertransformer/test_bettertransformer_encoder.py::BetterTransformersEncoderDecoderTest::test_logits_7_hf_internal_testing_tiny_random_nllb_True
  /home/adit299/optimum/.env/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:2354: UserWarning: `max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.
    warnings.warn(

-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
========================================================== short test summary info ==========================================================
FAILED tests/bettertransformer/test_bettertransformer_encoder.py::BetterTransformersEncoderTest::test_logits - ValueError: Either `decoder_input_ids` or `decoder_inputs_embeds` has to be passed.
====================================== 1 failed, 50 passed, 4 skipped, 7 warnings in 349.05s (0:05:49) ======================================
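For context, the ValueError is raised by transformers before any BetterTransformer code runs: `AutoModel` loads `hf-internal-testing/tiny-random-ProphetNetModel` as a full encoder-decoder `ProphetNetModel`, whose forward pass also runs the decoder and therefore needs `decoder_input_ids` or `decoder_inputs_embeds`, while the inputs prepared for the encoder test only contain `input_ids` and `attention_mask`. Below is a minimal sketch of the situation; reusing the encoder ids as decoder inputs is just one assumed workaround, not necessarily how the Optimum test suite intends to handle ProphetNet.

```python
# Minimal reproduction sketch (assumes network access to the Hugging Face Hub).
import torch
from transformers import AutoModel

model_id = "hf-internal-testing/tiny-random-ProphetNetModel"
model = AutoModel.from_pretrained(model_id).eval()  # loads an encoder-decoder ProphetNetModel

input_ids = torch.tensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]])
attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0]])

with torch.no_grad():
    # What the test effectively does, and what raises:
    #   ValueError: Either `decoder_input_ids` or `decoder_inputs_embeds` has to be passed.
    # model(input_ids=input_ids, attention_mask=attention_mask)

    # Assumed workaround: also feed the decoder, e.g. by reusing the same ids.
    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=input_ids,
        decoder_attention_mask=attention_mask,
    )
    hidden_states = outputs[0]  # decoder last_hidden_state
```

Since the warnings summary shows bart, FSMTModel, mbart, and nllb being exercised through `BetterTransformersEncoderDecoderTest`, another option may be to run ProphetNet through that encoder-decoder test path (or its `prepare_inputs_for_class`) rather than the encoder-only one.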