============================================================ test session starts ============================================================
platform linux -- Python 3.8.10, pytest-7.2.0, pluggy-1.0.0
rootdir: /home/adit299/optimum, configfile: setup.cfg
plugins: xdist-3.1.0
collected 57 items / 46 deselected / 11 selected

tests/bettertransformer/test_bettertransformer_encoder.py .........FF                                                                  [100%]

================================================================= FAILURES ==================================================================
_______________ BetterTransformersEncoderDecoderTest.test_logits_8_hf_internal_testing_tiny_random_ProphetNetModel_max_length _______________

a = (,)

    @wraps(func)
    def standalone_func(*a):
>       return func(*(a + p.args), **p.kwargs)

.env/lib/python3.8/site-packages/parameterized/parameterized.py:533:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
tests/bettertransformer/test_bettertransformer_encoder.py:291: in test_logits
    super().test_logits([model_id], padding=padding, max_length=max_length)
tests/bettertransformer/testing_bettertransformer_utils.py:76: in test_logits
    bt_hidden_states = converted_model(**inputs)[0]
.env/lib/python3.8/site-packages/torch/nn/modules/module.py:1194: in _call_impl
    return forward_call(*input, **kwargs)
.env/lib/python3.8/site-packages/transformers/models/prophetnet/modeling_prophetnet.py:1872: in forward
    decoder_outputs = self.decoder(
.env/lib/python3.8/site-packages/torch/nn/modules/module.py:1194: in _call_impl
    return forward_call(*input, **kwargs)
.env/lib/python3.8/site-packages/transformers/models/prophetnet/modeling_prophetnet.py:1629: in forward
    layer_outputs = decoder_layer(
.env/lib/python3.8/site-packages/torch/nn/modules/module.py:1194: in _call_impl
    return forward_call(*input, **kwargs)
.env/lib/python3.8/site-packages/transformers/models/prophetnet/modeling_prophetnet.py:1229: in forward
    attention_output, cross_attn_weights, cross_attn_present_key_value = self.cross_attn(
.env/lib/python3.8/site-packages/torch/nn/modules/module.py:1194: in _call_impl
    return forward_call(*input, **kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = ProphetNetAttention( (key_proj): Linear(in_features=16, out_features=16, bias=True) (value_proj): Linear(in_featur...: Linear(in_features=16, out_features=16, bias=True) (out_proj): Linear(in_features=16, out_features=16, bias=True) )
hidden_states = tensor([[[ 0.5384, 1.7635, 0.0502, ..., -0.8135, -1.1405, 1.5431], [ 0.7601, -2.5271, -0.2412, ..., 0.9...18, 1.0402, ..., 0.0577, 0.6506, -2.2772], [-1.7727, -0.0801, 1.0287, ..., 0.0473, 0.8288, -0.4027]]])
key_value_states = tensor([[[ 0.2246, 1.5394, -0.5484, -0.0670, -0.7055, 2.2053, -0.5040, 0.3666, 0.6893, -0.8401, -1.5722,....0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]])
attention_mask = tensor([[[-0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00, -3.4028e+38, -3.4028e+38, -3.4028...8, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38]]])
layer_head_mask = None
past_key_value = (tensor([[[[-0.0483, -0.0920, 0.0146, 0.0377], [ 0.1494, -0.0396, 0.0556, -0.0984], [ 0.0745, ... -0.0719, -0.0365], [ 0.1874, -0.0357, -0.0613, -0.0665], [ 0.0000, 0.0000, 0.0000, 0.0000]]]]))
output_attentions = False

    def forward(
        self,
        hidden_states,
        key_value_states: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        layer_head_mask: Optional[Tensor] = None,
        past_key_value: Optional[Tuple[Tensor]] = None,
        output_attentions: bool = False,
    ) -> Tuple[Tensor, Optional[Tensor]]:
        batch_size, tgt_len, hidden_size = hidden_states.size()
        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None
        assert list(hidden_states.size()) == [
            batch_size,
            tgt_len,
            hidden_size,
        ], f"Size of hidden states should be {batch_size, tgt_len, hidden_size}, but is {hidden_states.size()}"
        # previous time steps are cached - no need to recompute key and value if they are static
        query_states = self.query_proj(hidden_states) / (self.head_dim**0.5)
        if is_cross_attention and past_key_value is not None:
            # reuse k,v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.key_proj(key_value_states), -1, batch_size)
            value_states = self._shape(self.value_proj(key_value_states), -1, batch_size)
        else:
            # self_attention
            key_states = self._shape(self.key_proj(hidden_states), -1, batch_size)
            value_states = self._shape(self.value_proj(hidden_states), -1, batch_size)
        if is_cross_attention:
            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
            # Further calls to cross_attention layer can then reuse all cross-attention
            # key/value_states (first "if" case)
            # if encoder bi-directional self-attention `past_key_value` is always `None`
            past_key_value = (key_states, value_states)
        # project states into the correct shape
        proj_shape = (batch_size * self.num_attn_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, batch_size).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)
        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
        assert attn_weights.size() == (
            batch_size * self.num_attn_heads,
            tgt_len,
            src_len,
        ), (
            f"`attn_weights` should be of size {batch_size * self.num_attn_heads, tgt_len, src_len}, but is of size"
            f" {attn_weights.shape}"
        )
        # This is part of a workaround to get around fork/join parallelism not supporting Optional types.
        if attention_mask is not None and attention_mask.dim() == 0:
            attention_mask = None
>       assert attention_mask is None or attention_mask.size() == (
            self.num_attn_heads * batch_size,
            1,
            src_len,
        ), (
            "`attention_mask` should be `None` or of shape attention_mask.size() =="
            f" {batch_size * self.num_attn_heads, 1, src_len}, but is {attention_mask.shape}"
        )
E       AssertionError: `attention_mask` should be `None` or of shape attention_mask.size() == (8, 1, 4), but is torch.Size([8, 1, 20])

.env/lib/python3.8/site-packages/transformers/models/prophetnet/modeling_prophetnet.py:725: AssertionError
----------------------------------------------------------- Captured stdout call ------------------------------------------------------------
self.embed_dim: 16
self.in_proj_weight: torch.Size([48, 16])
self.embed_dim: 16
self.in_proj_weight: torch.Size([48, 16])
self.embed_dim: 16
self.in_proj_weight: torch.Size([48, 16])
self.embed_dim: 16
self.in_proj_weight: torch.Size([48, 16])
__________________ BetterTransformersEncoderDecoderTest.test_logits_9_hf_internal_testing_tiny_random_ProphetNetModel_True __________________

a = (,)

    @wraps(func)
    def standalone_func(*a):
>       return func(*(a + p.args), **p.kwargs)

.env/lib/python3.8/site-packages/parameterized/parameterized.py:533:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
tests/bettertransformer/test_bettertransformer_encoder.py:291: in test_logits
    super().test_logits([model_id], padding=padding, max_length=max_length)
tests/bettertransformer/testing_bettertransformer_utils.py:95: in test_logits
    self.assert_equal(
tests/bettertransformer/testing_bettertransformer_utils.py:110: in assert_equal
    self.assertTrue(
E   AssertionError: False is not true : The BetterTransformer converted model does not produce the same logits as the original model. Failed for the model ProphetNetModel. Maxdiff: 0.003855198621749878
----------------------------------------------------------- Captured stdout call ------------------------------------------------------------
self.embed_dim: 16
self.in_proj_weight: torch.Size([48, 16])
self.embed_dim: 16
self.in_proj_weight: torch.Size([48, 16])
self.embed_dim: 16
self.in_proj_weight: torch.Size([48, 16])
self.embed_dim: 16
self.in_proj_weight: torch.Size([48, 16])
============================================================= warnings summary ==============================================================
tests/bettertransformer/test_bettertransformer_encoder.py::BetterTransformersEncoderTest::test_logits
  /home/adit299/optimum/.env/src/optimum/optimum/bettertransformer/models/encoder_models.py:102: UserWarning: The PyTorch API of nested tensors is in prototype stage and will change in the near future. (Triggered internally at ../aten/src/ATen/NestedTensorImpl.cpp:175.)
    hidden_states = torch._nested_tensor_from_mask(hidden_states, ~attention_mask)

tests/bettertransformer/test_bettertransformer_encoder.py::BetterTransformersEncoderTest::test_logits
  /home/adit299/optimum/.env/src/optimum/optimum/utils/testing_utils.py:22: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated since Python 3.3, and in 3.10 it will stop working
    if isinstance(v, collections.MutableMapping):

tests/bettertransformer/test_bettertransformer_encoder.py::BetterTransformersEncoderTest::test_logits
  /home/adit299/optimum/.env/lib/python3.8/site-packages/transformers/models/tapas/modeling_tapas.py:1788: UserWarning: scatter_reduce() is in beta and the API may change at any time. (Triggered internally at ../aten/src/ATen/native/TensorAdvancedIndexing.cpp:1615.)
    segment_means = out.scatter_reduce(

tests/bettertransformer/test_bettertransformer_encoder.py::BetterTransformersEncoderDecoderTest::test_logits_1_hf_internal_testing_tiny_random_bart_True
tests/bettertransformer/test_bettertransformer_encoder.py::BetterTransformersEncoderDecoderTest::test_logits_3_hf_internal_testing_tiny_random_FSMTModel_True
tests/bettertransformer/test_bettertransformer_encoder.py::BetterTransformersEncoderDecoderTest::test_logits_5_hf_internal_testing_tiny_random_mbart_True
tests/bettertransformer/test_bettertransformer_encoder.py::BetterTransformersEncoderDecoderTest::test_logits_7_hf_internal_testing_tiny_random_nllb_True
tests/bettertransformer/test_bettertransformer_encoder.py::BetterTransformersEncoderDecoderTest::test_logits_9_hf_internal_testing_tiny_random_ProphetNetModel_True
  /home/adit299/optimum/.env/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:2354: UserWarning: `max_length` is ignored when `padding`=`True` and there is no truncation strategy. To pad to max length, use `padding='max_length'`.
    warnings.warn(

-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
========================================================== short test summary info ==========================================================
FAILED tests/bettertransformer/test_bettertransformer_encoder.py::BetterTransformersEncoderDecoderTest::test_logits_8_hf_internal_testing_tiny_random_ProphetNetModel_max_length - AssertionError: `attention_mask` should be `None` or of shape attention_mask.size() == (8, 1, 4), but is torch.Size([8, 1, 20])
FAILED tests/bettertransformer/test_bettertransformer_encoder.py::BetterTransformersEncoderDecoderTest::test_logits_9_hf_internal_testing_tiny_random_ProphetNetModel_True - AssertionError: False is not true : The BetterTransformer converted model does not produce the same logits as the original model. Failed...
========================================== 2 failed, 9 passed, 46 deselected, 8 warnings in 25.34s ==========================================
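
For reference, below is a minimal sketch of how one might reproduce the failing ProphetNet case outside the test suite. It is not taken from the tests above: the checkpoint id is inferred from the parameterized test name, and the tokenizer usage and decoder_input_ids handling are assumptions; only BetterTransformer.transform and the standard transformers loading APIs are relied on.

# Hypothetical reproduction sketch (not from the test suite above). Assumes the
# hf-internal-testing/tiny-random-ProphetNetModel checkpoint ships a tokenizer
# and that the installed optimum provides BetterTransformer.transform.
import torch
from transformers import AutoModel, AutoTokenizer
from optimum.bettertransformer import BetterTransformer

model_id = "hf-internal-testing/tiny-random-ProphetNetModel"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)
model.eval()

# Two inputs of different lengths so that padding is actually applied.
text = ["a short input", "a noticeably longer input that forces several padding tokens"]
inputs = tokenizer(text, return_tensors="pt", padding=True)
# ProphetNetModel is an encoder-decoder model, so decoder inputs are needed as
# well; reusing the encoder ids here is only a convenience for the sketch.
inputs["decoder_input_ids"] = inputs["input_ids"].clone()

with torch.no_grad():
    original_hidden = model(**inputs)[0]

# keep_original_model=True returns a converted copy and leaves `model` untouched.
converted = BetterTransformer.transform(model, keep_original_model=True)
with torch.no_grad():
    converted_hidden = converted(**inputs)[0]

# Mirrors test_logits_9 (padding=True): the two outputs should match, but the
# run above reports a max difference of ~3.9e-3 for ProphetNet.
print(torch.max(torch.abs(original_hidden - converted_hidden)))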