huggingface · sgugger · Nov 9, 2021 · Nov 5, 2021 · Nov 5, 2021 · Nov 6, 2021
diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py
@@ -171,7 +171,10 @@ def forward(
         # if key_value_states are provided this layer is used as a cross-attention layer
         # for the decoder
         is_cross_attention = key_value_states is not None
-        bsz, tgt_len, embed_dim = hidden_states.size()
+
+        # Ignore the last dimension of `hidden_states`; `self.embed_dim` is used instead,
+        # because the input's last dimension cannot be relied on under tensor parallelism.
+        bsz, tgt_len, _ = hidden_states.size()
 
         # get query proj
         query_states = self.q_proj(hidden_states) * self.scaling
@@ -257,7 +260,13 @@ def forward(
 
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)
-        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
+
+        # Use `self.embed_dim` rather than the last dimension of `hidden_states`:
+        # with tensor parallelism, attn_output can be partitioned across GPUs,
+        # in which case the embedding dimension of the input is not equal to
+        # the last dimension of the attention output after the heads are
+        # merged.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
 
         attn_output = self.out_proj(attn_output)
 

diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
@@ -1244,7 +1244,10 @@ def forward(
         # if key_value_states are provided this layer is used as a cross-attention layer
         # for the decoder
         is_cross_attention = key_value_states is not None
-        bsz, tgt_len, embed_dim = hidden_states.size()
+
+        # Ignore the last dimension of `hidden_states`; `self.embed_dim` is used instead,
+        # because the input's last dimension cannot be relied on under tensor parallelism.
+        bsz, tgt_len, _ = hidden_states.size()
 
         # get query proj
         query_states = self.q_proj(hidden_states) * self.scaling
@@ -1330,7 +1333,13 @@ def forward(
 
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)
-        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
+
+        # Use `self.embed_dim` rather than the last dimension of `hidden_states`:
+        # with tensor parallelism, attn_output can be partitioned across GPUs,
+        # in which case the embedding dimension of the input is not equal to
+        # the last dimension of the attention output after the heads are
+        # merged.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
 
         attn_output = self.out_proj(attn_output)
 

diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py
@@ -173,7 +173,10 @@ def forward(
         # if key_value_states are provided this layer is used as a cross-attention layer
         # for the decoder
         is_cross_attention = key_value_states is not None
-        bsz, tgt_len, embed_dim = hidden_states.size()
+
+        # Ignore the last dimension of `hidden_states`; `self.embed_dim` is used instead,
+        # because the input's last dimension cannot be relied on under tensor parallelism.
+        bsz, tgt_len, _ = hidden_states.size()
 
         # get query proj
         query_states = self.q_proj(hidden_states) * self.scaling
@@ -259,7 +262,13 @@ def forward(
 
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)
-        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
+
+        # Use `self.embed_dim` rather than the last dimension of `hidden_states`:
+        # with tensor parallelism, attn_output can be partitioned across GPUs,
+        # in which case the embedding dimension of the input is not equal to
+        # the last dimension of the attention output after the heads are
+        # merged.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
 
         attn_output = self.out_proj(attn_output)
 

diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
@@ -171,7 +171,10 @@ def forward(
         # if key_value_states are provided this layer is used as a cross-attention layer
         # for the decoder
         is_cross_attention = key_value_states is not None
-        bsz, tgt_len, embed_dim = hidden_states.size()
+
+        # Ignore the last dimension of `hidden_states`; `self.embed_dim` is used instead,
+        # because the input's last dimension cannot be relied on under tensor parallelism.
+        bsz, tgt_len, _ = hidden_states.size()
 
         # get query proj
         query_states = self.q_proj(hidden_states) * self.scaling
@@ -257,7 +260,13 @@ def forward(
 
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)
-        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
+
+        # Use `self.embed_dim` rather than the last dimension of `hidden_states`:
+        # with tensor parallelism, attn_output can be partitioned across GPUs,
+        # in which case the embedding dimension of the input is not equal to
+        # the last dimension of the attention output after the heads are
+        # merged.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
 
         attn_output = self.out_proj(attn_output)
 

diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py
@@ -388,7 +388,10 @@ def forward(
         # if key_value_states are provided this layer is used as a cross-attention layer
         # for the decoder
         is_cross_attention = key_value_states is not None
-        bsz, tgt_len, embed_dim = hidden_states.size()
+
+        # Ignore the last dimension of `hidden_states`; `self.embed_dim` is used instead,
+        # because the input's last dimension cannot be relied on under tensor parallelism.
+        bsz, tgt_len, _ = hidden_states.size()
 
         # get query proj
         query_states = self.q_proj(hidden_states) * self.scaling
@@ -474,7 +477,13 @@ def forward(
 
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)
-        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
+
+        # Use `self.embed_dim` rather than the last dimension of `hidden_states`:
+        # with tensor parallelism, attn_output can be partitioned across GPUs,
+        # in which case the embedding dimension of the input is not equal to
+        # the last dimension of the attention output after the heads are
+        # merged.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
 
         attn_output = self.out_proj(attn_output)
 

diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py
@@ -242,7 +242,10 @@ def forward(
         # if key_value_states are provided this layer is used as a cross-attention layer
         # for the decoder
         is_cross_attention = key_value_states is not None
-        bsz, tgt_len, embed_dim = hidden_states.size()
+
+        # Ignore the last dimension of `hidden_states`; `self.embed_dim` is used instead,
+        # because the input's last dimension cannot be relied on under tensor parallelism.
+        bsz, tgt_len, _ = hidden_states.size()
 
         # get query proj
         query_states = self.q_proj(hidden_states) * self.scaling
@@ -328,7 +331,13 @@ def forward(
 
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)
-        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
+
+        # Use `self.embed_dim` rather than the last dimension of `hidden_states`:
+        # with tensor parallelism, attn_output can be partitioned across GPUs,
+        # in which case the embedding dimension of the input is not equal to
+        # the last dimension of the attention output after the heads are
+        # merged.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
 
         attn_output = self.out_proj(attn_output)
 

diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py
@@ -188,7 +188,10 @@ def forward(
         # if key_value_states are provided this layer is used as a cross-attention layer
         # for the decoder
         is_cross_attention = key_value_states is not None
-        bsz, tgt_len, embed_dim = hidden_states.size()
+
+        # Ignore the last dimension of `hidden_states`; `self.embed_dim` is used instead,
+        # because the input's last dimension cannot be relied on under tensor parallelism.
+        bsz, tgt_len, _ = hidden_states.size()
 
         # get query proj
         query_states = self.q_proj(hidden_states) * self.scaling
@@ -274,7 +277,13 @@ def forward(
 
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)
-        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
+
+        # Use `self.embed_dim` rather than the last dimension of `hidden_states`:
+        # with tensor parallelism, attn_output can be partitioned across GPUs,
+        # in which case the embedding dimension of the input is not equal to
+        # the last dimension of the attention output after the heads are
+        # merged.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
 
         attn_output = self.out_proj(attn_output)
 

diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py
@@ -177,7 +177,10 @@ def forward(
         # if key_value_states are provided this layer is used as a cross-attention layer
         # for the decoder
         is_cross_attention = key_value_states is not None
-        bsz, tgt_len, embed_dim = hidden_states.size()
+
+        # Ignore the last dimension of `hidden_states`; `self.embed_dim` is used instead,
+        # because the input's last dimension cannot be relied on under tensor parallelism.
+        bsz, tgt_len, _ = hidden_states.size()
 
         # get query proj
         query_states = self.q_proj(hidden_states) * self.scaling
@@ -263,7 +266,13 @@ def forward(
 
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)
-        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
+
+        # Use `self.embed_dim` rather than the last dimension of `hidden_states`:
+        # with tensor parallelism, attn_output can be partitioned across GPUs,
+        # in which case the embedding dimension of the input is not equal to
+        # the last dimension of the attention output after the heads are
+        # merged.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
 
         attn_output = self.out_proj(attn_output)
 

diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py
@@ -188,7 +188,10 @@ def forward(
         # if key_value_states are provided this layer is used as a cross-attention layer
         # for the decoder
         is_cross_attention = key_value_states is not None
-        bsz, tgt_len, embed_dim = hidden_states.size()
+
+        # Ignore the last dimension of `hidden_states`; `self.embed_dim` is used instead,
+        # because the input's last dimension cannot be relied on under tensor parallelism.
+        bsz, tgt_len, _ = hidden_states.size()
 
         # get query proj
         query_states = self.q_proj(hidden_states) * self.scaling
@@ -274,7 +277,13 @@ def forward(
 
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)
-        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
+
+        # Use `self.embed_dim` rather than the last dimension of `hidden_states`:
+        # with tensor parallelism, attn_output can be partitioned across GPUs,
+        # in which case the embedding dimension of the input is not equal to
+        # the last dimension of the attention output after the heads are
+        # merged.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
 
         attn_output = self.out_proj(attn_output)
 

diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py
@@ -387,7 +387,10 @@ def forward(
         # if key_value_states are provided this layer is used as a cross-attention layer
         # for the decoder
         is_cross_attention = key_value_states is not None
-        bsz, tgt_len, embed_dim = hidden_states.size()
+
+        # Ignore the last dimension of `hidden_states`; `self.embed_dim` is used instead,
+        # because the input's last dimension cannot be relied on under tensor parallelism.
+        bsz, tgt_len, _ = hidden_states.size()
 
         # get query proj
         query_states = self.q_proj(hidden_states) * self.scaling
@@ -473,7 +476,13 @@ def forward(
 
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)
-        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
+
+        # Use `self.embed_dim` rather than the last dimension of `hidden_states`:
+        # with tensor parallelism, attn_output can be partitioned across GPUs,
+        # in which case the embedding dimension of the input is not equal to
+        # the last dimension of the attention output after the heads are
+        # merged.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
 
         attn_output = self.out_proj(attn_output)
 

diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py
@@ -255,7 +255,10 @@ def forward(
         # if key_value_states are provided this layer is used as a cross-attention layer
         # for the decoder
         is_cross_attention = key_value_states is not None
-        bsz, tgt_len, embed_dim = hidden_states.size()
+
+        # Ignore the last dimension of `hidden_states`; `self.embed_dim` is used instead,
+        # because the input's last dimension cannot be relied on under tensor parallelism.
+        bsz, tgt_len, _ = hidden_states.size()
 
         # get query proj
         query_states = self.q_proj(hidden_states) * self.scaling
@@ -341,7 +344,13 @@ def forward(
 
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)
-        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
+
+        # Use `self.embed_dim` rather than the last dimension of `hidden_states`:
+        # with tensor parallelism, attn_output can be partitioned across GPUs,
+        # in which case the embedding dimension of the input is not equal to
+        # the last dimension of the attention output after the heads are
+        # merged.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
 
         attn_output = self.out_proj(attn_output)
 

diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py
@@ -195,7 +195,10 @@ def forward(
         # if key_value_states are provided this layer is used as a cross-attention layer
         # for the decoder
         is_cross_attention = key_value_states is not None
-        bsz, tgt_len, embed_dim = hidden_states.size()
+
+        # Ignore the last dimension of `hidden_states`; `self.embed_dim` is used instead,
+        # because the input's last dimension cannot be relied on under tensor parallelism.
+        bsz, tgt_len, _ = hidden_states.size()
 
         # get query proj
         query_states = self.q_proj(hidden_states) * self.scaling
@@ -281,7 +284,13 @@ def forward(
 
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)
-        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
+
+        # Use `self.embed_dim` rather than the last dimension of `hidden_states`:
+        # with tensor parallelism, attn_output can be partitioned across GPUs,
+        # in which case the embedding dimension of the input is not equal to
+        # the last dimension of the attention output after the heads are
+        # merged.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
 
         attn_output = self.out_proj(attn_output)
 

diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py
@@ -456,7 +456,10 @@ def forward(
         # if key_value_states are provided this layer is used as a cross-attention layer
         # for the decoder
         is_cross_attention = key_value_states is not None
-        bsz, tgt_len, embed_dim = hidden_states.size()
+
+        # Ignore the last dimension of `hidden_states`; `self.embed_dim` is used instead,
+        # because the input's last dimension cannot be relied on under tensor parallelism.
+        bsz, tgt_len, _ = hidden_states.size()
 
         # get query proj
         query_states = self.q_proj(hidden_states) * self.scaling
@@ -542,7 +545,13 @@ def forward(
 
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)
-        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
+
+        # Use `self.embed_dim` rather than the last dimension of `hidden_states`:
+        # with tensor parallelism, attn_output can be partitioned across GPUs,
+        # in which case the embedding dimension of the input is not equal to
+        # the last dimension of the attention output after the heads are
+        # merged.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
 
         attn_output = self.out_proj(attn_output)
 

diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
@@ -457,7 +457,10 @@ def forward(
         # if key_value_states are provided this layer is used as a cross-attention layer
         # for the decoder
         is_cross_attention = key_value_states is not None
-        bsz, tgt_len, embed_dim = hidden_states.size()
+
+        # Ignore the last dimension of `hidden_states`; `self.embed_dim` is used instead,
+        # because the input's last dimension cannot be relied on under tensor parallelism.
+        bsz, tgt_len, _ = hidden_states.size()
 
         # get query proj
         query_states = self.q_proj(hidden_states) * self.scaling
@@ -543,7 +546,13 @@ def forward(
 
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)
-        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
+
+        # Use `self.embed_dim` rather than the last dimension of `hidden_states`:
+        # with tensor parallelism, attn_output can be partitioned across GPUs,
+        # in which case the embedding dimension of the input is not equal to
+        # the last dimension of the attention output after the heads are
+        # merged.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
 
         attn_output = self.out_proj(attn_output)