From 60999c43cc0b49fecc1dda423a0632d76d010fa1 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Sat, 6 Nov 2021 01:07:37 +0500 Subject: [PATCH 1/6] minor modification to the wav2vec2 modeling file to support tensor-parallelism with DeepSpeed on this HuggingFace model --- .../models/wav2vec2/modeling_wav2vec2.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 6548f245f0e842..6c934f129ac2b5 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -493,7 +493,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This is required in case of using tensor-parallelism + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -579,7 +582,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the class's embed_dim rather than input's, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) From 656cf94b0305e350af670a3b70e540854238edeb Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Sat, 6 Nov 2021 02:09:04 +0500 Subject: [PATCH 2/6] refine the comments --- src/transformers/models/wav2vec2/modeling_wav2vec2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 6c934f129ac2b5..5a1f5ce207df47 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -495,7 +495,7 @@ def forward( is_cross_attention = key_value_states is not None # Use the class's parameter as the hidden_state's last dimension. - # This is required in case of using tensor-parallelism + # This dimension cannot be used in case of enabling tensor-parallelism. 
bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -583,7 +583,7 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the class's embed_dim rather than input's, this is due to + # Use the embed_dim from class rather than hidden_state, this is due to # the reason that attn_output can be partitioned across GPUs # when using tensor-parallelism, in which case the embed_dimension from # the input is not equal to the attention's last dimension after merging From bc9743a8f4ff98feeb094256e6df72dbfea6f6b0 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Sat, 6 Nov 2021 05:11:28 +0500 Subject: [PATCH 3/6] synch changes --- src/transformers/models/bart/modeling_bart.py | 13 +++++++++++-- .../bigbird_pegasus/modeling_bigbird_pegasus.py | 13 +++++++++++-- .../models/blenderbot/modeling_blenderbot.py | 13 +++++++++++-- .../blenderbot_small/modeling_blenderbot_small.py | 13 +++++++++++-- src/transformers/models/hubert/modeling_hubert.py | 13 +++++++++++-- src/transformers/models/m2m_100/modeling_m2m_100.py | 13 +++++++++++-- src/transformers/models/marian/modeling_marian.py | 13 +++++++++++-- src/transformers/models/mbart/modeling_mbart.py | 13 +++++++++++-- src/transformers/models/pegasus/modeling_pegasus.py | 13 +++++++++++-- src/transformers/models/sew/modeling_sew.py | 13 +++++++++++-- .../speech_to_text/modeling_speech_to_text.py | 13 +++++++++++-- .../speech_to_text_2/modeling_speech_to_text_2.py | 13 +++++++++++-- .../models/unispeech/modeling_unispeech.py | 13 +++++++++++-- .../models/unispeech_sat/modeling_unispeech_sat.py | 13 +++++++++++-- .../models/wav2vec2/modeling_wav2vec2.py | 2 +- 15 files changed, 155 insertions(+), 29 deletions(-) diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 8dc57874571a63..667cdb4ca08bb7 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -171,7 +171,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -257,7 +260,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. 
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 68d9ae0c0e5595..b3e02240217699 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -1244,7 +1244,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -1330,7 +1333,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index ceb32829cb271e..adc36056944982 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -173,7 +173,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -259,7 +262,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. 
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index a1009ae11fc338..c160c57bf7eedf 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -171,7 +171,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -257,7 +260,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 0d1cd112537144..952b0c6ffc5674 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -388,7 +388,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -474,7 +477,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py index b9e5dcbe7ba4ba..8bde44e1ca0e0b 100755 --- a/src/transformers/models/m2m_100/modeling_m2m_100.py +++ b/src/transformers/models/m2m_100/modeling_m2m_100.py @@ -242,7 +242,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. 
+ # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -328,7 +331,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index a0ac87ede93a76..f3f6721d37473e 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -188,7 +188,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -274,7 +277,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 17a8dde45da815..db42034624b3cd 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -177,7 +177,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -263,7 +266,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. 
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 00956ac7560ba6..f996fb8231f86e 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -188,7 +188,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -274,7 +277,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 1f01d1cf572d75..88752669c27022 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -387,7 +387,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -473,7 +476,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index 35a2efab30af42..8d463a3570c119 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -255,7 +255,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. 
+ # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -341,7 +344,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py index edcc5e486e8c39..e4d3505d5491a1 100755 --- a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py @@ -195,7 +195,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -281,7 +284,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index a8a89c302b75d1..1586f49abf9b51 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -456,7 +456,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. 
+ bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -542,7 +545,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index c5f8243bf11524..2cb5e6e1f47067 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -457,7 +457,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -543,7 +546,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 5a1f5ce207df47..874242df2acfc3 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -585,7 +585,7 @@ def forward( # Use the embed_dim from class rather than hidden_state, this is due to # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from + # when using tensor-parallelism, in which case the embed_dimension from # the input is not equal to the attention's last dimension after merging # heads. 
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) From 6d2aecf4b8257ef68fc48688a25ca6bccbb14d72 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Tue, 9 Nov 2021 05:22:20 +0500 Subject: [PATCH 4/6] fix comments --- src/transformers/models/bart/modeling_bart.py | 10 +++------- .../models/bigbird_pegasus/modeling_bigbird_pegasus.py | 10 +++------- .../models/blenderbot/modeling_blenderbot.py | 10 +++------- .../blenderbot_small/modeling_blenderbot_small.py | 10 +++------- src/transformers/models/hubert/modeling_hubert.py | 10 +++------- src/transformers/models/m2m_100/modeling_m2m_100.py | 10 +++------- src/transformers/models/marian/modeling_marian.py | 10 +++------- src/transformers/models/mbart/modeling_mbart.py | 10 +++------- src/transformers/models/pegasus/modeling_pegasus.py | 10 +++------- src/transformers/models/sew/modeling_sew.py | 10 +++------- .../models/speech_to_text/modeling_speech_to_text.py | 10 +++------- .../speech_to_text_2/modeling_speech_to_text_2.py | 10 +++------- .../models/unispeech/modeling_unispeech.py | 10 +++------- .../models/unispeech_sat/modeling_unispeech_sat.py | 10 +++------- src/transformers/models/wav2vec2/modeling_wav2vec2.py | 10 +++------- 15 files changed, 45 insertions(+), 105 deletions(-) diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 667cdb4ca08bb7..e59ac838126dd7 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -172,8 +172,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -261,11 +259,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index b3e02240217699..64ce62c595eb9d 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -1245,8 +1245,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. 
bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -1334,11 +1332,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index adc36056944982..25712213cc47b3 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -174,8 +174,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -263,11 +261,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index c160c57bf7eedf..4c37b73f371214 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -172,8 +172,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -261,11 +259,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. 
+ # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 952b0c6ffc5674..9b6fcbad75093b 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -389,8 +389,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -478,11 +476,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py index 8bde44e1ca0e0b..e64fa7172145b4 100755 --- a/src/transformers/models/m2m_100/modeling_m2m_100.py +++ b/src/transformers/models/m2m_100/modeling_m2m_100.py @@ -243,8 +243,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -332,11 +330,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. 
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index f3f6721d37473e..a9edc8eaae9439 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -189,8 +189,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -278,11 +276,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index db42034624b3cd..1b1c6e5c5c2e3b 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -178,8 +178,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -267,11 +265,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index f996fb8231f86e..cbcf8d4593fcf3 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -189,8 +189,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. 
bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -278,11 +276,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 88752669c27022..19136ba74d761a 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -388,8 +388,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -477,11 +475,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index 8d463a3570c119..7aef2c3f73c1b9 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -256,8 +256,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -345,11 +343,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. 
+ # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py index e4d3505d5491a1..e114edf1008070 100755 --- a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py @@ -196,8 +196,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -285,11 +283,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index 1586f49abf9b51..e2c2f73c33c3a9 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -457,8 +457,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -546,11 +544,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. 
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index 2cb5e6e1f47067..8ec7305f0aff0a 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -458,8 +458,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -547,11 +545,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 874242df2acfc3..d0048b1fc70592 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -494,8 +494,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -583,11 +581,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. 
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) From 31190d7654af85274653875b01974922f3eccdb9 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Tue, 9 Nov 2021 05:26:51 +0500 Subject: [PATCH 5/6] refine comments --- src/transformers/models/bart/modeling_bart.py | 5 ++--- .../models/bigbird_pegasus/modeling_bigbird_pegasus.py | 5 ++--- src/transformers/models/blenderbot/modeling_blenderbot.py | 5 ++--- .../models/blenderbot_small/modeling_blenderbot_small.py | 5 ++--- src/transformers/models/hubert/modeling_hubert.py | 5 ++--- src/transformers/models/m2m_100/modeling_m2m_100.py | 5 ++--- src/transformers/models/marian/modeling_marian.py | 5 ++--- src/transformers/models/mbart/modeling_mbart.py | 5 ++--- src/transformers/models/pegasus/modeling_pegasus.py | 5 ++--- src/transformers/models/sew/modeling_sew.py | 5 ++--- .../models/speech_to_text/modeling_speech_to_text.py | 5 ++--- .../models/speech_to_text_2/modeling_speech_to_text_2.py | 5 ++--- src/transformers/models/unispeech/modeling_unispeech.py | 5 ++--- .../models/unispeech_sat/modeling_unispeech_sat.py | 5 ++--- src/transformers/models/wav2vec2/modeling_wav2vec2.py | 5 ++--- 15 files changed, 30 insertions(+), 45 deletions(-) diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index e59ac838126dd7..f963a12803535d 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -259,9 +259,8 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned - # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the - # attention's last dimension after merging heads. + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 64ce62c595eb9d..e22621c3d767d6 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -1332,9 +1332,8 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned - # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the - # attention's last dimension after merging heads. + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. 
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index 25712213cc47b3..1911cd9e954047 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -261,9 +261,8 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned - # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the - # attention's last dimension after merging heads. + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index 4c37b73f371214..26dd44d9f068e5 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -259,9 +259,8 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned - # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the - # attention's last dimension after merging heads. + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 9b6fcbad75093b..fcd0568372e799 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -476,9 +476,8 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned - # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the - # attention's last dimension after merging heads. + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. 
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py index e64fa7172145b4..1230bf01e78e8f 100755 --- a/src/transformers/models/m2m_100/modeling_m2m_100.py +++ b/src/transformers/models/m2m_100/modeling_m2m_100.py @@ -330,9 +330,8 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned - # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the - # attention's last dimension after merging heads. + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index a9edc8eaae9439..94f0f800bd55af 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -276,9 +276,8 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned - # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the - # attention's last dimension after merging heads. + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 1b1c6e5c5c2e3b..564030fb49c7a3 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -265,9 +265,8 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned - # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the - # attention's last dimension after merging heads. + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. 
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index cbcf8d4593fcf3..33a1ca14cb72c7 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -276,9 +276,8 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned - # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the - # attention's last dimension after merging heads. + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 19136ba74d761a..8e59609776da8f 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -475,9 +475,8 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned - # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the - # attention's last dimension after merging heads. + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index 7aef2c3f73c1b9..e631e75731c109 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -343,9 +343,8 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned - # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the - # attention's last dimension after merging heads. + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. 
         attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

         attn_output = self.out_proj(attn_output)
diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py
index e114edf1008070..306cacd48f9ac6 100755
--- a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py
+++ b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py
@@ -283,9 +283,8 @@ def forward(
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)

-        # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned
-        # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the
-        # attention's last dimension after merging heads.
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
         attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

         attn_output = self.out_proj(attn_output)
diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py
index e2c2f73c33c3a9..ad1946cd39d690 100755
--- a/src/transformers/models/unispeech/modeling_unispeech.py
+++ b/src/transformers/models/unispeech/modeling_unispeech.py
@@ -544,9 +544,8 @@ def forward(
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)

-        # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned
-        # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the
-        # attention's last dimension after merging heads.
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
         attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

         attn_output = self.out_proj(attn_output)
diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
index 8ec7305f0aff0a..4f8c3c91c3d12b 100755
--- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
+++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
@@ -545,9 +545,8 @@ def forward(
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)

-        # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned
-        # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the
-        # attention's last dimension after merging heads.
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
         attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

         attn_output = self.out_proj(attn_output)
diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
index d0048b1fc70592..e7c72bd8718d9e 100755
--- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
@@ -581,9 +581,8 @@ def forward(
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)

-        # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned
-        # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the
-        # attention's last dimension after merging heads.
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
         attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

         attn_output = self.out_proj(attn_output)

From 5570c3cb15fd5d8aa2c5e140039193c362e79f26 Mon Sep 17 00:00:00 2001
From: Reza Yazdani
Date: Tue, 9 Nov 2021 06:44:49 +0500
Subject: [PATCH 6/6] fix format

---
 src/transformers/models/bart/modeling_bart.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py
index f963a12803535d..f479a9069b0c04 100755
--- a/src/transformers/models/bart/modeling_bart.py
+++ b/src/transformers/models/bart/modeling_bart.py
@@ -259,7 +259,7 @@ def forward(
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)

-        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be 
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
         # partitioned across GPUs when using tensor-parallelism.
         attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
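
For reference, the following is a minimal, self-contained sketch, not part of the patch series, of why the reshape must use the attention module's own `embed_dim` rather than the last dimension of `hidden_states` under tensor-parallelism: a tensor-parallel engine rewrites the injected module's `num_heads` and `embed_dim` to per-partition values, while the input keeps the full model dimension. The tensor sizes and the `tp_world_size` name are illustrative assumptions, not values taken from DeepSpeed or the patched models.

import torch

bsz, tgt_len = 2, 5
embed_dim, num_heads = 16, 4      # full model sizes
tp_world_size = 2                 # assumed tensor-parallel degree (illustrative)

head_dim = embed_dim // num_heads             # 4, unchanged by sharding
local_heads = num_heads // tp_world_size      # 2 heads held on this rank
local_embed_dim = local_heads * head_dim      # 8, what self.embed_dim would be set to after injection

# Every rank still sees the full hidden size on its input ...
hidden_states = torch.randn(bsz, tgt_len, embed_dim)
# ... but its attention output only covers the locally held heads.
attn_output = torch.randn(bsz, local_heads, tgt_len, head_dim).transpose(1, 2)

# Reshaping with the module's (per-partition) embed_dim works:
merged = attn_output.reshape(bsz, tgt_len, local_embed_dim)
print(merged.shape)  # torch.Size([2, 5, 8])

# Reshaping with the dimension read off hidden_states does not:
try:
    attn_output.reshape(bsz, tgt_len, hidden_states.size(-1))
except RuntimeError as err:
    print("reshape with the input's last dimension fails:", err)

This is also why the patched forward passes unpack only `bsz` and `tgt_len` from `hidden_states.size()`: binding the third value would tempt later code to reuse the full model dimension where only the per-partition one is valid.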