From 60999c43cc0b49fecc1dda423a0632d76d010fa1 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Sat, 6 Nov 2021 01:07:37 +0500 Subject: [PATCH 1/6] minor modification to the wav2vec2 modeling file to support tensor-parallelism with DeepSpeed on this HuggingFace model --- .../models/wav2vec2/modeling_wav2vec2.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 6548f245f0e842..6c934f129ac2b5 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -493,7 +493,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This is required in case of using tensor-parallelism + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -579,7 +582,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the class's embed_dim rather than input's, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) From 656cf94b0305e350af670a3b70e540854238edeb Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Sat, 6 Nov 2021 02:09:04 +0500 Subject: [PATCH 2/6] refine the comments --- src/transformers/models/wav2vec2/modeling_wav2vec2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 6c934f129ac2b5..5a1f5ce207df47 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -495,7 +495,7 @@ def forward( is_cross_attention = key_value_states is not None # Use the class's parameter as the hidden_state's last dimension. - # This is required in case of using tensor-parallelism + # This dimension cannot be used in case of enabling tensor-parallelism. 
bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -583,7 +583,7 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the class's embed_dim rather than input's, this is due to + # Use the embed_dim from class rather than hidden_state, this is due to # the reason that attn_output can be partitioned across GPUs # when using tensor-parallelism, in which case the embed_dimension from # the input is not equal to the attention's last dimension after merging From bc9743a8f4ff98feeb094256e6df72dbfea6f6b0 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Sat, 6 Nov 2021 05:11:28 +0500 Subject: [PATCH 3/6] synch changes --- src/transformers/models/bart/modeling_bart.py | 13 +++++++++++-- .../bigbird_pegasus/modeling_bigbird_pegasus.py | 13 +++++++++++-- .../models/blenderbot/modeling_blenderbot.py | 13 +++++++++++-- .../blenderbot_small/modeling_blenderbot_small.py | 13 +++++++++++-- src/transformers/models/hubert/modeling_hubert.py | 13 +++++++++++-- src/transformers/models/m2m_100/modeling_m2m_100.py | 13 +++++++++++-- src/transformers/models/marian/modeling_marian.py | 13 +++++++++++-- src/transformers/models/mbart/modeling_mbart.py | 13 +++++++++++-- src/transformers/models/pegasus/modeling_pegasus.py | 13 +++++++++++-- src/transformers/models/sew/modeling_sew.py | 13 +++++++++++-- .../speech_to_text/modeling_speech_to_text.py | 13 +++++++++++-- .../speech_to_text_2/modeling_speech_to_text_2.py | 13 +++++++++++-- .../models/unispeech/modeling_unispeech.py | 13 +++++++++++-- .../models/unispeech_sat/modeling_unispeech_sat.py | 13 +++++++++++-- .../models/wav2vec2/modeling_wav2vec2.py | 2 +- 15 files changed, 155 insertions(+), 29 deletions(-) diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 8dc57874571a63..667cdb4ca08bb7 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -171,7 +171,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -257,7 +260,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. 
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 68d9ae0c0e5595..b3e02240217699 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -1244,7 +1244,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -1330,7 +1333,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index ceb32829cb271e..adc36056944982 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -173,7 +173,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -259,7 +262,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. 
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index a1009ae11fc338..c160c57bf7eedf 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -171,7 +171,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -257,7 +260,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 0d1cd112537144..952b0c6ffc5674 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -388,7 +388,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -474,7 +477,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py index b9e5dcbe7ba4ba..8bde44e1ca0e0b 100755 --- a/src/transformers/models/m2m_100/modeling_m2m_100.py +++ b/src/transformers/models/m2m_100/modeling_m2m_100.py @@ -242,7 +242,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. 
+ # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -328,7 +331,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index a0ac87ede93a76..f3f6721d37473e 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -188,7 +188,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -274,7 +277,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 17a8dde45da815..db42034624b3cd 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -177,7 +177,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -263,7 +266,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. 
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 00956ac7560ba6..f996fb8231f86e 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -188,7 +188,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -274,7 +277,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 1f01d1cf572d75..88752669c27022 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -387,7 +387,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -473,7 +476,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index 35a2efab30af42..8d463a3570c119 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -255,7 +255,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. 
+ # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -341,7 +344,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py index edcc5e486e8c39..e4d3505d5491a1 100755 --- a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py @@ -195,7 +195,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -281,7 +284,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index a8a89c302b75d1..1586f49abf9b51 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -456,7 +456,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. 
+ bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -542,7 +545,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index c5f8243bf11524..2cb5e6e1f47067 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -457,7 +457,10 @@ def forward( # if key_value_states are provided this layer is used as a cross-attention layer # for the decoder is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() + + # Use the class's parameter as the hidden_state's last dimension. + # This dimension cannot be used in case of enabling tensor-parallelism. + bsz, tgt_len, _ = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scaling @@ -543,7 +546,13 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + # Use the embed_dim from class rather than hidden_state, this is due to + # the reason that attn_output can be partitioned across GPUs + # when using tensor-parallelism, in which case the embed_dimension from + # the input is not equal to the attention's last dimension after merging + # heads. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 5a1f5ce207df47..874242df2acfc3 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -585,7 +585,7 @@ def forward( # Use the embed_dim from class rather than hidden_state, this is due to # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from + # when using tensor-parallelism, in which case the embed_dimension from # the input is not equal to the attention's last dimension after merging # heads. 
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) From 6d2aecf4b8257ef68fc48688a25ca6bccbb14d72 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Tue, 9 Nov 2021 05:22:20 +0500 Subject: [PATCH 4/6] fix comments --- src/transformers/models/bart/modeling_bart.py | 10 +++------- .../models/bigbird_pegasus/modeling_bigbird_pegasus.py | 10 +++------- .../models/blenderbot/modeling_blenderbot.py | 10 +++------- .../blenderbot_small/modeling_blenderbot_small.py | 10 +++------- src/transformers/models/hubert/modeling_hubert.py | 10 +++------- src/transformers/models/m2m_100/modeling_m2m_100.py | 10 +++------- src/transformers/models/marian/modeling_marian.py | 10 +++------- src/transformers/models/mbart/modeling_mbart.py | 10 +++------- src/transformers/models/pegasus/modeling_pegasus.py | 10 +++------- src/transformers/models/sew/modeling_sew.py | 10 +++------- .../models/speech_to_text/modeling_speech_to_text.py | 10 +++------- .../speech_to_text_2/modeling_speech_to_text_2.py | 10 +++------- .../models/unispeech/modeling_unispeech.py | 10 +++------- .../models/unispeech_sat/modeling_unispeech_sat.py | 10 +++------- src/transformers/models/wav2vec2/modeling_wav2vec2.py | 10 +++------- 15 files changed, 45 insertions(+), 105 deletions(-) diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 667cdb4ca08bb7..e59ac838126dd7 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -172,8 +172,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -261,11 +259,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index b3e02240217699..64ce62c595eb9d 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -1245,8 +1245,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. 
bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -1334,11 +1332,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index adc36056944982..25712213cc47b3 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -174,8 +174,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -263,11 +261,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index c160c57bf7eedf..4c37b73f371214 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -172,8 +172,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -261,11 +259,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. 
+ # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 952b0c6ffc5674..9b6fcbad75093b 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -389,8 +389,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -478,11 +476,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py index 8bde44e1ca0e0b..e64fa7172145b4 100755 --- a/src/transformers/models/m2m_100/modeling_m2m_100.py +++ b/src/transformers/models/m2m_100/modeling_m2m_100.py @@ -243,8 +243,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -332,11 +330,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. 
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index f3f6721d37473e..a9edc8eaae9439 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -189,8 +189,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -278,11 +276,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index db42034624b3cd..1b1c6e5c5c2e3b 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -178,8 +178,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -267,11 +265,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index f996fb8231f86e..cbcf8d4593fcf3 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -189,8 +189,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. 
bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -278,11 +276,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 88752669c27022..19136ba74d761a 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -388,8 +388,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -477,11 +475,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index 8d463a3570c119..7aef2c3f73c1b9 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -256,8 +256,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -345,11 +343,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. 
+ # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py index e4d3505d5491a1..e114edf1008070 100755 --- a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py @@ -196,8 +196,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -285,11 +283,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index 1586f49abf9b51..e2c2f73c33c3a9 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -457,8 +457,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -546,11 +544,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. 
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index 2cb5e6e1f47067..8ec7305f0aff0a 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -458,8 +458,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -547,11 +545,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 874242df2acfc3..d0048b1fc70592 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -494,8 +494,6 @@ def forward( # for the decoder is_cross_attention = key_value_states is not None - # Use the class's parameter as the hidden_state's last dimension. - # This dimension cannot be used in case of enabling tensor-parallelism. bsz, tgt_len, _ = hidden_states.size() # get query proj @@ -583,11 +581,9 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to - # the reason that attn_output can be partitioned across GPUs - # when using tensor-parallelism, in which case the embed_dimension from - # the input is not equal to the attention's last dimension after merging - # heads. + # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned + # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the + # attention's last dimension after merging heads. 
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) From 31190d7654af85274653875b01974922f3eccdb9 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Tue, 9 Nov 2021 05:26:51 +0500 Subject: [PATCH 5/6] refine comments --- src/transformers/models/bart/modeling_bart.py | 5 ++--- .../models/bigbird_pegasus/modeling_bigbird_pegasus.py | 5 ++--- src/transformers/models/blenderbot/modeling_blenderbot.py | 5 ++--- .../models/blenderbot_small/modeling_blenderbot_small.py | 5 ++--- src/transformers/models/hubert/modeling_hubert.py | 5 ++--- src/transformers/models/m2m_100/modeling_m2m_100.py | 5 ++--- src/transformers/models/marian/modeling_marian.py | 5 ++--- src/transformers/models/mbart/modeling_mbart.py | 5 ++--- src/transformers/models/pegasus/modeling_pegasus.py | 5 ++--- src/transformers/models/sew/modeling_sew.py | 5 ++--- .../models/speech_to_text/modeling_speech_to_text.py | 5 ++--- .../models/speech_to_text_2/modeling_speech_to_text_2.py | 5 ++--- src/transformers/models/unispeech/modeling_unispeech.py | 5 ++--- .../models/unispeech_sat/modeling_unispeech_sat.py | 5 ++--- src/transformers/models/wav2vec2/modeling_wav2vec2.py | 5 ++--- 15 files changed, 30 insertions(+), 45 deletions(-) diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index e59ac838126dd7..f963a12803535d 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -259,9 +259,8 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned - # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the - # attention's last dimension after merging heads. + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 64ce62c595eb9d..e22621c3d767d6 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -1332,9 +1332,8 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned - # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the - # attention's last dimension after merging heads. + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. 
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index 25712213cc47b3..1911cd9e954047 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -261,9 +261,8 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned - # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the - # attention's last dimension after merging heads. + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index 4c37b73f371214..26dd44d9f068e5 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -259,9 +259,8 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned - # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the - # attention's last dimension after merging heads. + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 9b6fcbad75093b..fcd0568372e799 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -476,9 +476,8 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned - # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the - # attention's last dimension after merging heads. + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. 
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py index e64fa7172145b4..1230bf01e78e8f 100755 --- a/src/transformers/models/m2m_100/modeling_m2m_100.py +++ b/src/transformers/models/m2m_100/modeling_m2m_100.py @@ -330,9 +330,8 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned - # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the - # attention's last dimension after merging heads. + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index a9edc8eaae9439..94f0f800bd55af 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -276,9 +276,8 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned - # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the - # attention's last dimension after merging heads. + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 1b1c6e5c5c2e3b..564030fb49c7a3 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -265,9 +265,8 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned - # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the - # attention's last dimension after merging heads. + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. 
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index cbcf8d4593fcf3..33a1ca14cb72c7 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -276,9 +276,8 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned - # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the - # attention's last dimension after merging heads. + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 19136ba74d761a..8e59609776da8f 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -475,9 +475,8 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned - # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the - # attention's last dimension after merging heads. + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) attn_output = self.out_proj(attn_output) diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index 7aef2c3f73c1b9..e631e75731c109 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -343,9 +343,8 @@ def forward( attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned - # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the - # attention's last dimension after merging heads. + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. 
         attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

         attn_output = self.out_proj(attn_output)
diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py
index e114edf1008070..306cacd48f9ac6 100755
--- a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py
+++ b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py
@@ -283,9 +283,8 @@ def forward(
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)

-        # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned
-        # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the
-        # attention's last dimension after merging heads.
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
         attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

         attn_output = self.out_proj(attn_output)
diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py
index e2c2f73c33c3a9..ad1946cd39d690 100755
--- a/src/transformers/models/unispeech/modeling_unispeech.py
+++ b/src/transformers/models/unispeech/modeling_unispeech.py
@@ -544,9 +544,8 @@ def forward(
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)

-        # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned
-        # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the
-        # attention's last dimension after merging heads.
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
         attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

         attn_output = self.out_proj(attn_output)
diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
index 8ec7305f0aff0a..4f8c3c91c3d12b 100755
--- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
+++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
@@ -545,9 +545,8 @@ def forward(
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)

-        # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned
-        # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the
-        # attention's last dimension after merging heads.
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
         attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

         attn_output = self.out_proj(attn_output)
diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
index d0048b1fc70592..e7c72bd8718d9e 100755
--- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
@@ -581,9 +581,8 @@ def forward(
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)

-        # Use the embed_dim from class rather than hidden_state, this is due to the reason that attn_output can be partitioned
-        # across GPUs when using tensor-parallelism, in which case the embed_dimension from the input is not equal to the
-        # attention's last dimension after merging heads.
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
         attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

         attn_output = self.out_proj(attn_output)

From 5570c3cb15fd5d8aa2c5e140039193c362e79f26 Mon Sep 17 00:00:00 2001
From: Reza Yazdani
Date: Tue, 9 Nov 2021 06:44:49 +0500
Subject: [PATCH 6/6] fix format

---
 src/transformers/models/bart/modeling_bart.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py
index f963a12803535d..f479a9069b0c04 100755
--- a/src/transformers/models/bart/modeling_bart.py
+++ b/src/transformers/models/bart/modeling_bart.py
@@ -259,7 +259,7 @@ def forward(
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)

-        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be 
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
         # partitioned across GPUs when using tensor-parallelism.
         attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
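
For reference, the following is a minimal, self-contained sketch, not part of the patch series, of why the reshape must use the attention module's own `embed_dim` rather than the last dimension of `hidden_states` under tensor-parallelism: a tensor-parallel engine rewrites the injected module's `num_heads` and `embed_dim` to per-partition values, while the input keeps the full model dimension. The tensor sizes and the `tp_world_size` name are illustrative assumptions, not values taken from DeepSpeed or the patched models.

import torch

bsz, tgt_len = 2, 5
embed_dim, num_heads = 16, 4      # full model sizes
tp_world_size = 2                 # assumed tensor-parallel degree (illustrative)

head_dim = embed_dim // num_heads             # 4, unchanged by sharding
local_heads = num_heads // tp_world_size      # 2 heads held on this rank
local_embed_dim = local_heads * head_dim      # 8, what self.embed_dim would be set to after injection

# Every rank still sees the full hidden size on its input ...
hidden_states = torch.randn(bsz, tgt_len, embed_dim)
# ... but its attention output only covers the locally held heads.
attn_output = torch.randn(bsz, local_heads, tgt_len, head_dim).transpose(1, 2)

# Reshaping with the module's (per-partition) embed_dim works:
merged = attn_output.reshape(bsz, tgt_len, local_embed_dim)
print(merged.shape)  # torch.Size([2, 5, 8])

# Reshaping with the dimension read off hidden_states does not:
try:
    attn_output.reshape(bsz, tgt_len, hidden_states.size(-1))
except RuntimeError as err:
    print("reshape with the input's last dimension fails:", err)

This is also why the patched forward passes unpack only `bsz` and `tgt_len` from `hidden_states.size()`: binding the third value would tempt later code to reuse the full model dimension where only the per-partition one is valid.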