[shardformer] update gpt2 (#5502)
wangbluo committed Apr 3, 2024
1 parent 7686f4e commit fd44440
Showing 1 changed file with 6 additions and 14 deletions.
colossalai/shardformer/modeling/gpt2.py (20 changes: 6 additions & 14 deletions)
```diff
@@ -139,11 +139,9 @@ def gpt2_model_forward(
     head_mask = self.get_head_mask(head_mask, self.config.n_layer)
 
     if stage_manager.is_first_stage():
-        if position_ids is not None:
-            position_ids = position_ids.view(-1, input_shape[-1])
-        else:
+        if position_ids is None:
             position_ids = torch.arange(0, input_shape[-1], dtype=torch.long, device=device)
-            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
+            position_ids = position_ids.unsqueeze(0)
 
     if inputs_embeds is None:
         inputs_embeds = self.wte(input_ids)
```
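The first hunk simplifies position_ids handling on the first pipeline stage: the old code force-reshaped caller-supplied ids with `.view(-1, input_shape[-1])` and only built a fresh range in the `else` branch, while the new code passes user-provided ids through untouched and synthesizes a `(1, seq_len)` range only when none are given, matching upstream `transformers`' `GPT2Model`. A minimal standalone sketch of the new behavior (the helper name `make_position_ids` and the stripped-down surroundings are illustrative, not part of the commit):

```python
import torch

def make_position_ids(position_ids, input_shape, device):
    # Only synthesize position ids when the caller did not supply any;
    # caller-provided ids are now passed through untouched.
    if position_ids is None:
        position_ids = torch.arange(0, input_shape[-1], dtype=torch.long, device=device)
        position_ids = position_ids.unsqueeze(0)  # shape (1, seq_len), broadcast over batch
    return position_ids

# Example: a batch of 2 sequences of length 5 with no explicit positions.
ids = make_position_ids(None, torch.Size([2, 5]), torch.device("cpu"))
print(ids)  # tensor([[0, 1, 2, 3, 4]])
```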
```diff
@@ -188,22 +186,16 @@ def gpt2_model_forward(
             all_hidden_states = all_hidden_states + (hidden_states,)
 
         if self.gradient_checkpointing and self.training:
-
-            def create_custom_forward(module):
-                def custom_forward(*inputs):
-                    # None for past_key_value
-                    return module(*inputs, use_cache, output_attentions)
-
-                return custom_forward
-
-            outputs = torch.utils.checkpoint.checkpoint(
-                create_custom_forward(block),
+            outputs = self._gradient_checkpointing_func(
+                block.__call__,
                 hidden_states,
                 None,
                 attention_mask,
                 head_mask[i],
                 encoder_hidden_states,
                 encoder_attention_mask,
+                use_cache,
+                output_attentions,
             )
         else:
             outputs = block(
```
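The second hunk replaces the hand-rolled `create_custom_forward` closure around `torch.utils.checkpoint.checkpoint` with `self._gradient_checkpointing_func`, the callable that `transformers` (v4.35+) installs on the model when `gradient_checkpointing_enable()` is called. Passing `block.__call__` rather than `block.forward` keeps `nn.Module` hooks in the checkpointed call path, and `use_cache` / `output_attentions` become ordinary positional arguments instead of values captured by the closure. A hedged sketch of why the two call styles compute the same thing (`toy_block` is a hypothetical stand-in for `GPT2Block`, not the real signature; `use_reentrant=False` assumes a reasonably recent torch):

```python
import torch
import torch.utils.checkpoint as cp

def toy_block(hidden_states, use_cache, output_attentions):
    # Stand-in for GPT2Block.forward; the flags just alter the math trivially.
    out = hidden_states * 2.0
    if output_attentions:
        out = out + 1.0
    return out

hidden_states = torch.randn(2, 5, 8, requires_grad=True)

# Old pattern: a closure captures the trailing flags before checkpointing.
def create_custom_forward(module):
    def custom_forward(*inputs):
        return module(*inputs, False, True)
    return custom_forward

old_out = cp.checkpoint(create_custom_forward(toy_block), hidden_states,
                        use_reentrant=False)

# New pattern: the flags ride along as plain positional arguments.
new_out = cp.checkpoint(toy_block, hidden_states, False, True,
                        use_reentrant=False)

assert torch.equal(old_out, new_out)
new_out.sum().backward()  # recomputation still yields gradients as usual
```

Dropping the closure removes per-iteration function allocation and keeps the shardformer forward aligned with the upstream GPT-2 implementation this code overrides.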
