
fix save issue of deepspeed zero3 (#828)
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
sywangyi committed Nov 30, 2023
1 parent 464962e commit cf5ff82
Showing 3 changed files with 5 additions and 9 deletions.
8 changes: 2 additions & 6 deletions intel_extension_for_transformers/llm/finetuning/finetuning.py
@@ -553,15 +553,11 @@ def concatenate_data(dataset, max_seq_length):
             )
 
         trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
-        with training_args.main_process_first(desc="save model"):
-            if is_main_process(training_args.local_rank):
-                unwrapped_model = unwrap_model(model)
-                unwrapped_model.save_pretrained(
-                    training_args.output_dir, state_dict=unwrapped_model.state_dict()
-                )
+        trainer.save_model()
         if finetune_args.do_lm_eval and finetune_args.task == "code-generation":
             tokenizer.padding_side = "right"  # padding on the right is needed to cut off padding in `complete_code`
             tokenizer.truncation_side = "left"
+            unwrapped_model = unwrap_model(model)
             unwrapped_model.eval()
         class Eval_Args:
             n_samples = 20
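Why this hunk matters: under DeepSpeed ZeRO stage 3, each rank holds only a shard of every parameter, so unwrap_model(model).state_dict() on the main process yields partitioned placeholder tensors and the written checkpoint is unusable. Trainer.save_model() is DeepSpeed-aware and consolidates the shards before writing. Below is a minimal sketch of the corrected save path; the model name, output directory, and config path are illustrative assumptions, not taken from the commit.

# Minimal sketch: saving a ZeRO-3 model through the DeepSpeed-aware Trainer API.
# "gpt2", "out", and "ds_config.json" are placeholder assumptions.
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

model = AutoModelForCausalLM.from_pretrained("gpt2")
training_args = TrainingArguments(
    output_dir="out",
    deepspeed="ds_config.json",  # a ZeRO-3 config like the one in the next file
)
trainer = Trainer(model=model, args=training_args)

# trainer.train(...) would run here.
# save_model() must be reached by every rank: under ZeRO-3 it triggers a
# collective gather of the partitioned weights before rank 0 writes the files.
trainer.save_model()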
3 changes: 2 additions & 1 deletion (DeepSpeed ZeRO-3 JSON config)
@@ -10,6 +10,7 @@
"zero_optimization": {
"stage": 3,
"overlap_comm": false,
"contiguous_gradients": false
"contiguous_gradients": false,
"stage3_gather_16bit_weights_on_model_save": true
}
}
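The key addition is stage3_gather_16bit_weights_on_model_save: when true, DeepSpeed all-gathers the full 16-bit weights at save time so a standard, directly loadable checkpoint lands in output_dir; when false, only the partitioned ZeRO checkpoint is written and full weights must be reconstructed offline with the zero_to_fp32.py script DeepSpeed places in the checkpoint directory. The same config can be passed to transformers as a Python dict instead of a JSON file; in this sketch, only the last key is the commit's actual fix, and the rest mirrors the JSON above.

# ZeRO-3 config as a Python dict; TrainingArguments(deepspeed=...) accepts
# either a dict like this or a path to the JSON file shown above.
ds_config = {
    "zero_optimization": {
        "stage": 3,
        "overlap_comm": False,
        "contiguous_gradients": False,
        # Gather the full 16-bit weights onto rank 0 when save_model() runs,
        # instead of leaving only per-rank shards on disk.
        "stage3_gather_16bit_weights_on_model_save": True,
    },
}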
3 changes: 1 addition & 2 deletions (PEFT fine-tuning script)
@@ -475,5 +475,4 @@ def compute_metrics(eval_pred):
     trainer.train()
 
     trainer.model = trainer.model.merge_and_unload()
-    if trainer.is_world_process_zero():
-        trainer.save_model()
+    trainer.save_model()
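Dropping the is_world_process_zero() guard is deliberate: with ZeRO-3 the save is a collective operation, so if only rank 0 calls save_model() the weight gather can block waiting on peers that never arrive. Trainer guards the actual file writes internally, so calling it from every rank does not produce duplicate checkpoints. A short sketch of the corrected pattern, assuming trainer.model is a peft.PeftModel as the diff suggests:

# Fold the LoRA adapters back into the base weights, then save on ALL ranks.
trainer.model = trainer.model.merge_and_unload()
trainer.save_model()  # collective under ZeRO-3; Trainer still writes files on rank 0 only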
