Skip to content

Commit

Permalink
try appending version to already saved ckpt_file
Browse files (browse the repository at this point in the history)
  • Loading branch information
rohitgr7 committed Dec 1, 2020
1 parent 41ae295 commit 710e05e
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 7 deletions.
28 changes: 22 additions & 6 deletions pytorch_lightning/callbacks/model_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,8 +504,24 @@ def _get_metric_interpolated_filepath_name(
) -> str:
filepath = self.format_checkpoint_name(epoch, step, ckpt_name_metrics)

version_cnt = 0
while self._fs.exists(filepath) and filepath != del_filepath:
version_cnt = 1
old_ckpt_ver_0 = self.format_checkpoint_name(epoch, step, ckpt_name_metrics, ver=0)
while (
self._fs.exists(filepath)
or (self._fs.exists(old_ckpt_ver_0) and version_cnt == 1)
):
if del_filepath == filepath:
return filepath

if del_filepath == old_ckpt_ver_0:
return old_ckpt_ver_0

if self._fs.exists(filepath):
self._fs.rename(filepath, old_ckpt_ver_0)
old_ckpt_score = self.best_k_models[filepath]
self.best_k_models.pop(filepath)
self.best_k_models[old_ckpt_ver_0] = old_ckpt_score

filepath = self.format_checkpoint_name(epoch, step, ckpt_name_metrics, ver=version_cnt)
version_cnt += 1

Expand All @@ -523,10 +539,6 @@ def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics):
if not should_save_last:
return

last_filepath = self._get_metric_interpolated_filepath_name(
ckpt_name_metrics, trainer.current_epoch, trainer.global_step
)

# when the user ALSO asked for 'last.ckpt', change the name
if self.save_last:
last_filepath = self._format_checkpoint_name(
Expand All @@ -537,6 +549,10 @@ def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics):
prefix=self.prefix
)
last_filepath = os.path.join(self.dirpath, f"{last_filepath}.ckpt")
else:
last_filepath = self._get_metric_interpolated_filepath_name(
ckpt_name_metrics, trainer.current_epoch, trainer.global_step
)

self._save_model(last_filepath, trainer, pl_module)
if (
Expand Down
5 changes: 4 additions & 1 deletion tests/checkpointing/test_model_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -993,7 +993,7 @@ def __init__(self, hparams):
'save_top_k, expected',
[
(1, ['curr_epoch.ckpt']),
(2, ['curr_epoch.ckpt', 'curr_epoch-v0.ckpt']),
(2, ['curr_epoch-v0.ckpt', 'curr_epoch-v1.ckpt']),
]
)
def test_model_checkpoint_file_already_exists(tmpdir, max_epochs, save_top_k, expected):
Expand Down Expand Up @@ -1026,3 +1026,6 @@ def validation_epoch_end(self, outputs):
trainer.fit(model)
ckpt_files = os.listdir(tmpdir)
assert set(ckpt_files) == set(expected)

expected_epoch_in_files = sorted([pl_load(os.path.join(tmpdir, f))['epoch'] - 1 for f in ckpt_files])
assert expected_epoch_in_files == sorted(list(range(max_epochs))[-save_top_k:])

0 comments on commit 710e05e

Please sign in to comment.