Fix issues with PL 1.8 (NVIDIA#5353)
* Fix issues with PL 1.8

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>

* Set scripting variable

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>

* Fix missing arg

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>

* Cleanup list

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>

* Fix reference

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>

* Try to fix hanging EMA test

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>

* Missing \

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>

* Add strategy

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>

* See if setting the chdir fixes the hanging DDP test

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>

* See if removing the subdir setter fixes the issue

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>

* Remove checks

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>

* Try [0,1] for devices

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>

* Add code back

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>

* Remove space

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>

* Update requirements

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>

* Swap import path

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>

* Update references

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>

* Fix deprecated variables

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>

* Fix missing var

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>

* Fix var

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>

* Revert changes

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>

* Address review

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>
Co-authored-by: Oleksii Kuchaiev <okuchaiev@users.noreply.github.com>
SeanNaren and okuchaiev committed Nov 28, 2022
1 parent 21b088b commit d349f4a
Showing 39 changed files with 66 additions and 94 deletions.
@@ -13,11 +13,11 @@
# limitations under the License.


+from lightning_lite.plugins.environments import TorchElasticEnvironment
from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelSummary
from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector

from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel
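
Note: the same one-line swap repeats across the example scripts that follow. With PL 1.8, NeMo imports TorchElasticEnvironment from lightning_lite rather than from pytorch_lightning.plugins.environments.torchelastic_environment. A minimal sketch of how such a cluster-environment plugin is typically handed to the Trainer; the env-var guard and Trainer arguments below are illustrative, not taken from this diff:

import os

from lightning_lite.plugins.environments import TorchElasticEnvironment  # new PL 1.8 location
from pytorch_lightning import Trainer

plugins = []
# Register the torchelastic cluster environment only when the script was launched
# via torchrun/torchelastic, which sets TORCHELASTIC_RUN_ID (and RANK, WORLD_SIZE, ...).
if "TORCHELASTIC_RUN_ID" in os.environ:
    plugins.append(TorchElasticEnvironment())

trainer = Trainer(accelerator="cpu", devices=1, plugins=plugins)
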
@@ -12,10 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+from lightning_lite.plugins.environments import TorchElasticEnvironment
from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector

from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel
2 changes: 1 addition & 1 deletion in examples/nlp/language_modeling/megatron_ckpt_to_nemo.py
@@ -29,7 +29,7 @@

import torch
from apex.transformer import parallel_state
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
+from lightning_lite.plugins.environments import TorchElasticEnvironment
from pytorch_lightning.trainer.trainer import Trainer

from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel
2 changes: 1 addition & 1 deletion in examples/nlp/language_modeling/megatron_gpt_pretraining.py
@@ -13,10 +13,10 @@
# limitations under the License.


+from lightning_lite.plugins.environments import TorchElasticEnvironment
from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector

from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
@@ -12,10 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+from lightning_lite.plugins.environments import TorchElasticEnvironment
from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment

from nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model import (
MegatronGPTPromptLearningModel,
2 changes: 1 addition & 1 deletion in examples/nlp/language_modeling/megatron_retro_cal_shape.py
@@ -12,9 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+from lightning_lite.plugins.environments import TorchElasticEnvironment
from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin

from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel
@@ -12,10 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+from lightning_lite.plugins.environments import TorchElasticEnvironment
from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin
from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector

@@ -12,10 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+from lightning_lite.plugins.environments import TorchElasticEnvironment
from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin
from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector

@@ -13,11 +13,11 @@
# limitations under the License.


+from lightning_lite.plugins.environments import TorchElasticEnvironment
from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelSummary
from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector

from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model
2 changes: 1 addition & 1 deletion in examples/nlp/language_modeling/megatron_t5_pretraining.py
@@ -13,11 +13,11 @@
# limitations under the License.


+from lightning_lite.plugins.environments import TorchElasticEnvironment
from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelSummary
from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector

from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model
@@ -12,10 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+from lightning_lite.plugins.environments import TorchElasticEnvironment
from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment

from nemo.collections.nlp.models.language_modeling.megatron_t5_prompt_learning_model import (
MegatronT5PromptLearningModel,
2 changes: 1 addition & 1 deletion in examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py
@@ -12,11 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+from lightning_lite.plugins.environments import TorchElasticEnvironment
from megatron_t5_seq2seq_finetune import load_from_checkpoint_dir, load_from_nemo, validate_checkpoint_loading_args
from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin

from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel
@@ -15,10 +15,10 @@
import os
import tempfile

+from lightning_lite.plugins.environments import TorchElasticEnvironment
from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector

from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel
@@ -13,10 +13,10 @@
# limitations under the License.


+from lightning_lite.plugins.environments import TorchElasticEnvironment
from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment

from nemo.collections.nlp.models.language_modeling.megatron_gpt_adapter_model import MegatronGPTAdapterLearningModel
from nemo.collections.nlp.parts.nlp_overrides import (
@@ -13,10 +13,10 @@
# limitations under the License.


+from lightning_lite.plugins.environments import TorchElasticEnvironment
from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment

from nemo.collections.nlp.models.language_modeling.megatron_gpt_adapter_model import MegatronGPTInfusedAdapterModel
from nemo.collections.nlp.parts.nlp_overrides import (
@@ -13,10 +13,10 @@
# limitations under the License.


+from lightning_lite.plugins.environments import TorchElasticEnvironment
from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment

from nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model import MegatronT5AdapterLearningModel
from nemo.collections.nlp.parts.nlp_overrides import (
@@ -13,10 +13,10 @@
# limitations under the License.


+from lightning_lite.plugins.environments import TorchElasticEnvironment
from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment

from nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model import MegatronT5InfusedAdapterModel
from nemo.collections.nlp.parts.nlp_overrides import (
2 changes: 1 addition & 1 deletion in examples/nlp/machine_translation/megatron_nmt_training.py
@@ -13,11 +13,11 @@
# limitations under the License.


+from lightning_lite.plugins.environments import TorchElasticEnvironment
from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelSummary
from pytorch_lightning.callbacks.timer import Timer
-from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment
from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector

from nemo.collections.nlp.models.language_modeling.megatron_bart_model import MegatronBARTModel
@@ -710,7 +710,7 @@ def prepare_data(self):
        self.data_prepared = True

    def setup(self, stage=None):
-        super().setup()
+        super().setup(stage)
        if self.cfg.library == "megatron" and self.prompt_learning and stage == "fit":
            if self.cfg.virtual_prompt_style == VirtualPromptStyle.PROMPT_TUNING:
                self.language_model.init_new_prompts()
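
Note: the "Fix missing arg" change above simply forwards the stage argument to the parent hook; PL calls setup(stage=...) on every module, and dropping the argument in the super() call loses information the base class may need (the models here check it, e.g. stage == "fit", before initializing prompt-learning state). A rough sketch of the corrected pattern; the class name is illustrative, not the NeMo model:

import pytorch_lightning as pl


class ExampleModule(pl.LightningModule):  # illustrative
    def setup(self, stage=None):
        # Forward "fit" / "validate" / "test" / "predict" to the parent setup hook
        # instead of silently discarding it.
        super().setup(stage)
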
@@ -216,7 +216,7 @@ def mask_and_reduce_loss(self, loss_mask, output_tensor):
        return loss

    def setup(self, stage=None):
-        super().setup()
+        super().setup(stage)
        if self.cfg.library == "megatron" and self.prompt_learning:
            self.language_model.init_new_prompts()

2 changes: 1 addition & 1 deletion in nemo/collections/nlp/models/dialogue/sgdqa_model.py
@@ -226,7 +226,7 @@ def eval_step_helper(self, batch: List[torch.Tensor]):
        all_start_char_idx = []
        all_end_char_idx = []

-        if self.trainer.devices and self.trainer.world_size > 1:
+        if self.trainer.num_devices and self.trainer.world_size > 1:
            world_size = self.trainer.world_size
            for ind in range(world_size):
                all_example_id_num.append(torch.empty_like(example_id_num))
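
Note: PL deprecated the old trainer.devices attribute in favor of trainer.num_devices (the per-node device count as a plain int) and trainer.device_ids, so the guard above moves to num_devices. A tiny illustrative check; the Trainer arguments are placeholders:

from pytorch_lightning import Trainer

trainer = Trainer(accelerator="cpu", devices=1)  # placeholder settings
# num_devices is an int (devices per node); world_size counts all ranks across nodes.
if trainer.num_devices and trainer.world_size > 1:
    print(f"would all_gather tensors across {trainer.world_size} ranks")
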
@@ -302,16 +302,16 @@ def on_train_batch_end(self, outputs, batch, batch_idx: int, unused: Optional[in
        # If the grad scaler skipped its optimizer step due to infs/nans,
        # decrement the step of all schedulers.
        if grad_scaler.optimizer_update_skipped is not None and grad_scaler.optimizer_update_skipped is True:
-            schedulers = self.trainer.lr_schedulers
+            scheduler_cfgs = self.trainer.lr_scheduler_configs

-            if not schedulers or not self.trainer.lightning_module.automatic_optimization:
+            if not scheduler_cfgs or not self.trainer.lightning_module.automatic_optimization:
                return

-            for scheduler in schedulers:
+            for scheduler_cfg in scheduler_cfgs:
                # Decrement the counter by 2, then perform a scheduler.step() to perform a no-up
                # as well as update the optimizer lr in all param groups
-                scheduler['scheduler'].last_epoch -= 2
-                scheduler['scheduler'].step()
+                scheduler_cfg.scheduler.last_epoch -= 2
+                scheduler_cfg.scheduler.step()

        # Removing the line below because it messes up train_valid_test_num_samples calculation.
        # self.trainer.fit_loop.max_steps = self.trainer.fit_loop.max_steps + 1
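
Note: this hunk (and the identical one in the next file) tracks a PL 1.8 rename. Trainer.lr_schedulers, which returned a list of dicts keyed by 'scheduler', is gone; Trainer.lr_scheduler_configs returns LRSchedulerConfig objects whose scheduler is an attribute. A minimal sketch of the new access pattern; the bare Trainer here is illustrative, and the list stays empty until a model with schedulers has been set up:

from pytorch_lightning import Trainer

trainer = Trainer(accelerator="cpu", devices=1)  # illustrative
# PL < 1.8: trainer.lr_schedulers        -> [{"scheduler": sched, ...}, ...]
# PL 1.8+:  trainer.lr_scheduler_configs -> [LRSchedulerConfig(scheduler=sched, ...), ...]
for scheduler_cfg in trainer.lr_scheduler_configs:
    scheduler_cfg.scheduler.step()
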
@@ -309,16 +309,16 @@ def on_train_batch_end(self, outputs, batch, batch_idx: int, unused: Optional[in
        # If the grad scaler skipped its optimizer step due to infs/nans,
        # decrement the step of all schedulers.
        if grad_scaler.optimizer_update_skipped is not None and grad_scaler.optimizer_update_skipped is True:
-            schedulers = self.trainer.lr_schedulers
+            scheduler_cfgs = self.trainer.lr_scheduler_configs

-            if not schedulers or not self.trainer.lightning_module.automatic_optimization:
+            if not scheduler_cfgs or not self.trainer.lightning_module.automatic_optimization:
                return

-            for scheduler in schedulers:
+            for scheduler_cfg in scheduler_cfgs:
                # Decrement the counter by 2, then perform a scheduler.step() to perform a no-up
                # as well as update the optimizer lr in all param groups
-                scheduler['scheduler'].last_epoch -= 2
-                scheduler['scheduler'].step()
+                scheduler_cfg.scheduler.last_epoch -= 2
+                scheduler_cfg.scheduler.step()

        # Increase the max step count by 1

4 changes: 2 additions & 2 deletions in nemo/collections/nlp/parts/nlp_overrides.py
@@ -22,16 +22,16 @@

import pytorch_lightning as pl
import torch
+from lightning_lite.plugins import ClusterEnvironment
+from lightning_lite.utilities.types import _PATH
from omegaconf import OmegaConf
from pytorch_lightning.overrides import LightningDistributedModule
-from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO
from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin
from pytorch_lightning.strategies.ddp import DDPStrategy
from pytorch_lightning.trainer.trainer import Trainer
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.fetching import DataFetcher
-from pytorch_lightning.utilities.types import _PATH
from torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks import noop_hook
from torch.nn.parallel import DistributedDataParallel

11 changes: 5 additions & 6 deletions in nemo/collections/tts/models/fastpitch.py
@@ -19,7 +19,7 @@
from hydra.utils import instantiate
from omegaconf import DictConfig, OmegaConf, open_dict
from pytorch_lightning import Trainer
-from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger
+from pytorch_lightning.loggers import TensorBoardLogger

from nemo.collections.common.parts.preprocessing import parsers
from nemo.collections.tts.helpers.helpers import plot_alignment_to_numpy, plot_spectrogram_to_numpy, process_batch
@@ -219,11 +219,10 @@ def tb_logger(self):
            if self.logger is None and self.logger.experiment is None:
                return None
            tb_logger = self.logger.experiment
-            if isinstance(self.logger, LoggerCollection):
-                for logger in self.logger:
-                    if isinstance(logger, TensorBoardLogger):
-                        tb_logger = logger.experiment
-                        break
+            for logger in self.trainer.loggers:
+                if isinstance(logger, TensorBoardLogger):
+                    tb_logger = logger.experiment
+                    break
            self._tb_logger = tb_logger
        return self._tb_logger
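
Note: PL 1.8 removed LoggerCollection, and Trainer.loggers is now always a plain (possibly empty) list, so the TTS models switch to scanning it for a TensorBoardLogger instead of type-checking self.logger. The same lookup as a standalone helper; the helper name is made up for illustration:

from typing import Optional

from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger


def find_tb_experiment(trainer: Trainer) -> Optional[object]:
    """Return the SummaryWriter of the first TensorBoardLogger, if any."""
    for logger in trainer.loggers:  # plain list in PL 1.8; no LoggerCollection handling needed
        if isinstance(logger, TensorBoardLogger):
            return logger.experiment
    return None
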

11 changes: 5 additions & 6 deletions in nemo/collections/tts/models/radtts.py
@@ -21,7 +21,7 @@
from hydra.utils import instantiate
from omegaconf import DictConfig, OmegaConf
from pytorch_lightning import Trainer
-from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger
+from pytorch_lightning.loggers import TensorBoardLogger

from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import BaseTokenizer
from nemo.collections.tts.helpers.helpers import plot_alignment_to_numpy
@@ -388,11 +388,10 @@ def tb_logger(self):
            if self.logger is None and self.logger.experiment is None:
                return None
            tb_logger = self.logger.experiment
-            if isinstance(self.logger, LoggerCollection):
-                for logger in self.logger:
-                    if isinstance(logger, TensorBoardLogger):
-                        tb_logger = logger.experiment
-                        break
+            for logger in self.trainer.loggers:
+                if isinstance(logger, TensorBoardLogger):
+                    tb_logger = logger.experiment
+                    break
            self._tb_logger = tb_logger
        return self._tb_logger

11 changes: 5 additions & 6 deletions in nemo/collections/tts/models/tacotron2.py
@@ -20,7 +20,7 @@
from hydra.utils import instantiate
from omegaconf import MISSING, DictConfig, OmegaConf, open_dict
from omegaconf.errors import ConfigAttributeError
-from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger, WandbLogger
+from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
from torch import nn

from nemo.collections.common.parts.preprocessing import parsers
@@ -284,11 +284,10 @@ def validation_step(self, batch, batch_idx):
    def validation_epoch_end(self, outputs):
        if self.logger is not None and self.logger.experiment is not None:
            logger = self.logger.experiment
-            if isinstance(self.logger, LoggerCollection):
-                for logger in self.logger:
-                    if isinstance(logger, TensorBoardLogger):
-                        logger = logger.experiment
-                        break
+            for logger in self.trainer.loggers:
+                if isinstance(logger, TensorBoardLogger):
+                    logger = logger.experiment
+                    break
            if isinstance(logger, TensorBoardLogger):
                tacotron2_log_to_tb_func(
                    logger, outputs[0].values(), self.global_step, tag="val", log_images=True, add_audio=False,