
Commit 0be78d1

native amp (Lightning-AI#2373)
* native amp
* typo
* imports
* apex
1 parent f1c9693 commit 0be78d1

File tree

14 files changed: +70 -63 lines changed


.circleci/config.yml

Lines changed: 1 addition & 0 deletions
@@ -77,6 +77,7 @@ references:
   name: Testing Documentation
   command: |
     # Second run examples in docs
+    bash tests/install_AMP.sh
     sudo apt-get update && sudo apt-get install -y cmake
     sudo pip install -r requirements/docs.txt
     cd docs; make doctest; make coverage

pytorch_lightning/core/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -149,6 +149,7 @@
 Thus, if we wanted to add a validation loop you would add this to your
 :class:`~LightningModule`:

+>>> import pytorch_lightning as pl
 >>> class LitModel(pl.LightningModule):
 ...     def validation_step(self, batch, batch_idx):
 ...         x, y = batch
@@ -166,6 +167,7 @@
 Add test loop
 ^^^^^^^^^^^^^

+>>> import pytorch_lightning as pl
 >>> class LitModel(pl.LightningModule):
 ...     def test_step(self, batch, batch_idx):
 ...         x, y = batch
@@ -264,6 +266,7 @@ def training_step(self, batch, batch_idx):
 :class:`~LightningModule.prepare_data` method to
 allow for this:

+>>> import pytorch_lightning as pl
 >>> class LitModel(pl.LightningModule):
 ...     def prepare_data(self):
 ...         # download

pytorch_lightning/core/hooks.py

Lines changed: 2 additions & 2 deletions
@@ -4,7 +4,7 @@
 from torch import Tensor
 from torch.nn import Module
 from torch.optim.optimizer import Optimizer
-from pytorch_lightning.utilities import move_data_to_device
+from pytorch_lightning.utilities import move_data_to_device, NATIVE_AMP_AVALAIBLE


 try:
@@ -189,7 +189,7 @@ def backward(self, trainer, loss, optimizer, optimizer_idx):
         loss.backward()

     def amp_scale_loss(self, unscaled_loss, optimizer, optimizer_idx):
-        if self.trainer.use_native_amp:
+        if NATIVE_AMP_AVALAIBLE:
             scaled_loss = self.trainer.scaler.scale(unscaled_loss)

         else:
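
For context, the native branch of `amp_scale_loss` delegates to the `torch.cuda.amp.GradScaler` that the Trainer creates (see the `trainer.py` section below): the raw loss is multiplied by a running scale factor before `backward()` so small fp16 gradients do not underflow. A minimal sketch of just that step, assuming PyTorch >= 1.6; the function and variable names are placeholders, not part of this commit:

    import torch

    # minimal sketch of native-amp loss scaling (PyTorch >= 1.6); this mirrors
    # what amp_scale_loss hands back to the training loop before backward()
    scaler = torch.cuda.amp.GradScaler()

    def scale_and_backward(unscaled_loss):
        scaled_loss = scaler.scale(unscaled_loss)  # multiply by the running scale factor
        scaled_loss.backward()                     # gradients come out scaled as well
        return scaled_loss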

pytorch_lightning/core/lightning.py

Lines changed: 0 additions & 1 deletion
@@ -21,7 +21,6 @@
 from pytorch_lightning.core.saving import ModelIO, PRIMITIVE_TYPES, ALLOWED_CONFIG_TYPES
 from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin
 from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel
-from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities import rank_zero_warn
 from pytorch_lightning.utilities.parsing import AttributeDict, collect_init_args, get_init_args


pytorch_lightning/core/memory.py

Lines changed: 5 additions & 3 deletions
@@ -9,7 +9,8 @@
 import torch.nn as nn
 from torch.utils.hooks import RemovableHandle

-import pytorch_lightning as pl
+
+from pytorch_lightning.utilities import NATIVE_AMP_AVALAIBLE
 from pytorch_lightning.utilities.apply_func import apply_to_collection

 PARAMETER_NUM_UNITS = [" ", "K", "M", "B", "T"]
@@ -126,6 +127,7 @@ class ModelSummary(object):

     Example::

+    >>> import pytorch_lightning as pl
     >>> class LitModel(pl.LightningModule):
     ...
     ...     def __init__(self):
@@ -154,7 +156,7 @@ class ModelSummary(object):
     MODE_DEFAULT = MODE_TOP
     MODES = [MODE_FULL, MODE_TOP]

-    def __init__(self, model: "pl.LightningModule", mode: str = MODE_DEFAULT):
+    def __init__(self, model, mode: str = MODE_DEFAULT):
         self._model = model
         self._mode = mode
         self._layer_summary = self.summarize()
@@ -209,7 +211,7 @@ def _forward_example_input(self) -> None:
         input_ = apply_to_collection(input_, torch.Tensor, lambda x: x.type(model.dtype))

         if trainer is not None and trainer.use_amp:
-            if model.use_native_amp:
+            if NATIVE_AMP_AVALAIBLE:
                 model.forward = torch.cuda.amp.autocast()(model.forward)

         mode = model.training

pytorch_lightning/trainer/auto_mix_precision.py

Lines changed: 15 additions & 31 deletions
@@ -1,52 +1,36 @@
 from abc import ABC
-import torch

 from pytorch_lightning import _logger as log
-from pytorch_lightning.utilities import rank_zero_warn
-
-try:
-    from apex import amp
-except ImportError:
-    APEX_AVAILABLE = False
-else:
-    APEX_AVAILABLE = True
+from pytorch_lightning.utilities import rank_zero_warn, APEX_AVAILABLE, NATIVE_AMP_AVALAIBLE
+from pytorch_lightning.utilities.distributed import rank_zero_debug


 class TrainerAMPMixin(ABC):

     # this is just a summary on variables used in this abstract class,
     # the proper values/initialisation should be done in child class
     precision: int
-    use_native_amp: bool
-
-    def init_amp(self, use_amp):
-        if self.use_native_amp:
-            rank_zero_warn("`amp_level` has been deprecated since v0.7.4 (native amp does not require it)"
-                           " and this argument will be removed in v0.9.0", DeprecationWarning)

-        # Backward compatibility, TODO: remove in v0.9.0
-        if use_amp is not None:
-            rank_zero_warn("`use_amp` has been replaced by `precision` since v0.7.0"
-                           " and this argument will be removed in v0.9.0", DeprecationWarning)
-            self.precision = 16 if use_amp else 32
+    def init_amp(self):
+        if NATIVE_AMP_AVALAIBLE:
+            log.debug("`amp_level` has been deprecated since v0.7.4 (native amp does not require it)")

         assert self.precision in (16, 32), 'only 32 or 16 bit precision supported'

-        if use_amp and self.use_native_amp:
-            log.info('Using 16bit precision.')
+        if self.use_amp and NATIVE_AMP_AVALAIBLE:
+            log.info('Using native 16bit precision.')
             return

-        # TODO: remove all below for v0.9.0
-        if use_amp and not APEX_AVAILABLE:  # pragma: no-cover
-            raise ModuleNotFoundError("""
-            You set `use_amp=True` but do not have apex installed.
-            Install apex first using this guide and rerun with use_amp=True:
-            https://github.com/NVIDIA/apex#linux
-            this run will NOT use 16 bit precision
-            """)
+        # TODO: replace `use_amp` by `precision` all below for v0.9.0
+        if self.use_amp and not APEX_AVAILABLE:  # pragma: no-cover
+            raise ModuleNotFoundError(
+                "You set `use_amp=True` but do not have apex installed."
+                "Install apex first using this guide and rerun with use_amp=True:"
+                "https://github.com/NVIDIA/apex#linux his run will NOT use 16 bit precision"
+            )

         if self.use_amp:
-            log.info('Using 16bit precision.')
+            log.info('Using APEX 16bit precision.')

     @property
     def use_amp(self) -> bool:
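
The `APEX_AVAILABLE` and `NATIVE_AMP_AVALAIBLE` flags are now imported from `pytorch_lightning.utilities`, whose definition is not part of this diff. Judging by the try/except removed above and the `hasattr` check removed from `trainer.py` below, they are presumably derived roughly as in this sketch:

    import torch

    # sketch only: plausible definitions for the utilities-level flags used here
    try:
        from apex import amp  # noqa: F401
        APEX_AVAILABLE = True
    except ImportError:
        APEX_AVAILABLE = False

    # native amp ships with torch.cuda.amp (autocast + GradScaler) in PyTorch >= 1.6
    NATIVE_AMP_AVALAIBLE = hasattr(torch.cuda, "amp") and hasattr(torch.cuda.amp, "autocast")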

pytorch_lightning/trainer/distrib_data_parallel.py

Lines changed: 2 additions & 2 deletions
@@ -127,6 +127,7 @@ def train_fx(trial_hparams, cluster_manager, _):
 from pytorch_lightning import _logger as log
 from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.loggers import LightningLoggerBase
+from pytorch_lightning.utilities import NATIVE_AMP_AVALAIBLE
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.distributed import rank_zero_only, rank_zero_warn, rank_zero_info

@@ -177,7 +178,6 @@ class TrainerDDPMixin(ABC):
     amp_level: str
     use_tpu: bool
     default_root_dir: str
-    use_native_amp: bool
     progress_bar_callback: ...
     num_processes: int
     num_nodes: int
@@ -519,7 +519,7 @@ def ddp_train(self, process_idx, model, is_master=False, proc_offset=0):
         # AMP
         # run through amp wrapper before going to distributed DP
         # TODO: remove with dropping NVIDIA AMP support
-        if self.use_amp and not self.use_native_amp:
+        if self.use_amp and not NATIVE_AMP_AVALAIBLE:
             model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level)
             self.optimizers = optimizers
             self.reinit_scheduler_properties(self.optimizers, self.lr_schedulers)
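
The apex branch kept here calls the model's `configure_apex` hook, which this commit does not touch. Its default in `LightningModule` is essentially a thin wrapper around `apex.amp.initialize`; a hedged sketch of that shape (opt levels such as 'O1'/'O2' come from NVIDIA apex, not from this diff):

    # rough sketch of the configure_apex hook this code path expects; the real
    # default lives in LightningModule and is not shown in this commit
    def configure_apex(self, amp, model, optimizers, amp_level):
        model, optimizers = amp.initialize(model, optimizers, opt_level=amp_level)
        return model, optimizers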

pytorch_lightning/trainer/distrib_parts.py

Lines changed: 4 additions & 5 deletions
@@ -18,7 +18,7 @@
     LightningDistributedDataParallel,
     LightningDataParallel,
 )
-from pytorch_lightning.utilities import move_data_to_device
+from pytorch_lightning.utilities import move_data_to_device, NATIVE_AMP_AVALAIBLE
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.distributed import rank_zero_only

@@ -61,7 +61,6 @@ class TrainerDPMixin(ABC):
     tpu_local_core_rank: int
     tpu_global_core_rank: int
     use_tpu: bool
-    use_native_amp: bool
     data_parallel_device_ids: ...
     progress_bar_callback: ...
     tpu_id: Optional[int]
@@ -175,7 +174,7 @@ def single_gpu_train(self, model):
         self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

         # TODO: remove with dropping NVIDIA AMP support
-        if self.use_amp and not self.use_native_amp:
+        if self.use_amp and not NATIVE_AMP_AVALAIBLE:
             # An example
             model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level)
             self.optimizers = optimizers
@@ -236,14 +235,14 @@ def dp_train(self, model):

         # hack forward to do autocast for the user
         model_autocast_original_forward = model.forward
-        if self.use_amp and self.use_native_amp:
+        if self.use_amp and NATIVE_AMP_AVALAIBLE:
             # wrap the user's forward in autocast and give it back at the end
             model.forward = torch.cuda.amp.autocast()(model.forward)

         # TODO: remove with dropping NVIDIA AMP support
         # check for this bug (amp + dp + !01 doesn't work)
         # https://github.com/NVIDIA/apex/issues/227
-        if self.use_dp and self.use_amp and not self.use_native_amp:
+        if self.use_dp and self.use_amp and not NATIVE_AMP_AVALAIBLE:
             if self.amp_level == 'O2':
                 raise MisconfigurationException(
                     f'Amp level {self.amp_level} with DataParallel is not supported.'
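
The `torch.cuda.amp.autocast()(model.forward)` line in `dp_train` works because an `autocast` instance can also be used as a decorator. A self-contained sketch of the same trick; `TinyNet` is a made-up module for illustration only:

    import torch
    import torch.nn as nn

    class TinyNet(nn.Module):                       # stand-in for the user's LightningModule
        def __init__(self):
            super().__init__()
            self.layer = nn.Linear(8, 2)

        def forward(self, x):
            return self.layer(x)

    model = TinyNet()
    original_forward = model.forward                # keep a handle so it can be restored after fit
    model.forward = torch.cuda.amp.autocast()(model.forward)

    out = model(torch.randn(4, 8))                  # on CUDA this forward now runs in mixed precision
    model.forward = original_forward                # hand the original forward back, as the trainer does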

pytorch_lightning/trainer/evaluation_loop.py

Lines changed: 3 additions & 4 deletions
@@ -124,15 +124,14 @@

 from abc import ABC, abstractmethod
 from pprint import pprint
-from typing import Callable, Optional, List, Union
+from typing import Callable, List, Union

 import torch
 from torch.utils.data import DataLoader

 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel, LightningDataParallel
-from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities import rank_zero_warn
+from pytorch_lightning.utilities import rank_zero_warn, NATIVE_AMP_AVALAIBLE

 try:
     import torch_xla.distributed.parallel_loader as xla_pl
@@ -285,7 +284,7 @@ def _evaluate(
             # -----------------
             # RUN EVALUATION STEP
             # -----------------
-            if self.use_amp and self.use_native_amp:
+            if self.use_amp and NATIVE_AMP_AVALAIBLE:
                 with torch.cuda.amp.autocast():
                     output = self.evaluation_forward(model, batch, batch_idx, dataloader_idx, test_mode)
             else:
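
Note that evaluation only needs the `autocast` context; the `GradScaler` is irrelevant here because gradient underflow can only occur during `backward()`. A minimal sketch, with `model` and `batch` as placeholder names:

    import torch

    def evaluation_forward_amp(model, batch):
        # sketch: eval under native amp needs autocast but no GradScaler,
        # since no backward pass (and hence no gradient scaling) happens here
        with torch.no_grad():
            with torch.cuda.amp.autocast():
                return model(batch)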

pytorch_lightning/trainer/trainer.py

Lines changed: 9 additions & 4 deletions
@@ -13,7 +13,7 @@
 from pytorch_lightning.core.memory import ModelSummary
 from pytorch_lightning.loggers import LightningLoggerBase
 from pytorch_lightning.profiler import SimpleProfiler, PassThroughProfiler, BaseProfiler
-from pytorch_lightning.trainer.auto_mix_precision import TrainerAMPMixin
+from pytorch_lightning.trainer.auto_mix_precision import TrainerAMPMixin, NATIVE_AMP_AVALAIBLE
 from pytorch_lightning.trainer.callback_config import TrainerCallbackConfigMixin
 from pytorch_lightning.trainer.callback_hook import TrainerCallbackHookMixin
 from pytorch_lightning.trainer.data_loading import TrainerDataLoadingMixin
@@ -532,12 +532,17 @@ def __init__(
         # These are the only lines needed after v0.8.0
         # we wrap the user's forward with autocast and give it back at the end of fit
         self.autocast_original_forward = None
-        self.use_native_amp = hasattr(torch.cuda, "amp") and hasattr(torch.cuda.amp, "autocast")
         self.precision = precision
         self.scaler = None

+        # Backward compatibility, TODO: remove in v0.9.0
+        if use_amp is not None:
+            rank_zero_warn("Argument `use_amp` is now set by `precision` since v0.7.0"
+                           " and this method will be removed in v0.9.0", DeprecationWarning)
+            self.precision = 16 if use_amp else 32
+
         self.amp_level = amp_level
-        self.init_amp(use_amp)
+        self.init_amp()

         self.on_colab_kaggle = os.getenv('COLAB_GPU') or os.getenv('KAGGLE_URL_BASE')

@@ -1002,7 +1007,7 @@ def run_pretrain_routine(self, model: LightningModule):
         self.copy_trainer_model_properties(ref_model)

         # init amp. Must be done here instead of __init__ to allow ddp to work
-        if self.use_native_amp and self.precision == 16:
+        if NATIVE_AMP_AVALAIBLE and self.precision == 16:
             self.scaler = torch.cuda.amp.GradScaler()

         # log hyper-parameters
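
Taken together, the pieces this commit touches (the `GradScaler` created in `run_pretrain_routine`, the autocast-wrapped forward, and `amp_scale_loss`) add up to the standard native-amp recipe. A standalone sketch of that recipe in plain PyTorch; `model`, `optimizer`, and `dataloader` are placeholders, not Trainer internals:

    import torch
    import torch.nn.functional as F

    def train_native_amp(model, optimizer, dataloader, device="cuda"):
        # standalone sketch of the native-amp loop the Trainer now wires up internally
        scaler = torch.cuda.amp.GradScaler()
        model.to(device)
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            with torch.cuda.amp.autocast():      # forward + loss in mixed precision
                loss = F.cross_entropy(model(x), y)
            scaler.scale(loss).backward()        # scale the loss, as amp_scale_loss does
            scaler.step(optimizer)               # unscales grads; skips the step on inf/nan
            scaler.update()                      # adjust the scale factor for the next step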
