Add an option to pin to GPU for all estimators #3526

Merged: 6 commits, Apr 30, 2022
Changes from 2 commits
23 changes: 22 additions & 1 deletion horovod/spark/common/params.py
@@ -96,6 +96,14 @@ class EstimatorParams(Params):

label_shapes = Param(Params._dummy(), 'label_shapes', 'specifies the shape (or shapes) of the label column (or columns)')

inmemory_cache_all = Param(Params._dummy(), 'inmemory_cache_all',
'Cache the data in memory for training and validation.',
typeConverter=TypeConverters.toBoolean)

pin_gpu = Param(Params._dummy(), 'pin_gpu',
'Whether to pin the training process to the GPU. Defaults to True.',
typeConverter=TypeConverters.toBoolean)

def __init__(self):
super(EstimatorParams, self).__init__()

@@ -129,7 +137,9 @@ def __init__(self):
train_reader_num_workers=2,
val_reader_num_workers=2,
reader_pool_type='process',
label_shapes=None)
label_shapes=None,
inmemory_cache_all=False,
pin_gpu=True)

def _check_params(self, metadata):
model = self.getModel()
@@ -334,6 +344,17 @@ def setLabelShapes(self, value):
def getLabelShapes(self):
return self.getOrDefault(self.label_shapes)

def setInMemoryCacheAll(self, value):
return self._set(inmemory_cache_all=value)

def getInMemoryCacheAll(self):
return self.getOrDefault(self.inmemory_cache_all)

def setPinGpu(self, value):
self._set(pin_gpu=value)

def getPinGpu(self):
return self.getOrDefault(self.pin_gpu)

class ModelParams(HasOutputCols):
history = Param(Params._dummy(), 'history', 'history')
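A minimal usage sketch of the new param, assuming a compiled Keras model, a Horovod store, and a training DataFrame already exist (none of these are part of this PR); pin_gpu behaves like any other Spark ML param, so it can be set in the constructor or through the generated getter/setter pair:

from horovod.spark.keras import KerasEstimator

keras_estimator = KerasEstimator(
    num_proc=2,
    store=store,              # assumed: a Horovod Store on shared storage
    model=model,              # assumed: a compiled Keras model
    optimizer='adam',         # assumed optimizer for this sketch
    loss='mse',               # assumed loss for this sketch
    feature_cols=['features'],
    label_cols=['y'],
    batch_size=32,
    epochs=3,
    pin_gpu=False)            # opt out of GPU pinning

assert not keras_estimator.getPinGpu()
keras_estimator.setPinGpu(True)  # toggle it back on via the setter
assert keras_estimator.getPinGpu()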
14 changes: 3 additions & 11 deletions horovod/spark/keras/estimator.py
@@ -147,14 +147,12 @@ class KerasEstimator(HorovodEstimator, KerasEstimatorParamsReadable,
inmemory_cache_all: boolean value. Cache the data in memory for training and validation. Default: False.
backend_env: dict to add to the environment of the backend. Defaults to setting the java heap size to
2G min and max for libhdfs through petastorm
pin_gpu: Whether to pin the training process to the GPU. Defaults to True.
"""

custom_objects = Param(Params._dummy(), 'custom_objects', 'custom objects')
checkpoint_callback = Param(Params._dummy(), 'checkpoint_callback',
'model checkpointing callback')
inmemory_cache_all = Param(Params._dummy(), 'inmemory_cache_all',
'Cache the data in memory for training and validation.',
typeConverter=TypeConverters.toBoolean)
backend_env = Param(Params._dummy(), "backend_env",
"dict to add to the environment of the command run on the environment")

@@ -192,14 +190,14 @@ def __init__(self,
label_shapes=None,
checkpoint_callback=None,
inmemory_cache_all=False,
backend_env=None):
backend_env=None,
pin_gpu=True):

super(KerasEstimator, self).__init__()

self._setDefault(optimizer=None,
custom_objects={},
checkpoint_callback=None,
inmemory_cache_all=False,
backend_env={'LIBHDFS_OPTS': '-Xms2048m -Xmx2048m'})

kwargs = self._input_kwargs
@@ -235,12 +233,6 @@ def setCheckpointCallback(self, value):
def getCheckpointCallback(self):
return self.getOrDefault(self.checkpoint_callback)

def setInMemoryCacheAll(self, value):
return self._set(inmemory_cache_all=value)

def getInMemoryCacheAll(self):
return self.getOrDefault(self.inmemory_cache_all)

def setBackendEnv(self, value):
self._set(backend_env=value)

14 changes: 11 additions & 3 deletions horovod/spark/keras/remote.py
@@ -52,6 +52,7 @@ def RemoteTrainer(estimator, metadata, keras_utils, run_id, dataset_idx):
user_verbose = estimator.getVerbose()
checkpoint_callback = estimator.getCheckpointCallback()
inmemory_cache_all = estimator.getInMemoryCacheAll()
should_pin_gpu = estimator.getPinGpu()

# Data reader parameters
train_reader_worker_count = estimator.getTrainReaderNumWorker()
@@ -111,7 +112,16 @@ def train(serialized_model, train_rows, val_rows, avg_row_size):
hvd = get_horovod()
hvd.init()

pin_gpu(hvd, tf, k)
# Verbose mode 1 will print a progress bar
verbose = user_verbose if hvd.rank() == 0 else 0

if should_pin_gpu:
if verbose:
print(f"Pinning current process to the GPU.")
Tixxx marked this conversation as resolved.
Show resolved Hide resolved
pin_gpu(hvd, tf, k)
else:
if verbose:
print(f"Skip pinning current process to the GPU.")
Tixxx marked this conversation as resolved.
Show resolved Hide resolved

if random_seed is not None:
if LooseVersion(tf.__version__) < LooseVersion('2.0.0'):
@@ -137,8 +147,6 @@ def train(serialized_model, train_rows, val_rows, avg_row_size):
scaled_lr = k.backend.get_value(model.optimizer.lr) * hvd.size()
k.backend.set_value(model.optimizer.lr, scaled_lr)

# Verbose mode 1 will print a progress bar
verbose = user_verbose if hvd.rank() == 0 else 0

if verbose:
print(f"Shared lib path is pointing to: {_horovod.common.process_sets._basics.MPI_LIB_CTYPES}")
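For context, pin_gpu(hvd, tf, k) above is Horovod's existing helper; the general idea behind GPU pinning in TF2 is to make each worker see only the GPU that matches its Horovod local rank. A rough illustrative sketch of that pattern (not the helper's actual source):

import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    # Expose only the GPU matching this worker's local rank.
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')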
15 changes: 3 additions & 12 deletions horovod/spark/lightning/estimator.py
@@ -181,6 +181,7 @@ class TorchEstimator(HorovodEstimator, TorchEstimatorParamsWritable,
debug_data_loader: (Optional) Debugging flag for data loader.
train_async_data_loader_queue_size: (Optional) Size of train async data loader queue.
val_async_data_loader_queue_size: (Optional) Size of val async data loader queue.
pin_gpu: Whether to pin the training process to the GPU. Defaults to True.
"""

input_shapes = Param(Params._dummy(), 'input_shapes', 'input layer shapes')
@@ -189,10 +190,6 @@ class TorchEstimator(HorovodEstimator, TorchEstimatorParamsWritable,
train_minibatch_fn = Param(Params._dummy(), 'train_minibatch_fn',
'functions that construct the minibatch train function for torch')

inmemory_cache_all = Param(Params._dummy(), 'inmemory_cache_all',
'Cache the data in memory for training and validation.',
typeConverter=TypeConverters.toBoolean)

num_gpus = Param(Params._dummy(), 'num_gpus',
'Number of gpus per process, default to 1 when CUDA is available in the backend, otherwise 0.')

@@ -266,14 +263,14 @@ def __init__(self,
profiler=None,
debug_data_loader=False,
train_async_data_loader_queue_size=None,
val_async_data_loader_queue_size=None):
val_async_data_loader_queue_size=None,
pin_gpu=True):

super(TorchEstimator, self).__init__()
self._setDefault(loss_constructors=None,
input_shapes=None,
train_minibatch_fn=None,
transformation_fn=None,
inmemory_cache_all=False,
num_gpus=None,
logger=None,
log_every_n_steps=50,
@@ -315,12 +312,6 @@ def setLossConstructors(self, value):
def getLossConstructors(self):
return self.getOrDefault(self.loss_constructors)

def setInMemoryCacheAll(self, value):
return self._set(inmemory_cache_all=value)

def getInMemoryCacheAll(self):
return self.getOrDefault(self.inmemory_cache_all)

def setNumGPUs(self, value):
return self._set(num_gpus=value)

6 changes: 5 additions & 1 deletion horovod/spark/lightning/remote.py
@@ -64,6 +64,7 @@ def RemoteTrainer(estimator, metadata, ckpt_bytes, run_id, dataset_idx, train_ro
debug_data_loader = estimator.getDebugDataLoader()
train_async_data_loader_queue_size = estimator.getTrainAsyncDataLoaderQueueSize()
val_async_data_loader_queue_size = estimator.getValAsyncDataLoaderQueueSize()
should_pin_gpu = estimator.getPinGpu()

# get logger
logger = estimator.getLogger()
@@ -194,7 +195,10 @@ def on_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -
f"Val rows: {val_rows}, Val batch size: {val_batch_size}, Val_steps_per_epoch: {_val_steps_per_epoch}\n"
f"Checkpoint file: {remote_store.checkpoint_path}, Logs dir: {remote_store.logs_path}\n")

cuda_available = torch.cuda.is_available()
if not should_pin_gpu and verbose:
print("Skip pinning current process to the GPU.")
Collaborator: Why not use the logger? Why is there a verbose flag when there is a logger?

Collaborator (Author): The logger doesn't write to stdout and stderr properly in this function, since it runs in a Ray executor. The train_logger is for passing specialized loggers (ones that don't write directly to stdout and stderr) to PyTorch Lightning. I have tried some generic logger modules here; they either failed to serialize or produced no output.

cuda_available = torch.cuda.is_available() and should_pin_gpu
# We need to check that all ranks have the same device type for training.
# Horovod doesn't support heterogeneous allreduce for gradients.
cuda_avail_list = hvd.allgather_object(cuda_available, name='device type')
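The allgather at the end of this hunk supports the comment's constraint: gradients are allreduced across ranks, so every rank must train on the same device type. A sketch of the consistency check this sets up (assumed shape, inferred from the comment rather than shown in the diff):

cuda_available = torch.cuda.is_available() and should_pin_gpu
cuda_avail_list = hvd.allgather_object(cuda_available, name='device type')
if not all(flag == cuda_avail_list[0] for flag in cuda_avail_list):
    # Fail fast instead of hanging inside a heterogeneous allreduce.
    raise RuntimeError('All ranks must use the same device type for training.')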
18 changes: 5 additions & 13 deletions horovod/spark/torch/estimator.py
@@ -147,6 +147,8 @@ class TorchEstimator(HorovodEstimator, TorchEstimatorParamsWritable,
val_reader_num_workers: Similar to the train_reader_num_workers.
reader_pool_type: Type of worker pool used to parallelize reading data from the dataset.
Should be one of ['thread', 'process']. Defaults to 'process'.
inmemory_cache_all: (Optional) Cache the data in memory for training and validation.
pin_gpu: Whether to pin the training process to the GPU. Defaults to True.
"""

input_shapes = Param(Params._dummy(), 'input_shapes', 'input layer shapes')
@@ -155,10 +157,6 @@ class TorchEstimator(HorovodEstimator, TorchEstimatorParamsWritable,
train_minibatch_fn = Param(Params._dummy(), 'train_minibatch_fn',
'functions that construct the minibatch train function for torch')

inmemory_cache_all = Param(Params._dummy(), 'inmemory_cache_all',
'Cache the data in memory for training and validation.',
typeConverter=TypeConverters.toBoolean)

@keyword_only
def __init__(self,
num_proc=None,
@@ -193,14 +191,14 @@ def __init__(self,
val_reader_num_workers=None,
reader_pool_type=None,
label_shapes=None,
inmemory_cache_all=False):
inmemory_cache_all=False,
pin_gpu=True):

super(TorchEstimator, self).__init__()
self._setDefault(loss_constructors=None,
input_shapes=None,
train_minibatch_fn=None,
transformation_fn=None,
inmemory_cache_all=False)
transformation_fn=None)

kwargs = self._input_kwargs

@@ -227,12 +225,6 @@ def setLossConstructors(self, value):
def getLossConstructors(self):
return self.getOrDefault(self.loss_constructors)

def setInMemoryCacheAll(self, value):
return self._set(inmemory_cache_all=value)

def getInMemoryCacheAll(self):
return self.getOrDefault(self.inmemory_cache_all)

def _get_optimizer(self):
return self.getOrDefault(self.optimizer)

6 changes: 5 additions & 1 deletion horovod/spark/torch/remote.py
@@ -60,6 +60,7 @@ def RemoteTrainer(estimator, metadata, last_checkpoint_state, run_id, dataset_id
transformation_fn = estimator.getTransformationFn()
transformation = transformation_fn if transformation_fn else None
inmemory_cache_all = estimator.getInMemoryCacheAll()
should_pin_gpu = estimator.getPinGpu()

# If loss weight is not provided, use equal loss for all the labels
loss_weights = estimator.getLossWeights()
@@ -134,7 +135,10 @@ def train(serialized_model, optimizer_cls, model_opt_state_serialized,
raise ValueError("user_shuffle_buffer_size cannot be negative!")
shuffle_buffer_size = user_shuffle_buffer_size

cuda_available = torch.cuda.is_available()
if not should_pin_gpu and user_verbose:
print("Skip pinning current process to the GPU.")

cuda_available = torch.cuda.is_available() and should_pin_gpu
# We need to check that all ranks have the same device type for training.
# Horovod doesn't support heterogeneous allreduce for gradients.
cuda_avail_list = hvd.allgather_object(cuda_available, name='device type')
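On the PyTorch side the flag gates device selection: with should_pin_gpu False, cuda_available is forced to False and training stays on the CPU even on GPU hosts. A rough sketch of the device binding this implies (illustrative, not the exact code path; should_pin_gpu stands in for the flag read from the estimator above):

import torch
import horovod.torch as hvd

hvd.init()
if should_pin_gpu and torch.cuda.is_available():
    # Bind this worker to the GPU matching its Horovod local rank.
    torch.cuda.set_device(hvd.local_rank())
    device = torch.device('cuda')
else:
    device = torch.device('cpu')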
5 changes: 4 additions & 1 deletion test/integration/test_spark_keras.py
@@ -98,7 +98,10 @@ def test_fit_model(self):
batch_size=1,
random_seed=1,
epochs=3,
verbose=2)
verbose=2,
pin_gpu=False)

assert not keras_estimator.getPinGpu()

keras_model = keras_estimator.fit(df)
