
Commit

Address review comments.
Signed-off-by: Josh Romero <joshr@nvidia.com>
romerojosh committed Nov 19, 2020
1 parent 5d24fff commit 9b3ce49
Showing 6 changed files with 18 additions and 45 deletions.
31 changes: 0 additions & 31 deletions horovod/common/group_table.cc
@@ -40,27 +40,6 @@ bool GroupTable::empty(void) const {
return tensor_name_to_id_.empty();
}

int32_t GroupTable::RegisterGroup(const std::vector<std::string>& tensor_names) {
std::lock_guard<std::mutex> guard(mutex_);

int32_t group_id;
if (!free_ids_.empty()) {
// Reuse old group_id
group_id = free_ids_.front();
free_ids_.pop();
} else {
// Create a new group_id
group_id = next_group_id_++;
}

for (auto& name : tensor_names) {
tensor_name_to_id_[name] = group_id;
}
id_to_tensor_names_[group_id] = tensor_names;

return group_id;
}

int32_t GroupTable::RegisterGroup(std::vector<std::string>&& tensor_names) {
std::lock_guard<std::mutex> guard(mutex_);

@@ -82,16 +61,6 @@ int32_t GroupTable::RegisterGroup(std::vector<std::string>&& tensor_names) {
return group_id;
}

void GroupTable::DeregisterGroup(int32_t group_id) {
std::lock_guard<std::mutex> guard(mutex_);
for (auto& entry : id_to_tensor_names_[group_id]) {
tensor_name_to_id_.erase(entry);
}
id_to_tensor_names_.erase(group_id);

free_ids_.push(group_id);
}

void GroupTable::DeregisterGroups(const std::vector<std::string>& tensor_names) {
std::lock_guard<std::mutex> guard(mutex_);

2 changes: 0 additions & 2 deletions horovod/common/group_table.h
@@ -36,9 +36,7 @@ class GroupTable {
const std::vector<std::string>& GetGroupTensorNames(int32_t group_id) const;
bool empty(void) const;

int32_t RegisterGroup(const std::vector<std::string>& tensor_names);
int32_t RegisterGroup(std::vector<std::string>&& tensor_names);
void DeregisterGroup(int32_t group_id);
void DeregisterGroups(const std::vector<std::string>& tensor_names);

private:
7 changes: 7 additions & 0 deletions horovod/common/util.py
@@ -244,3 +244,10 @@ def num_rank_is_power_2(num_rank):
TODO support non-power of 2 ranks.
"""
return num_rank != 0 and ((num_rank & (num_rank -1)) == 0)

def split_list(l, n):
"""
Splits list l into n approximately even-sized chunks.
"""
d, r = divmod(len(l), n)
return [l[i * d + min(i, r):(i + 1) * d + min(i + 1, r)] for i in range(n)]
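
Illustration (not part of the diff): split_list distributes the remainder r one element at a time to the first r chunks, so chunk sizes differ by at most one. A self-contained check, with the helper copied from the hunk above:

def split_list(l, n):
    """
    Splits list l into n approximately even-sized chunks.
    """
    d, r = divmod(len(l), n)
    return [l[i * d + min(i, r):(i + 1) * d + min(i + 1, r)] for i in range(n)]

# 10 elements over 3 chunks: divmod(10, 3) gives d = 3, r = 1,
# so the first chunk receives one extra element.
assert split_list(list(range(10)), 3) == [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]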
12 changes: 5 additions & 7 deletions horovod/mxnet/__init__.py
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================

from horovod.common.util import check_extension
from horovod.common.util import check_extension, split_list

check_extension('horovod.mxnet', 'HOROVOD_WITH_MXNET',
__file__, 'mpi_lib')
@@ -60,9 +60,8 @@ def _do_allreduce(self, index, grad):

if isinstance(index, (tuple, list)):
if (self._num_groups > 0):
d, r = divmod(len(grad), self._num_groups)
grad_split = [grad[n * d + min(n, r):(n + 1) * d + min(n + 1, r)] for n in range(self._num_groups)]
index_split = [index[n * d + min(n, r):(n + 1) * d + min(n + 1, r)] for n in range(self._num_groups)]
grad_split = split_list(grad, self._num_groups)
index_split = split_list(index, self._num_groups)

for i, (grads, indices) in enumerate(zip(grad_split, index_split)):
grouped_allreduce_(tensors=grads, average=False, name="{}:{}".format(indices[0], indices[-1]), priority=-i,
@@ -157,9 +156,8 @@ def _allreduce_grads(self):
grads.append(param.list_grad()[0])
names.append(self._prefix + str(i))

d, r = divmod(len(grads), self._num_groups)
grads_split = [grads[n * d + min(n, r):(n + 1) * d + min(n + 1, r)] for n in range(self._num_groups)]
names_split = [names[n * d + min(n, r):(n + 1) * d + min(n + 1, r)] for n in range(self._num_groups)]
grads_split = split_list(grads, self._num_groups)
names_split = split_list(names, self._num_groups)

for i, (group_grads, group_names) in enumerate(zip(grads_split, names_split)):
# For better performance, enqueue groups in separate grouped_allreduce calls by dtype.
5 changes: 2 additions & 3 deletions horovod/tensorflow/__init__.py
@@ -19,7 +19,7 @@
import os
import warnings

from horovod.common.util import check_extension, gpu_available
from horovod.common.util import check_extension, gpu_available, split_list

check_extension('horovod.tensorflow', 'HOROVOD_WITH_TENSORFLOW', __file__, 'mpi_lib')

@@ -352,8 +352,7 @@ def allreduce_grads(grads):

if num_groups > 0:
grads_clean = [grad for grad in grads if grad is not None]
d, r = divmod(len(grads_clean), num_groups)
grads_split = [grads_clean[n * d + min(n, r):(n + 1) * d + min(n + 1, r)] for n in range(num_groups)]
grads_split = split_list(grads_clean, num_groups)

reduce_ops = []
for group in grads_split:
6 changes: 4 additions & 2 deletions horovod/torch/optimizer.py
@@ -21,6 +21,8 @@

import torch

from horovod.common.util import split_list

from horovod.torch.compression import Compression
from horovod.torch.functions import broadcast_object
from horovod.torch.mpi_ops import allreduce_async_, grouped_allreduce_async_
@@ -122,8 +124,8 @@ def _register_hooks(self):
p_list = sorted(p_list, key=lambda p : p_list_names.index(self._parameter_names.get(p)))

# Form groups
d, r = divmod(len(p_list), self._num_groups)
p_groups = [tuple(p_list[n * d + min(n, r):(n + 1) * d + min(n + 1, r)]) for n in range(self._num_groups)]
p_groups = split_list(p_list, self._num_groups)
p_groups = [tuple(p) for p in p_groups]
for group in p_groups:
for p in group:
self._p_to_group[p] = group
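
Illustration (not part of the diff): each group returned by split_list is converted to a tuple because tuples, unlike lists, are hashable, which lets every parameter be mapped to its containing group as in the _p_to_group assignment above. A minimal sketch with strings standing in for parameters (assumes a Horovod build that includes this commit, so split_list is importable):

from horovod.common.util import split_list

params = ["w0", "w1", "w2", "w3", "w4"]
p_groups = [tuple(g) for g in split_list(params, 2)]  # [("w0", "w1", "w2"), ("w3", "w4")]
p_to_group = {p: group for group in p_groups for p in group}
assert p_to_group["w3"] == ("w3", "w4")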
