extend uint8 and int8 allreduce tests to xla and process sets
Signed-off-by: Vignesh Kothapalli <k.vignesh1420@gmail.com>
kvignesh1420 committed Aug 16, 2022
1 parent b7d450f commit 83035ba
Showing 3 changed files with 66 additions and 46 deletions.
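
The recurring pattern across all three files is the same: test tensors are now generated as floats and then cast to the target dtype (the suite's random-uniform helper presumably cannot emit uint8/int8 directly), the difference against the expected result is widened to tf.int32 for tf.uint8 inputs before tf.abs, and the two new integer dtypes join the list that is checked with an exact threshold of 0. A minimal sketch of that comparison pattern, not taken from the commit itself; plain tf.random.uniform stands in for the suite's self.random_uniform helper, and a fake world size of 2 stands in for the allreduce:

```python
import tensorflow as tf

def max_abs_difference(summed, multiplied, dtype):
    # Mirror of the pattern added in this commit: uint8 subtraction wraps
    # around and tf.abs does not accept unsigned tensors, so widen the
    # difference to int32 before taking the absolute value.
    difference = summed - multiplied
    if dtype == tf.uint8:
        difference = tf.cast(difference, tf.int32)
    return tf.reduce_max(tf.abs(difference))

for dtype in [tf.uint8, tf.int8, tf.int32, tf.float32]:
    # Generate float values first and cast, since tf.random.uniform does not
    # support uint8/int8 output dtypes.
    tensor = tf.cast(tf.random.uniform([17], -100, 100), dtype=dtype)
    summed = tensor * 2      # stands in for hvd.allreduce(tensor, average=False) on 2 ranks
    multiplied = tensor * 2  # the reference value the tests compare against
    assert int(max_abs_difference(summed, multiplied, dtype)) == 0
```

The int32 cast does not recover the true magnitude of a wrapped uint8 difference; it only makes tf.abs applicable, which is enough here because the integer dtypes are compared against a threshold of 0.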
60 changes: 36 additions & 24 deletions test/parallel/test_tensorflow_process_sets.py
@@ -88,24 +88,28 @@ def test_horovod_allreduce_cpu_process_sets(self):
         even_set = hvd.add_process_set(even_ranks)
         odd_set = hvd.add_process_set(odd_ranks)

-        dtypes = self.filter_supported_types([tf.int32, tf.int64, tf.float16, tf.float32, tf.float64])
+        dtypes = self.filter_supported_types([tf.uint8, tf.int8, tf.int32, tf.int64, tf.float16, tf.float32, tf.float64])
         dims = [1, 2, 3]
         for dtype, dim in itertools.product(dtypes, dims):
             with tf.device("/cpu:0"):
-                even_rank_tensor = self.random_uniform([17] * dim, -100, 100, dtype=dtype)
-                odd_rank_tensor = self.random_uniform([17] * dim, -100, 100, dtype=dtype)
+                even_rank_tensor = self.random_uniform([17] * dim, -100, 100)
+                even_rank_tensor = tf.cast(even_rank_tensor, dtype=dtype)
+                odd_rank_tensor = self.random_uniform([17] * dim, -100, 100)
+                odd_rank_tensor = tf.cast(odd_rank_tensor, dtype=dtype)
                 if rank in even_ranks:
                     summed = hvd.allreduce(even_rank_tensor, average=False, process_set=even_set)
                     multiplied = even_rank_tensor * len(even_ranks)
                 if rank in odd_ranks:
                     summed = hvd.allreduce(odd_rank_tensor, average=False, process_set=odd_set)
                     multiplied = odd_rank_tensor * len(odd_ranks)
-                max_difference = tf.reduce_max(tf.abs(summed - multiplied))
+                difference = summed - multiplied
+                difference = tf.cast(difference, tf.int32) if dtype == tf.uint8 else difference
+                max_difference = tf.reduce_max(tf.abs(difference))

             # Threshold for floating point equality depends on number of
             # ranks, since we're comparing against precise multiplication.
             max_process_set_size = max(len(even_ranks), len(odd_ranks))
-            if max_process_set_size <= 3 or dtype in [tf.int32, tf.int64]:
+            if max_process_set_size <= 3 or dtype in [tf.uint8, tf.int8, tf.int32, tf.int64]:
                 threshold = 0
             elif max_process_set_size < 10:
                 threshold = 1e-4
@@ -141,24 +145,28 @@ def test_horovod_allreduce_gpu_process_sets(self):
         even_set = hvd.add_process_set(even_ranks)
         odd_set = hvd.add_process_set(odd_ranks)

-        dtypes = [tf.int32, tf.int64, tf.float16, tf.float32, tf.float64]
+        dtypes = [tf.uint8, tf.int8, tf.int32, tf.int64, tf.float16, tf.float32, tf.float64]
         dims = [1, 2, 3]
         for dtype, dim in itertools.product(dtypes, dims):
             with tf.device("/gpu:%d" % local_rank):
-                even_rank_tensor = self.random_uniform([17] * dim, -100, 100, dtype=dtype)
-                odd_rank_tensor = self.random_uniform([17] * dim, -100, 100, dtype=dtype)
+                even_rank_tensor = self.random_uniform([17] * dim, -100, 100)
+                even_rank_tensor = tf.cast(even_rank_tensor, dtype=dtype)
+                odd_rank_tensor = self.random_uniform([17] * dim, -100, 100)
+                odd_rank_tensor = tf.cast(odd_rank_tensor, dtype=dtype)
                 if rank in even_ranks:
                     summed = hvd.allreduce(even_rank_tensor, average=False, process_set=even_set)
                     multiplied = even_rank_tensor * len(even_ranks)
                 if rank in odd_ranks:
                     summed = hvd.allreduce(odd_rank_tensor, average=False, process_set=odd_set)
                     multiplied = odd_rank_tensor * len(odd_ranks)
-                max_difference = tf.reduce_max(tf.abs(summed - multiplied))
+                difference = summed - multiplied
+                difference = tf.cast(difference, tf.int32) if dtype == tf.uint8 else difference
+                max_difference = tf.reduce_max(tf.abs(difference))

             # Threshold for floating point equality depends on number of
             # ranks, since we're comparing against precise multiplication.
             max_process_set_size = max(len(even_ranks), len(odd_ranks))
-            if max_process_set_size <= 3 or dtype in [tf.int32, tf.int64]:
+            if max_process_set_size <= 3 or dtype in [tf.uint8, tf.int8, tf.int32, tf.int64]:
                 threshold = 0
             elif max_process_set_size < 10:
                 threshold = 1e-4
@@ -285,26 +293,28 @@ def test_horovod_grouped_allreduce_cpu_process_sets(self):
         even_set = hvd.add_process_set(even_ranks)
         odd_set = hvd.add_process_set(odd_ranks)

-        dtypes = self.filter_supported_types([tf.int32, tf.int64, tf.float16, tf.float32, tf.float64])
+        dtypes = self.filter_supported_types([tf.uint8, tf.int8, tf.int32, tf.int64, tf.float16, tf.float32, tf.float64])
         dims = [1, 2, 3]
         for dtype, dim in itertools.product(dtypes, dims):
             with tf.device("/cpu:0"):
-                even_rank_tensors = [self.random_uniform(
-                    [17] * dim, -100, 100, dtype=dtype) for _ in range(5)]
-                odd_rank_tensors = [self.random_uniform(
-                    [17] * dim, -100, 100, dtype=dtype) for _ in range(5)]
+                even_rank_tensors = [tf.cast(self.random_uniform(
+                    [17] * dim, -100, 100), dtype=dtype) for _ in range(5)]
+                odd_rank_tensors = [tf.cast(self.random_uniform(
+                    [17] * dim, -100, 100), dtype=dtype) for _ in range(5)]
                 if rank in even_ranks:
                     summed = hvd.grouped_allreduce(even_rank_tensors, average=False, process_set=even_set)
                     multiplied = [tensor * len(even_ranks) for tensor in even_rank_tensors]
                 elif rank in odd_ranks:
                     summed = hvd.grouped_allreduce(odd_rank_tensors, average=False, process_set=odd_set)
                     multiplied = [tensor * len(odd_ranks) for tensor in odd_rank_tensors]
-                max_difference = tf.reduce_max([tf.reduce_max(tf.abs(t1 - t2)) for t1, t2 in zip(summed, multiplied)])
+                differences = [t1 - t2 for t1, t2 in zip(summed, multiplied)]
+                differences = [tf.cast(diff, tf.int32) if dtype == tf.uint8 else diff for diff in differences]
+                max_difference = tf.reduce_max([tf.reduce_max(tf.abs(diff)) for diff in differences])

             # Threshold for floating point equality depends on number of
             # ranks, since we're comparing against precise multiplication.
             max_process_set_size = max(len(even_ranks), len(odd_ranks))
-            if max_process_set_size <= 3 or dtype in [tf.int32, tf.int64]:
+            if max_process_set_size <= 3 or dtype in [tf.uint8, tf.int8, tf.int32, tf.int64]:
                 threshold = 0
             elif max_process_set_size < 10:
                 threshold = 1e-4
@@ -337,26 +347,28 @@ def test_horovod_grouped_allreduce_gpu_process_sets(self):
         even_set = hvd.add_process_set(even_ranks)
         odd_set = hvd.add_process_set(odd_ranks)

-        dtypes = self.filter_supported_types([tf.int32, tf.int64, tf.float16, tf.float32, tf.float64])
+        dtypes = self.filter_supported_types([tf.uint8, tf.int8, tf.int32, tf.int64, tf.float16, tf.float32, tf.float64])
         dims = [1, 2, 3]
         for dtype, dim in itertools.product(dtypes, dims):
             with tf.device("/gpu:%d" % local_rank):
-                even_rank_tensors = [self.random_uniform(
-                    [17] * dim, -100, 100, dtype=dtype) for _ in range(5)]
-                odd_rank_tensors = [self.random_uniform(
-                    [17] * dim, -100, 100, dtype=dtype) for _ in range(5)]
+                even_rank_tensors = [tf.cast(self.random_uniform(
+                    [17] * dim, -100, 100), dtype=dtype) for _ in range(5)]
+                odd_rank_tensors = [tf.cast(self.random_uniform(
+                    [17] * dim, -100, 100), dtype=dtype) for _ in range(5)]
                 if rank in even_ranks:
                     summed = hvd.grouped_allreduce(even_rank_tensors, average=False, process_set=even_set)
                     multiplied = [tensor * len(even_ranks) for tensor in even_rank_tensors]
                 elif rank in odd_ranks:
                     summed = hvd.grouped_allreduce(odd_rank_tensors, average=False, process_set=odd_set)
                     multiplied = [tensor * len(odd_ranks) for tensor in odd_rank_tensors]
-                max_difference = tf.reduce_max([tf.reduce_max(tf.abs(t1 - t2)) for t1, t2 in zip(summed, multiplied)])
+                differences = [t1 - t2 for t1, t2 in zip(summed, multiplied)]
+                differences = [tf.cast(diff, tf.int32) if dtype == tf.uint8 else diff for diff in differences]
+                max_difference = tf.reduce_max([tf.reduce_max(tf.abs(diff)) for diff in differences])

             # Threshold for floating point equality depends on number of
             # ranks, since we're comparing against precise multiplication.
             max_process_set_size = max(len(even_ranks), len(odd_ranks))
-            if max_process_set_size <= 3 or dtype in [tf.int32, tf.int64]:
+            if max_process_set_size <= 3 or dtype in [tf.uint8, tf.int8, tf.int32, tf.int64]:
                 threshold = 0
             elif max_process_set_size < 10:
                 threshold = 1e-4
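For context on the process-set variants above: the tests split the world into even and odd ranks, reduce within each set, and expect every rank's result to equal its input times the size of its own set. A rough standalone sketch of that setup, my own illustration rather than code from this repository; it assumes hvd.init() with at least two ranks and identical RNG seeding on every rank, which the seeded test helpers effectively provide:

```python
import horovod.tensorflow as hvd
import tensorflow as tf

hvd.init()
tf.random.set_seed(1234)   # every rank generates the same tensor
rank, size = hvd.rank(), hvd.size()

even_ranks = [r for r in range(size) if r % 2 == 0]
odd_ranks = [r for r in range(size) if r % 2 == 1]
even_set = hvd.add_process_set(even_ranks)   # process sets must be registered on all ranks
odd_set = hvd.add_process_set(odd_ranks)

tensor = tf.cast(tf.random.uniform([17], -100, 100), tf.uint8)
if rank in even_ranks:
    # Only the even ranks participate in this reduction.
    summed = hvd.allreduce(tensor, average=False, process_set=even_set)
    expected = tensor * len(even_ranks)
else:
    summed = hvd.allreduce(tensor, average=False, process_set=odd_set)
    expected = tensor * len(odd_ranks)

# uint8-safe comparison, as in the diff above; should print 0.
print(int(tf.reduce_max(tf.abs(tf.cast(summed - expected, tf.int32)))))
```
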
36 changes: 21 additions & 15 deletions test/parallel/test_xla.py
@@ -111,14 +111,16 @@ def test_horovod_allreduce_gpu(self):
         size = hvd.size()

         def hvd_allreduce_test(self, dtype, dim):
-            tensor = self.random_uniform(
-                [17] * dim, -100, 100, dtype=dtype)
+            tensor = self.random_uniform([17] * dim, -100, 100)
+            tensor = tf.cast(tensor, dtype=dtype)
             summed = hvd.allreduce(tensor, average=False)
             multiplied = tensor * size
-            max_difference = tf.reduce_max(tf.abs(summed - multiplied))
+            difference = summed - multiplied
+            difference = tf.cast(difference, tf.int32) if dtype == tf.uint8 else difference
+            max_difference = tf.reduce_max(tf.abs(difference))
             return max_difference

-        dtypes = [tf.int32, tf.int64, tf.float32, tf.float16, tf.float64]
+        dtypes = [tf.uint8, tf.int8, tf.int32, tf.int64, tf.float32, tf.float16, tf.float64]
         dims = [1, 2, 3]
         for dtype, dim in itertools.product(dtypes, dims):
             with tf.device("/gpu:%d" % local_rank):
@@ -127,7 +129,7 @@ def hvd_allreduce_test(self, dtype, dim):

             # Threshold for floating point equality depends on number of
             # ranks, since we're comparing against precise multiplication.
-            if size <= 3 or dtype in [tf.int32, tf.int64]:
+            if size <= 3 or dtype in [tf.uint8, tf.int8, tf.int32, tf.int64]:
                 threshold = 0
             elif size < 10:
                 threshold = 1e-4
@@ -161,8 +163,8 @@ def test_horovod_allreduce_gpu_prescale(self):
         def hvd_allreduce_test(self, dtype, dim):
             np.random.seed(1234)
             factor = np.random.uniform()
-            tensor = self.random_uniform(
-                [17] * dim, -100, 100, dtype=dtype)
+            tensor = self.random_uniform([17] * dim, -100, 100)
+            tensor = tf.cast(tensor, dtype=dtype)
             summed = hvd.allreduce(tensor, average=False,
                                    prescale_factor=factor)

@@ -172,12 +174,14 @@ def hvd_allreduce_test(self, dtype, dim):
             factor = tf.convert_to_tensor(
                 factor, tf.float64 if dtype in int_types else dtype)
             multiplied = tf.cast(factor * tensor, dtype) * size
-            max_difference = tf.reduce_max(tf.abs(summed - multiplied))
+            difference = summed - multiplied
+            difference = tf.cast(difference, tf.int32) if dtype == tf.uint8 else difference
+            max_difference = tf.reduce_max(tf.abs(difference))
             return max_difference

         dtypes = self.filter_supported_types(
-            [tf.int32, tf.int64, tf.float16, tf.float32])
-        int_types = [tf.int32, tf.int64]
+            [tf.uint8, tf.int8, tf.int32, tf.int64, tf.float16, tf.float32])
+        int_types = [tf.uint8, tf.int8, tf.int32, tf.int64]
         dims = [1, 2, 3]
         for dtype, dim in itertools.product(dtypes, dims):
             with tf.device("/gpu:%s" % local_rank):
@@ -217,8 +221,8 @@ def test_horovod_allreduce_gpu_postscale(self):
         def hvd_allreduce_test(self, dtype, dim):
             np.random.seed(1234)
             factor = np.random.uniform()
-            tensor = self.random_uniform(
-                [17] * dim, -100, 100, dtype=dtype)
+            tensor = self.random_uniform([17] * dim, -100, 100)
+            tensor = tf.cast(tensor, dtype=dtype)
             summed = hvd.allreduce(tensor, average=False,
                                    postscale_factor=factor)

@@ -229,13 +233,15 @@ def hvd_allreduce_test(self, dtype, dim):
             factor = tf.convert_to_tensor(
                 factor, tf.float64 if dtype in int_types else dtype)
             multiplied = tf.cast(factor * multiplied, dtype)
-            max_difference = tf.reduce_max(tf.abs(summed - multiplied))
+            difference = summed - multiplied
+            difference = tf.cast(difference, tf.int32) if dtype == tf.uint8 else difference
+            max_difference = tf.reduce_max(tf.abs(difference))
             return max_difference

         local_rank = hvd.local_rank()
         dtypes = self.filter_supported_types(
-            [tf.int32, tf.int64, tf.float16, tf.float32])
-        int_types = [tf.int32, tf.int64]
+            [tf.uint8, tf.int8, tf.int32, tf.int64, tf.float16, tf.float32])
+        int_types = [tf.uint8, tf.int8, tf.int32, tf.int64]
         dims = [1, 2, 3]
         for dtype, dim in itertools.product(dtypes, dims):
             with tf.device("/gpu:%s" % local_rank):
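The prescale/postscale hunks above build the reference value the way the fused allreduce is expected to compute it: for the integer dtypes the scale factor is applied in float64 and the product is cast back to the input dtype, before multiplying by the world size for prescale and after for postscale. A small illustration of the prescale reference, a sketch under those assumptions with a hard-coded size standing in for hvd.size():

```python
import numpy as np
import tensorflow as tf

size = 4                                      # stands in for hvd.size()
int_types = [tf.uint8, tf.int8, tf.int32, tf.int64]

np.random.seed(1234)
factor = np.random.uniform()                  # prescale factor passed to the allreduce

for dtype in [tf.int8, tf.float32]:
    tensor = tf.cast(tf.random.uniform([17], -100, 100), dtype=dtype)
    # Integer inputs: scale in float64, truncate back to the integer dtype,
    # then multiply by the number of ranks, mirroring the reference in the diff.
    factor_t = tf.convert_to_tensor(factor, tf.float64 if dtype in int_types else dtype)
    reference = tf.cast(factor_t * tf.cast(tensor, factor_t.dtype), dtype) * size
    print(dtype.name, reference.dtype.name)
```

The test then compares such a reference against hvd.allreduce(tensor, average=False, prescale_factor=factor), with a threshold of 0 for the integer dtypes.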
16 changes: 9 additions & 7 deletions test/parallel/test_xla_process_sets.py
@@ -86,10 +86,10 @@ def test_horovod_allreduce_gpu_process_sets(self):
         odd_set = hvd.add_process_set(odd_ranks)

         def allreduce_gpu_process_set(self, dtype, dim):
-            even_rank_tensor = self.random_uniform(
-                [17] * dim, -100, 100, dtype=dtype)
-            odd_rank_tensor = self.random_uniform(
-                [17] * dim, -100, 100, dtype=dtype)
+            even_rank_tensor = self.random_uniform([17] * dim, -100, 100)
+            even_rank_tensor = tf.cast(even_rank_tensor, dtype=dtype)
+            odd_rank_tensor = self.random_uniform([17] * dim, -100, 100)
+            odd_rank_tensor = tf.cast(odd_rank_tensor, dtype=dtype)
             if rank in even_ranks:
                 summed = hvd.allreduce(
                     even_rank_tensor,
@@ -100,10 +100,12 @@ def allreduce_gpu_process_set(self, dtype, dim):
                 summed = hvd.allreduce(
                     odd_rank_tensor, average=False, process_set=odd_set)
                 multiplied = odd_rank_tensor * len(odd_ranks)
-            max_difference = tf.reduce_max(tf.abs(summed - multiplied))
+            difference = summed - multiplied
+            difference = tf.cast(difference, tf.int32) if dtype == tf.uint8 else difference
+            max_difference = tf.reduce_max(tf.abs(difference))
             return max_difference

-        dtypes = [tf.int32, tf.int64, tf.float16, tf.float32, tf.float64]
+        dtypes = [tf.uint8, tf.int8, tf.int32, tf.int64, tf.float16, tf.float32, tf.float64]
         dims = [1, 2, 3]
         for dtype, dim in itertools.product(dtypes, dims):
             with tf.device("/gpu:%d" % local_rank):
@@ -113,7 +115,7 @@ def allreduce_gpu_process_set(self, dtype, dim):
             # Threshold for floating point equality depends on number of
             # ranks, since we're comparing against precise multiplication.
             max_process_set_size = max(len(even_ranks), len(odd_ranks))
-            if max_process_set_size <= 3 or dtype in [tf.int32, tf.int64]:
+            if max_process_set_size <= 3 or dtype in [tf.uint8, tf.int8, tf.int32, tf.int64]:
                 threshold = 0
             elif max_process_set_size < 10:
                 threshold = 1e-4
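These are multi-process tests, so they have to be launched under a Horovod job launcher rather than plain single-process pytest; one plausible local invocation (not part of this commit, adjust to your environment) would be:

```
horovodrun -np 2 pytest -v test/parallel/test_tensorflow_process_sets.py
```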
