From 83035ba014a83ac0a9bb6dc5a84382f94f396223 Mon Sep 17 00:00:00 2001
From: Vignesh Kothapalli
Date: Tue, 16 Aug 2022 11:38:47 -0700
Subject: [PATCH] extend uint8 and int8 allreduce tests to xla and process sets

Signed-off-by: Vignesh Kothapalli
---
 test/parallel/test_tensorflow_process_sets.py | 60 +++++++++++--------
 test/parallel/test_xla.py                     | 36 ++++++-----
 test/parallel/test_xla_process_sets.py        | 16 ++---
 3 files changed, 66 insertions(+), 46 deletions(-)

diff --git a/test/parallel/test_tensorflow_process_sets.py b/test/parallel/test_tensorflow_process_sets.py
index 423e904677..74084296ad 100644
--- a/test/parallel/test_tensorflow_process_sets.py
+++ b/test/parallel/test_tensorflow_process_sets.py
@@ -88,24 +88,28 @@ def test_horovod_allreduce_cpu_process_sets(self):
         even_set = hvd.add_process_set(even_ranks)
         odd_set = hvd.add_process_set(odd_ranks)

-        dtypes = self.filter_supported_types([tf.int32, tf.int64, tf.float16, tf.float32, tf.float64])
+        dtypes = self.filter_supported_types([tf.uint8, tf.int8, tf.int32, tf.int64, tf.float16, tf.float32, tf.float64])
         dims = [1, 2, 3]
         for dtype, dim in itertools.product(dtypes, dims):
             with tf.device("/cpu:0"):
-                even_rank_tensor = self.random_uniform([17] * dim, -100, 100, dtype=dtype)
-                odd_rank_tensor = self.random_uniform([17] * dim, -100, 100, dtype=dtype)
+                even_rank_tensor = self.random_uniform([17] * dim, -100, 100)
+                even_rank_tensor = tf.cast(even_rank_tensor, dtype=dtype)
+                odd_rank_tensor = self.random_uniform([17] * dim, -100, 100)
+                odd_rank_tensor = tf.cast(odd_rank_tensor, dtype=dtype)
                 if rank in even_ranks:
                     summed = hvd.allreduce(even_rank_tensor, average=False, process_set=even_set)
                     multiplied = even_rank_tensor * len(even_ranks)
                 if rank in odd_ranks:
                     summed = hvd.allreduce(odd_rank_tensor, average=False, process_set=odd_set)
                     multiplied = odd_rank_tensor * len(odd_ranks)
-                max_difference = tf.reduce_max(tf.abs(summed - multiplied))
+                difference = summed - multiplied
+                difference = tf.cast(difference, tf.int32) if dtype == tf.uint8 else difference
+                max_difference = tf.reduce_max(tf.abs(difference))

             # Threshold for floating point equality depends on number of
             # ranks, since we're comparing against precise multiplication.
             max_process_set_size = max(len(even_ranks), len(odd_ranks))
-            if max_process_set_size <= 3 or dtype in [tf.int32, tf.int64]:
+            if max_process_set_size <= 3 or dtype in [tf.uint8, tf.int8, tf.int32, tf.int64]:
                 threshold = 0
             elif max_process_set_size < 10:
                 threshold = 1e-4
@@ -141,24 +145,28 @@ def test_horovod_allreduce_gpu_process_sets(self):
         even_set = hvd.add_process_set(even_ranks)
         odd_set = hvd.add_process_set(odd_ranks)

-        dtypes = [tf.int32, tf.int64, tf.float16, tf.float32, tf.float64]
+        dtypes = [tf.uint8, tf.int8, tf.int32, tf.int64, tf.float16, tf.float32, tf.float64]
         dims = [1, 2, 3]
         for dtype, dim in itertools.product(dtypes, dims):
             with tf.device("/gpu:%d" % local_rank):
-                even_rank_tensor = self.random_uniform([17] * dim, -100, 100, dtype=dtype)
-                odd_rank_tensor = self.random_uniform([17] * dim, -100, 100, dtype=dtype)
+                even_rank_tensor = self.random_uniform([17] * dim, -100, 100)
+                even_rank_tensor = tf.cast(even_rank_tensor, dtype=dtype)
+                odd_rank_tensor = self.random_uniform([17] * dim, -100, 100)
+                odd_rank_tensor = tf.cast(odd_rank_tensor, dtype=dtype)
                 if rank in even_ranks:
                     summed = hvd.allreduce(even_rank_tensor, average=False, process_set=even_set)
                     multiplied = even_rank_tensor * len(even_ranks)
                 if rank in odd_ranks:
                     summed = hvd.allreduce(odd_rank_tensor, average=False, process_set=odd_set)
                     multiplied = odd_rank_tensor * len(odd_ranks)
-                max_difference = tf.reduce_max(tf.abs(summed - multiplied))
+                difference = summed - multiplied
+                difference = tf.cast(difference, tf.int32) if dtype == tf.uint8 else difference
+                max_difference = tf.reduce_max(tf.abs(difference))

             # Threshold for floating point equality depends on number of
             # ranks, since we're comparing against precise multiplication.
             max_process_set_size = max(len(even_ranks), len(odd_ranks))
-            if max_process_set_size <= 3 or dtype in [tf.int32, tf.int64]:
+            if max_process_set_size <= 3 or dtype in [tf.uint8, tf.int8, tf.int32, tf.int64]:
                 threshold = 0
             elif max_process_set_size < 10:
                 threshold = 1e-4
@@ -285,26 +293,28 @@ def test_horovod_grouped_allreduce_cpu_process_sets(self):
         even_set = hvd.add_process_set(even_ranks)
         odd_set = hvd.add_process_set(odd_ranks)

-        dtypes = self.filter_supported_types([tf.int32, tf.int64, tf.float16, tf.float32, tf.float64])
+        dtypes = self.filter_supported_types([tf.uint8, tf.int8, tf.int32, tf.int64, tf.float16, tf.float32, tf.float64])
         dims = [1, 2, 3]
         for dtype, dim in itertools.product(dtypes, dims):
             with tf.device("/cpu:0"):
-                even_rank_tensors = [self.random_uniform(
-                    [17] * dim, -100, 100, dtype=dtype) for _ in range(5)]
-                odd_rank_tensors = [self.random_uniform(
-                    [17] * dim, -100, 100, dtype=dtype) for _ in range(5)]
+                even_rank_tensors = [tf.cast(self.random_uniform(
+                    [17] * dim, -100, 100), dtype=dtype) for _ in range(5)]
+                odd_rank_tensors = [tf.cast(self.random_uniform(
+                    [17] * dim, -100, 100), dtype=dtype) for _ in range(5)]
                 if rank in even_ranks:
                     summed = hvd.grouped_allreduce(even_rank_tensors, average=False, process_set=even_set)
                     multiplied = [tensor * len(even_ranks) for tensor in even_rank_tensors]
                 elif rank in odd_ranks:
                     summed = hvd.grouped_allreduce(odd_rank_tensors, average=False, process_set=odd_set)
                     multiplied = [tensor * len(odd_ranks) for tensor in odd_rank_tensors]
-                max_difference = tf.reduce_max([tf.reduce_max(tf.abs(t1 - t2)) for t1, t2 in zip(summed, multiplied)])
+                differences = [t1 - t2 for t1, t2 in zip(summed, multiplied)]
+                differences = [tf.cast(diff, tf.int32) if dtype == tf.uint8 else diff for diff in differences]
+                max_difference = tf.reduce_max([tf.reduce_max(tf.abs(diff)) for diff in differences])

             # Threshold for floating point equality depends on number of
             # ranks, since we're comparing against precise multiplication.
             max_process_set_size = max(len(even_ranks), len(odd_ranks))
-            if max_process_set_size <= 3 or dtype in [tf.int32, tf.int64]:
+            if max_process_set_size <= 3 or dtype in [tf.uint8, tf.int8, tf.int32, tf.int64]:
                 threshold = 0
             elif max_process_set_size < 10:
                 threshold = 1e-4
@@ -337,26 +347,28 @@ def test_horovod_grouped_allreduce_gpu_process_sets(self):
         even_set = hvd.add_process_set(even_ranks)
         odd_set = hvd.add_process_set(odd_ranks)

-        dtypes = self.filter_supported_types([tf.int32, tf.int64, tf.float16, tf.float32, tf.float64])
+        dtypes = self.filter_supported_types([tf.uint8, tf.int8, tf.int32, tf.int64, tf.float16, tf.float32, tf.float64])
         dims = [1, 2, 3]
         for dtype, dim in itertools.product(dtypes, dims):
             with tf.device("/gpu:%d" % local_rank):
-                even_rank_tensors = [self.random_uniform(
-                    [17] * dim, -100, 100, dtype=dtype) for _ in range(5)]
-                odd_rank_tensors = [self.random_uniform(
-                    [17] * dim, -100, 100, dtype=dtype) for _ in range(5)]
+                even_rank_tensors = [tf.cast(self.random_uniform(
+                    [17] * dim, -100, 100), dtype=dtype) for _ in range(5)]
+                odd_rank_tensors = [tf.cast(self.random_uniform(
+                    [17] * dim, -100, 100), dtype=dtype) for _ in range(5)]
                 if rank in even_ranks:
                     summed = hvd.grouped_allreduce(even_rank_tensors, average=False, process_set=even_set)
                     multiplied = [tensor * len(even_ranks) for tensor in even_rank_tensors]
                 elif rank in odd_ranks:
                     summed = hvd.grouped_allreduce(odd_rank_tensors, average=False, process_set=odd_set)
                     multiplied = [tensor * len(odd_ranks) for tensor in odd_rank_tensors]
-                max_difference = tf.reduce_max([tf.reduce_max(tf.abs(t1 - t2)) for t1, t2 in zip(summed, multiplied)])
+                differences = [t1 - t2 for t1, t2 in zip(summed, multiplied)]
+                differences = [tf.cast(diff, tf.int32) if dtype == tf.uint8 else diff for diff in differences]
+                max_difference = tf.reduce_max([tf.reduce_max(tf.abs(diff)) for diff in differences])

             # Threshold for floating point equality depends on number of
             # ranks, since we're comparing against precise multiplication.
             max_process_set_size = max(len(even_ranks), len(odd_ranks))
-            if max_process_set_size <= 3 or dtype in [tf.int32, tf.int64]:
+            if max_process_set_size <= 3 or dtype in [tf.uint8, tf.int8, tf.int32, tf.int64]:
                 threshold = 0
             elif max_process_set_size < 10:
                 threshold = 1e-4
diff --git a/test/parallel/test_xla.py b/test/parallel/test_xla.py
index 8581e02052..011e32869c 100644
--- a/test/parallel/test_xla.py
+++ b/test/parallel/test_xla.py
@@ -111,14 +111,16 @@ def test_horovod_allreduce_gpu(self):
         size = hvd.size()

         def hvd_allreduce_test(self, dtype, dim):
-            tensor = self.random_uniform(
-                [17] * dim, -100, 100, dtype=dtype)
+            tensor = self.random_uniform([17] * dim, -100, 100)
+            tensor = tf.cast(tensor, dtype=dtype)
             summed = hvd.allreduce(tensor, average=False)
             multiplied = tensor * size
-            max_difference = tf.reduce_max(tf.abs(summed - multiplied))
+            difference = summed - multiplied
+            difference = tf.cast(difference, tf.int32) if dtype == tf.uint8 else difference
+            max_difference = tf.reduce_max(tf.abs(difference))
             return max_difference

-        dtypes = [tf.int32, tf.int64, tf.float32, tf.float16, tf.float64]
+        dtypes = [tf.uint8, tf.int8, tf.int32, tf.int64, tf.float32, tf.float16, tf.float64]
         dims = [1, 2, 3]
         for dtype, dim in itertools.product(dtypes, dims):
             with tf.device("/gpu:%d" % local_rank):
@@ -127,7 +129,7 @@ def hvd_allreduce_test(self, dtype, dim):

             # Threshold for floating point equality depends on number of
             # ranks, since we're comparing against precise multiplication.
-            if size <= 3 or dtype in [tf.int32, tf.int64]:
+            if size <= 3 or dtype in [tf.uint8, tf.int8, tf.int32, tf.int64]:
                 threshold = 0
             elif size < 10:
                 threshold = 1e-4
@@ -161,8 +163,8 @@ def test_horovod_allreduce_gpu_prescale(self):
         def hvd_allreduce_test(self, dtype, dim):
             np.random.seed(1234)
             factor = np.random.uniform()
-            tensor = self.random_uniform(
-                [17] * dim, -100, 100, dtype=dtype)
+            tensor = self.random_uniform([17] * dim, -100, 100)
+            tensor = tf.cast(tensor, dtype=dtype)
             summed = hvd.allreduce(tensor, average=False,
                                    prescale_factor=factor)

@@ -172,12 +174,14 @@ def hvd_allreduce_test(self, dtype, dim):
             factor = tf.convert_to_tensor(
                 factor, tf.float64 if dtype in int_types else dtype)
             multiplied = tf.cast(factor * tensor, dtype) * size
-            max_difference = tf.reduce_max(tf.abs(summed - multiplied))
+            difference = summed - multiplied
+            difference = tf.cast(difference, tf.int32) if dtype == tf.uint8 else difference
+            max_difference = tf.reduce_max(tf.abs(difference))
             return max_difference

         dtypes = self.filter_supported_types(
-            [tf.int32, tf.int64, tf.float16, tf.float32])
-        int_types = [tf.int32, tf.int64]
+            [tf.uint8, tf.int8, tf.int32, tf.int64, tf.float16, tf.float32])
+        int_types = [tf.uint8, tf.int8, tf.int32, tf.int64]
         dims = [1, 2, 3]
         for dtype, dim in itertools.product(dtypes, dims):
             with tf.device("/gpu:%s" % local_rank):
@@ -217,8 +221,8 @@ def test_horovod_allreduce_gpu_postscale(self):
         def hvd_allreduce_test(self, dtype, dim):
             np.random.seed(1234)
             factor = np.random.uniform()
-            tensor = self.random_uniform(
-                [17] * dim, -100, 100, dtype=dtype)
+            tensor = self.random_uniform([17] * dim, -100, 100)
+            tensor = tf.cast(tensor, dtype=dtype)
             summed = hvd.allreduce(tensor, average=False,
                                    postscale_factor=factor)

@@ -229,13 +233,15 @@ def hvd_allreduce_test(self, dtype, dim):
             factor = tf.convert_to_tensor(
                 factor, tf.float64 if dtype in int_types else dtype)
             multiplied = tf.cast(factor * multiplied, dtype)
-            max_difference = tf.reduce_max(tf.abs(summed - multiplied))
+            difference = summed - multiplied
+            difference = tf.cast(difference, tf.int32) if dtype == tf.uint8 else difference
+            max_difference = tf.reduce_max(tf.abs(difference))
             return max_difference

         local_rank = hvd.local_rank()
         dtypes = self.filter_supported_types(
-            [tf.int32, tf.int64, tf.float16, tf.float32])
-        int_types = [tf.int32, tf.int64]
+            [tf.uint8, tf.int8, tf.int32, tf.int64, tf.float16, tf.float32])
+        int_types = [tf.uint8, tf.int8, tf.int32, tf.int64]
         dims = [1, 2, 3]
         for dtype, dim in itertools.product(dtypes, dims):
             with tf.device("/gpu:%s" % local_rank):
diff --git a/test/parallel/test_xla_process_sets.py b/test/parallel/test_xla_process_sets.py
index 71261804db..f6b1db12ba 100644
--- a/test/parallel/test_xla_process_sets.py
+++ b/test/parallel/test_xla_process_sets.py
@@ -86,10 +86,10 @@ def test_horovod_allreduce_gpu_process_sets(self):
         odd_set = hvd.add_process_set(odd_ranks)

         def allreduce_gpu_process_set(self, dtype, dim):
-            even_rank_tensor = self.random_uniform(
-                [17] * dim, -100, 100, dtype=dtype)
-            odd_rank_tensor = self.random_uniform(
-                [17] * dim, -100, 100, dtype=dtype)
+            even_rank_tensor = self.random_uniform([17] * dim, -100, 100)
+            even_rank_tensor = tf.cast(even_rank_tensor, dtype=dtype)
+            odd_rank_tensor = self.random_uniform([17] * dim, -100, 100)
+            odd_rank_tensor = tf.cast(odd_rank_tensor, dtype=dtype)
             if rank in even_ranks:
                 summed = hvd.allreduce(
                     even_rank_tensor,
@@ -100,10 +100,12 @@ def allreduce_gpu_process_set(self, dtype, dim):
                 summed = hvd.allreduce(
                     odd_rank_tensor, average=False, process_set=odd_set)
                 multiplied = odd_rank_tensor * len(odd_ranks)
-            max_difference = tf.reduce_max(tf.abs(summed - multiplied))
+            difference = summed - multiplied
+            difference = tf.cast(difference, tf.int32) if dtype == tf.uint8 else difference
+            max_difference = tf.reduce_max(tf.abs(difference))
             return max_difference

-        dtypes = [tf.int32, tf.int64, tf.float16, tf.float32, tf.float64]
+        dtypes = [tf.uint8, tf.int8, tf.int32, tf.int64, tf.float16, tf.float32, tf.float64]
         dims = [1, 2, 3]
         for dtype, dim in itertools.product(dtypes, dims):
             with tf.device("/gpu:%d" % local_rank):
@@ -113,7 +115,7 @@ def allreduce_gpu_process_set(self, dtype, dim):
             # Threshold for floating point equality depends on number of
             # ranks, since we're comparing against precise multiplication.
             max_process_set_size = max(len(even_ranks), len(odd_ranks))
-            if max_process_set_size <= 3 or dtype in [tf.int32, tf.int64]:
+            if max_process_set_size <= 3 or dtype in [tf.uint8, tf.int8, tf.int32, tf.int64]:
                 threshold = 0
             elif max_process_set_size < 10:
                 threshold = 1e-4
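
A note on the uint8-specific cast used throughout this patch: TensorFlow's
uint8 arithmetic wraps modulo 256 and tf.abs has no kernel for unsigned
integer types, so the tests widen the uint8 difference to int32 before
taking tf.reduce_max(tf.abs(...)). The "multiplied" reference wraps the same
way as the uint8 allreduce sum, so an exact zero difference is still the
expected result. Below is a minimal standalone sketch of that check, not
part of the test suite: plain TensorFlow with no Horovod, where a
hypothetical two-rank allreduce is simulated by adding the tensor to itself
and tf.random.uniform stands in for the suite's self.random_uniform helper.

    import tensorflow as tf

    # Stand-in for a 2-rank allreduce of identical tensors: summing the
    # same tensor twice equals multiplying it by 2, even under uint8
    # wraparound, since both sides wrap identically modulo 256.
    tensor = tf.cast(tf.random.uniform([17], 0, 255), tf.uint8)
    summed = tensor + tensor   # what the collective sum would return
    multiplied = tensor * 2    # locally computed reference value

    # tf.abs(summed - multiplied) would fail here, because tf.abs is not
    # registered for uint8. Widening the difference to int32 first (as the
    # patch does only for tf.uint8) keeps the comparison well defined:
    difference = tf.cast(summed - multiplied, tf.int32)
    max_difference = tf.reduce_max(tf.abs(difference))

    # Integer dtypes reduce exactly, hence the threshold of 0 in the tests
    # (assumes TF2 eager execution for the int() conversion).
    assert int(max_difference) == 0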