From fd4c03476d7afd8cb5f311a48befadf380147462 Mon Sep 17 00:00:00 2001
From: Joan Fontanals
Date: Wed, 21 Sep 2022 16:45:23 +0200
Subject: [PATCH] feat: support list-like syntax to round robin CUDA devices (#5187)

---
Reviewer notes (this section is ignored by `git am`):
- Removed two leftover debug `print()` calls from `_roundrobin_cuda_device`.
- The `cuda_total_devices` fixture now restores the environment with
  `os.environ.pop()`; `os.unsetenv()` does not update `os.environ`, so the
  variable leaked into later tests.
- `_parse_devices` docstring now describes what the code accepts: a slice,
  a single id, or a comma-separated list (slices and lists cannot be mixed).
- Line breaks, indentation and blank context lines were reconstructed from
  the hunk line counts; verify with `git apply --check` before applying.

 docs/fundamentals/flow/topologies.md          | 11 ++++++
 jina/orchestrate/deployments/__init__.py      | 37 +++++++++++-------
 .../deployments/test_cuda_assignment.py       | 38 ++++++++++++-------
 3 files changed, 59 insertions(+), 27 deletions(-)

diff --git a/docs/fundamentals/flow/topologies.md b/docs/fundamentals/flow/topologies.md
index e4a5b3410216a..83b76d9410cd6 100644
--- a/docs/fundamentals/flow/topologies.md
+++ b/docs/fundamentals/flow/topologies.md
@@ -149,6 +149,17 @@ You can also restrict the visible devices in round-robin assignment by `CUDA_VIS
 | 0          | 4          |
 
 
+You can also restrict the visible devices in round-robin assignment by assigning a list of device IDs, e.g. `CUDA_VISIBLE_DEVICES=RR1,3`. This will create the following assignment:
+
+| GPU device | Replica ID |
+|------------|------------|
+| 1          | 0          |
+| 3          | 1          |
+| 1          | 2          |
+| 3          | 3          |
+| 1          | 4          |
+
+
 (partition-data-by-using-shards)=
 ## Partition data with shards
 
diff --git a/jina/orchestrate/deployments/__init__.py b/jina/orchestrate/deployments/__init__.py
index c8ef1cf2f9e01..c839c3e9fd5c6 100644
--- a/jina/orchestrate/deployments/__init__.py
+++ b/jina/orchestrate/deployments/__init__.py
@@ -622,25 +622,32 @@ def is_ready(self) -> bool:
         return is_ready
 
     @staticmethod
-    def _parse_slice(value: str):
-        """Parses a `slice()` from string, like `start:stop:step`.
+    def _parse_devices(value: str, num_devices: int):
+        """Parses device ids from a string: a slice `start:stop:step`, a single id, or a list `num1,num2,num3`.
 
-        :param value: a string like
-        :return: slice
+        :param value: the device selection, e.g. `1:3`, `2` or `0,2,3` (slices and lists cannot be mixed)
+        :param num_devices: total number of devices
+        :return: the selected device ids, as a range or a list of int
         """
+
+        all_devices = range(num_devices)
         if re.match(WRAPPED_SLICE_BASE, value):
             value = value[1:-1]
 
         if value:
-            parts = value.split(':')
+            parts = value.split(',')
             if len(parts) == 1:
-                # slice(stop)
-                parts = [parts[0], str(int(parts[0]) + 1)]
-            # else: slice(start, stop[, step])
+                parts = value.split(':')
+
+                if len(parts) == 1:
+                    # slice(stop)
+                    parts = [parts[0], str(int(parts[0]) + 1)]
+                # else: slice(start, stop[, step])
+            else:
+                return [int(p) for p in parts]
         else:
-            # slice()
             parts = []
-        return slice(*[int(p) if p else None for p in parts])
+        return all_devices[slice(*[int(p) if p else None for p in parts])]
 
     @staticmethod
     def _roundrobin_cuda_device(device_str: str, replicas: int):
@@ -665,11 +672,13 @@ def _roundrobin_cuda_device(device_str: str, replicas: int):
                 if num_devices == 0:
                     return
 
-            all_devices = list(range(num_devices))
+            selected_devices = []
             if device_str[2:]:
-                all_devices = all_devices[Deployment._parse_slice(device_str[2:])]
-
-            _c = cycle(all_devices)
+                for device_num in Deployment._parse_devices(device_str[2:], num_devices):
+                    selected_devices.append(device_num)
+            else:
+                selected_devices = range(num_devices)
+            _c = cycle(selected_devices)
             return {j: next(_c) for j in range(replicas)}
 
     @staticmethod
diff --git a/tests/unit/orchestrate/deployments/test_cuda_assignment.py b/tests/unit/orchestrate/deployments/test_cuda_assignment.py
index 106ae71b1a97e..af384f18aee77 100644
--- a/tests/unit/orchestrate/deployments/test_cuda_assignment.py
+++ b/tests/unit/orchestrate/deployments/test_cuda_assignment.py
@@ -5,21 +5,33 @@ from jina.orchestrate.deployments import Deployment
 
 
 
+@pytest.fixture()
+def cuda_total_devices(request):
+    old_cuda_total_devices = os.environ.get('CUDA_TOTAL_DEVICES', None)
+    os.environ['CUDA_TOTAL_DEVICES'] = str(request.param)
+    yield
+    if old_cuda_total_devices is not None:
+        os.environ['CUDA_TOTAL_DEVICES'] = old_cuda_total_devices
+    else:
+        os.environ.pop('CUDA_TOTAL_DEVICES', None)
+
+
 @pytest.mark.parametrize(
-    'device_str, replicas, expected',
+    'device_str, replicas, expected, cuda_total_devices',
     [
-        ['1', 1, None],  # wont trigger device RB
-        ['1', 2, None],  # wont trigger device RB
-        ['1,2', 2, None],  # wont trigger device RB
-        ['RR', 2, {0: 0, 1: 1}],
-        ['RR', 5, {0: 0, 1: 1, 2: 2, 3: 0, 4: 1}],
-        ['RR1:', 5, {0: 1, 1: 2, 2: 1, 3: 2, 4: 1}],
-        ['RR0:2', 5, {0: 0, 1: 1, 2: 0, 3: 1, 4: 0}],
-        ['RR1:2', 2, {0: 1, 1: 1}],
-        ['RR1:2', 1, {0: 1}],
-    ],
+        ['1', 1, None, 3],  # wont trigger device RB
+        ['1', 2, None, 3],  # wont trigger device RB
+        ['1,2', 2, None, 3],  # wont trigger device RB
+        ['RR', 2, {0: 0, 1: 1}, 3],
+        ['RR', 5, {0: 0, 1: 1, 2: 2, 3: 0, 4: 1}, 3],
+        ['RR1:', 5, {0: 1, 1: 2, 2: 1, 3: 2, 4: 1}, 3],
+        ['RR0:2', 5, {0: 0, 1: 1, 2: 0, 3: 1, 4: 0}, 3],
+        ['RR1:2', 2, {0: 1, 1: 1}, 3],
+        ['RR1:2', 1, {0: 1}, 3],
+        ['RR0,2,3', 3, {0: 0, 1: 2, 2: 3}, 4],
+        ['RR0,2,3', 5, {0: 0, 1: 2, 2: 3, 3: 0, 4: 2}, 4],
+    ], indirect=['cuda_total_devices']
 )
-def test_cuda_assignment(device_str, replicas, expected):
-    os.environ['CUDA_TOTAL_DEVICES'] = str(3)
+def test_cuda_assignment(device_str, replicas, expected, cuda_total_devices):
     actual = Deployment._roundrobin_cuda_device(device_str, replicas)
     assert actual == expected