Skip to content

Commit

Permalink
feat: support list-like syntax to round robin CUDA devices (#5187)
Browse files Browse the repository at this point in the history
  • Loading branch information
JoanFM committed Sep 21, 2022
1 parent c17e641 commit fd4c034
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 25 deletions.
11 changes: 11 additions & 0 deletions docs/fundamentals/flow/topologies.md
Expand Up @@ -149,6 +149,17 @@ You can also restrict the visible devices in round-robin assignment by `CUDA_VIS
| 0 | 4 |


You can also restrict the visible devices in round-robin assignment by passing a comma-separated list of device IDs, e.g. `CUDA_VISIBLE_DEVICES=RR1,3`. This will create the following assignment:

| GPU device | Replica ID |
|------------|------------|
| 1 | 0 |
| 3 | 1 |
| 1 | 2 |
| 3 | 3 |
| 1 | 4 |


(partition-data-by-using-shards)=
## Partition data with shards

Expand Down
35 changes: 23 additions & 12 deletions jina/orchestrate/deployments/__init__.py
Expand Up @@ -622,25 +622,32 @@ def is_ready(self) -> bool:
return is_ready

@staticmethod
def _parse_devices(value: str, num_devices: int):
    """Parse a device selection string into the selected device ids.

    Supported forms are a single index (``1``), a slice (``start:stop:step``),
    a comma-separated list of indices (``0,2,3``), or a comma-separated mix of
    indices and slices (``0:2,3``). Slices are resolved against
    ``range(num_devices)``; explicitly listed indices are returned as given.

    :param value: the selection string, optionally wrapped in a way matched by ``WRAPPED_SLICE_BASE``
    :param num_devices: total number of devices
    :return: a ``range`` when ``value`` is empty, a single index or a single slice; otherwise a list of device ids
    """
    all_devices = range(num_devices)
    if re.match(WRAPPED_SLICE_BASE, value):
        # strip the wrapping first and last characters
        value = value[1:-1]

    if not value:
        # `slice()` with no arguments raises TypeError, so an empty selection
        # is answered directly with every device.
        return all_devices

    def _as_slice(text: str) -> slice:
        fields = text.split(':')
        if len(fields) == 1:
            # a bare index `i` selects exactly device i, i.e. slice(i, i + 1)
            fields = [fields[0], str(int(fields[0]) + 1)]
        # else: slice(start, stop[, step]); empty fields mean "unbounded"
        return slice(*[int(f) if f else None for f in fields])

    tokens = value.split(',')
    if len(tokens) == 1:
        return all_devices[_as_slice(value)]

    selected = []
    for token in tokens:
        if ':' in token:
            # a slice inside a comma list expands to the devices it covers
            selected.extend(all_devices[_as_slice(token)])
        else:
            selected.append(int(token))
    return selected

@staticmethod
def _roundrobin_cuda_device(device_str: str, replicas: int):
Expand All @@ -650,6 +657,7 @@ def _roundrobin_cuda_device(device_str: str, replicas: int):
:param replicas: the number of replicas
:return: a map from replica id to device id
"""
print(f' device_str {device_str}')
if (
device_str
and isinstance(device_str, str)
Expand All @@ -665,11 +673,14 @@ def _roundrobin_cuda_device(device_str: str, replicas: int):
if num_devices == 0:
return

all_devices = list(range(num_devices))
selected_devices = []
if device_str[2:]:
all_devices = all_devices[Deployment._parse_slice(device_str[2:])]

_c = cycle(all_devices)
for device_num in Deployment._parse_devices(device_str[2:], num_devices):
selected_devices.append(device_num)
else:
selected_devices = range(num_devices)
print(f' selected devices {selected_devices}')
_c = cycle(selected_devices)
return {j: next(_c) for j in range(replicas)}

@staticmethod
Expand Down
38 changes: 25 additions & 13 deletions tests/unit/orchestrate/deployments/test_cuda_assignment.py
Expand Up @@ -5,21 +5,33 @@
from jina.orchestrate.deployments import Deployment


@pytest.fixture()
def cuda_total_devices(request):
    """Temporarily set the ``CUDA_TOTAL_DEVICES`` environment variable.

    The value is taken from ``request.param`` (the fixture is meant to be used
    with indirect parametrization). After the test the previous value is
    restored, or the variable is removed if it was not set beforehand.
    """
    old_cuda_total_devices = os.environ.get('CUDA_TOTAL_DEVICES')
    os.environ['CUDA_TOTAL_DEVICES'] = str(request.param)
    yield
    if old_cuda_total_devices is not None:
        os.environ['CUDA_TOTAL_DEVICES'] = old_cuda_total_devices
    else:
        # `os.unsetenv` does not update the `os.environ` mapping, so the value
        # would stay visible to later `os.environ.get` calls in this process.
        # Removing the mapping entry clears both the mapping and the process
        # environment.
        os.environ.pop('CUDA_TOTAL_DEVICES', None)


@pytest.mark.parametrize(
    'device_str, replicas, expected, cuda_total_devices',
    [
        # plain device strings do not trigger round-robin assignment
        ('1', 1, None, 3),
        ('1', 2, None, 3),
        ('1,2', 2, None, 3),
        # `RR` without a selection cycles over all devices
        ('RR', 2, {0: 0, 1: 1}, 3),
        ('RR', 5, {0: 0, 1: 1, 2: 2, 3: 0, 4: 1}, 3),
        # `RR` followed by a slice
        ('RR1:', 5, {0: 1, 1: 2, 2: 1, 3: 2, 4: 1}, 3),
        ('RR0:2', 5, {0: 0, 1: 1, 2: 0, 3: 1, 4: 0}, 3),
        ('RR1:2', 2, {0: 1, 1: 1}, 3),
        ('RR1:2', 1, {0: 1}, 3),
        # `RR` followed by an explicit comma-separated device list
        ('RR0,2,3', 3, {0: 0, 1: 2, 2: 3}, 4),
        ('RR0,2,3', 5, {0: 0, 1: 2, 2: 3, 3: 0, 4: 2}, 4),
    ],
    indirect=['cuda_total_devices'],
)
def test_cuda_assignment(device_str, replicas, expected, cuda_total_devices):
    """Check the replica-id to device-id map produced for each device string."""
    assignment = Deployment._roundrobin_cuda_device(device_str, replicas)
    assert assignment == expected

0 comments on commit fd4c034

Please sign in to comment.