Skip to content

Commit

Permalink
add a2 machine into batch (#14542)
Browse files Browse the repository at this point in the history
This PR adds the a2 machine family to Batch as job-private instance
machines. These machines have much more powerful GPUs than the g2 family
of machines currently available in Batch.

---------

Co-authored-by: Sophie Parsa <parsa@wm9c6-e4e.broadinstitute.org>
  • Loading branch information
sjparsa and Sophie Parsa committed May 15, 2024
1 parent 88e597f commit 1edd50d
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 5 deletions.
6 changes: 6 additions & 0 deletions batch/batch/cloud/gcp/driver/pricing.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,13 +181,19 @@ def instance_family_from_sku(sku: dict) -> Optional[str]:
return 'n1'
if sku['description'].startswith("G2 Instance") or sku['description'].startswith("Spot Preemptible G2 Instance"):
return 'g2'
if sku['description'].startswith("A2 Instance") or sku['description'].startswith("Spot Preemptible A2 Instance"):
return 'a2'
return None


def accelerator_from_sku(sku) -> Optional[str]:
    """Return the short accelerator name for a SKU, or None if unrecognized.

    The SKU's ``'description'`` string is matched against a fixed list of
    GPU description prefixes; the name paired with the first matching prefix
    is returned. A SKU whose description matches none of them yields None.
    """
    # (description prefix, accelerator name), checked in this order.
    prefix_to_accelerator = (
        ("Nvidia L4 GPU", 'l4'),
        ("Nvidia Tesla A100 GPU", 'a100-40gb'),
        ("Nvidia Tesla A100 80GB GPU", 'a100-80gb'),
    )
    text = sku['description']
    matches = (name for prefix, name in prefix_to_accelerator if text.startswith(prefix))
    return next(matches, None)


Expand Down
67 changes: 64 additions & 3 deletions batch/batch/cloud/gcp/resource_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,16 +249,77 @@ def __init__(self, machine_family: str, worker_type: str, cores: int, memory: in
machine_family='g2',
worker_type='standard',
),
'a2-highgpu-1g': MachineTypeParts(
cores=12,
memory=gib_to_bytes(85),
gpu_config=GPUConfig(num_gpus=1, gpu_type='a100-40gb'),
machine_family='a2',
worker_type='highgpu',
),
'a2-highgpu-2g': MachineTypeParts(
cores=24,
memory=gib_to_bytes(170),
gpu_config=GPUConfig(num_gpus=2, gpu_type='a100-40gb'),
machine_family='a2',
worker_type='highgpu',
),
'a2-highgpu-4g': MachineTypeParts(
cores=48,
memory=gib_to_bytes(340),
gpu_config=GPUConfig(num_gpus=4, gpu_type='a100-40gb'),
machine_family='a2',
worker_type='highgpu',
),
'a2-highgpu-8g': MachineTypeParts(
cores=96,
memory=gib_to_bytes(680),
gpu_config=GPUConfig(num_gpus=8, gpu_type='a100-40gb'),
machine_family='a2',
worker_type='highgpu',
),
'a2-megagpu-16g': MachineTypeParts(
cores=96,
memory=gib_to_bytes(1360),
gpu_config=GPUConfig(num_gpus=16, gpu_type='a100-40gb'),
machine_family='a2',
worker_type='megagpu',
),
'a2-ultragpu-1g': MachineTypeParts(
cores=12,
memory=gib_to_bytes(170),
gpu_config=GPUConfig(num_gpus=1, gpu_type='a100-80gb'),
machine_family='a2',
worker_type='ultragpu',
),
'a2-ultragpu-2g': MachineTypeParts(
cores=24,
memory=gib_to_bytes(340),
gpu_config=GPUConfig(num_gpus=2, gpu_type='a100-80gb'),
machine_family='a2',
worker_type='ultragpu',
),
'a2-ultragpu-4g': MachineTypeParts(
cores=48,
memory=gib_to_bytes(680),
gpu_config=GPUConfig(num_gpus=4, gpu_type='a100-80gb'),
machine_family='a2',
worker_type='ultragpu',
),
'a2-ultragpu-8g': MachineTypeParts(
cores=96,
memory=gib_to_bytes(1360),
gpu_config=GPUConfig(num_gpus=8, gpu_type='a100-80gb'),
machine_family='a2',
worker_type='ultragpu',
),
}


gcp_valid_cores_from_worker_type = {
# Valid core counts for each pool worker type, keyed by worker type name.
# Only 'highcpu', 'standard' and 'highmem' are listed here; the a2 worker
# types ('highgpu', 'megagpu', 'ultragpu') have no entry in this table.
gcp_valid_cores_for_pool_worker_type = {
'highcpu': [2, 4, 8, 16, 32, 64, 96],
'standard': [1, 2, 4, 8, 16, 32, 64, 96],
'highmem': [2, 4, 8, 16, 32, 64, 96],
}


gcp_valid_machine_types = list(MACHINE_TYPE_TO_PARTS.keys())

gcp_memory_to_worker_type = {'lowmem': 'highcpu', 'standard': 'standard', 'highmem': 'highmem'}
Expand Down
4 changes: 2 additions & 2 deletions batch/batch/cloud/resource_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
gcp_machine_type_to_cores_and_memory_bytes,
gcp_memory_to_worker_type,
gcp_requested_to_actual_storage_bytes,
gcp_valid_cores_from_worker_type,
gcp_valid_cores_for_pool_worker_type,
gcp_valid_machine_types,
)

Expand All @@ -33,7 +33,7 @@ def possible_cores_from_worker_type(cloud: str, worker_type: str) -> List[int]:
if cloud == 'azure':
return azure_valid_cores_from_worker_type[worker_type]
assert cloud == 'gcp'
return gcp_valid_cores_from_worker_type[worker_type]
return gcp_valid_cores_for_pool_worker_type[worker_type]


def valid_machine_types(cloud: str) -> List[str]:
Expand Down
9 changes: 9 additions & 0 deletions batch/test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,15 @@ def test_gcp_machine_memory_per_core_mib():
assert int(machine_parts.memory / machine_parts.cores / 1024**2) == 924
elif machine_parts.machine_family == 'g2' and machine_parts.worker_type == 'standard':
assert int(machine_parts.memory / machine_parts.cores / 1024**2) == 4096
elif machine_parts.machine_family == 'a2' and machine_parts.worker_type == 'highgpu':
assert machine_parts.gpu_config
assert int(machine_parts.memory / machine_parts.gpu_config.num_gpus / 1024**3) == 85
elif machine_parts.machine_family == 'a2' and machine_parts.worker_type == 'megagpu':
assert machine_parts.gpu_config
assert int(machine_parts.memory / machine_parts.gpu_config.num_gpus / 1024**3) == 85
elif machine_parts.machine_family == 'a2' and machine_parts.worker_type == 'ultragpu':
assert machine_parts.gpu_config
assert int(machine_parts.memory / machine_parts.gpu_config.num_gpus / 1024**3) == 170


def test_azure_machine_memory_per_core_mib():
Expand Down

0 comments on commit 1edd50d

Please sign in to comment.