Add the A2 machine family to Batch (#14542)

This PR adds the A2 machine family to Batch as job-private instance machines. These machines have much more powerful GPUs (NVIDIA A100) than the G2 family of machines (NVIDIA L4) currently in Batch.

---------

Co-authored-by: Sophie Parsa <parsa@wm9c6-e4e.broadinstitute.org>
hail-is · May 15, 2024 · 1edd50d · 1edd50d
1 parent 88e597f
commit 1edd50d
Show file tree

Hide file tree

Showing 4 changed files with 81 additions and 5 deletions.
diff --git a/batch/batch/cloud/gcp/driver/pricing.py b/batch/batch/cloud/gcp/driver/pricing.py
@@ -181,13 +181,19 @@ def instance_family_from_sku(sku: dict) -> Optional[str]:
         return 'n1'
     if sku['description'].startswith("G2 Instance") or sku['description'].startswith("Spot Preemptible G2 Instance"):
         return 'g2'
+    if sku['description'].startswith("A2 Instance") or sku['description'].startswith("Spot Preemptible A2 Instance"):
+        return 'a2'
     return None
 
 
 def accelerator_from_sku(sku) -> Optional[str]:
     description = sku['description']
     if description.startswith("Nvidia L4 GPU"):
         return 'l4'
+    if description.startswith("Nvidia Tesla A100 GPU"):
+        return 'a100-40gb'
+    if description.startswith("Nvidia Tesla A100 80GB GPU"):
+        return 'a100-80gb'
     return None
 
 

diff --git a/batch/batch/cloud/gcp/resource_utils.py b/batch/batch/cloud/gcp/resource_utils.py
@@ -249,16 +249,77 @@ def __init__(self, machine_family: str, worker_type: str, cores: int, memory: in
         machine_family='g2',
         worker_type='standard',
     ),
+    'a2-highgpu-1g': MachineTypeParts(
+        cores=12,
+        memory=gib_to_bytes(85),
+        gpu_config=GPUConfig(num_gpus=1, gpu_type='a100-40gb'),
+        machine_family='a2',
+        worker_type='highgpu',
+    ),
+    'a2-highgpu-2g': MachineTypeParts(
+        cores=24,
+        memory=gib_to_bytes(170),
+        gpu_config=GPUConfig(num_gpus=2, gpu_type='a100-40gb'),
+        machine_family='a2',
+        worker_type='highgpu',
+    ),
+    'a2-highgpu-4g': MachineTypeParts(
+        cores=48,
+        memory=gib_to_bytes(340),
+        gpu_config=GPUConfig(num_gpus=4, gpu_type='a100-40gb'),
+        machine_family='a2',
+        worker_type='highgpu',
+    ),
+    'a2-highgpu-8g': MachineTypeParts(
+        cores=96,
+        memory=gib_to_bytes(680),
+        gpu_config=GPUConfig(num_gpus=8, gpu_type='a100-40gb'),
+        machine_family='a2',
+        worker_type='highgpu',
+    ),
+    'a2-megagpu-16g': MachineTypeParts(
+        cores=96,
+        memory=gib_to_bytes(1360),
+        gpu_config=GPUConfig(num_gpus=16, gpu_type='a100-40gb'),
+        machine_family='a2',
+        worker_type='megagpu',
+    ),
+    'a2-ultragpu-1g': MachineTypeParts(
+        cores=12,
+        memory=gib_to_bytes(170),
+        gpu_config=GPUConfig(num_gpus=1, gpu_type='a100-80gb'),
+        machine_family='a2',
+        worker_type='ultragpu',
+    ),
+    'a2-ultragpu-2g': MachineTypeParts(
+        cores=24,
+        memory=gib_to_bytes(340),
+        gpu_config=GPUConfig(num_gpus=2, gpu_type='a100-80gb'),
+        machine_family='a2',
+        worker_type='ultragpu',
+    ),
+    'a2-ultragpu-4g': MachineTypeParts(
+        cores=48,
+        memory=gib_to_bytes(680),
+        gpu_config=GPUConfig(num_gpus=4, gpu_type='a100-80gb'),
+        machine_family='a2',
+        worker_type='ultragpu',
+    ),
+    'a2-ultragpu-8g': MachineTypeParts(
+        cores=96,
+        memory=gib_to_bytes(1360),
+        gpu_config=GPUConfig(num_gpus=8, gpu_type='a100-80gb'),
+        machine_family='a2',
+        worker_type='ultragpu',
+    ),
 }
 
-
-gcp_valid_cores_from_worker_type = {
+gcp_valid_cores_for_pool_worker_type = {
     'highcpu': [2, 4, 8, 16, 32, 64, 96],
     'standard': [1, 2, 4, 8, 16, 32, 64, 96],
     'highmem': [2, 4, 8, 16, 32, 64, 96],
 }
 
-
 gcp_valid_machine_types = list(MACHINE_TYPE_TO_PARTS.keys())
 
 gcp_memory_to_worker_type = {'lowmem': 'highcpu', 'standard': 'standard', 'highmem': 'highmem'}

diff --git a/batch/batch/cloud/resource_utils.py b/batch/batch/cloud/resource_utils.py
@@ -18,7 +18,7 @@
     gcp_machine_type_to_cores_and_memory_bytes,
     gcp_memory_to_worker_type,
     gcp_requested_to_actual_storage_bytes,
-    gcp_valid_cores_from_worker_type,
+    gcp_valid_cores_for_pool_worker_type,
     gcp_valid_machine_types,
 )
 
@@ -33,7 +33,7 @@ def possible_cores_from_worker_type(cloud: str, worker_type: str) -> List[int]:
     if cloud == 'azure':
         return azure_valid_cores_from_worker_type[worker_type]
     assert cloud == 'gcp'
-    return gcp_valid_cores_from_worker_type[worker_type]
+    return gcp_valid_cores_for_pool_worker_type[worker_type]
 
 
 def valid_machine_types(cloud: str) -> List[str]:

diff --git a/batch/test/test_utils.py b/batch/test/test_utils.py
@@ -49,6 +49,15 @@ def test_gcp_machine_memory_per_core_mib():
             assert int(machine_parts.memory / machine_parts.cores / 1024**2) == 924
         elif machine_parts.machine_family == 'g2' and machine_parts.worker_type == 'standard':
             assert int(machine_parts.memory / machine_parts.cores / 1024**2) == 4096
+        elif machine_parts.machine_family == 'a2' and machine_parts.worker_type == 'highgpu':
+            assert machine_parts.gpu_config
+            assert int(machine_parts.memory / machine_parts.gpu_config.num_gpus / 1024**3) == 85
+        elif machine_parts.machine_family == 'a2' and machine_parts.worker_type == 'megagpu':
+            assert machine_parts.gpu_config
+            assert int(machine_parts.memory / machine_parts.gpu_config.num_gpus / 1024**3) == 85
+        elif machine_parts.machine_family == 'a2' and machine_parts.worker_type == 'ultragpu':
+            assert machine_parts.gpu_config
+            assert int(machine_parts.memory / machine_parts.gpu_config.num_gpus / 1024**3) == 170
 
 
 def test_azure_machine_memory_per_core_mib():