In [1]:
import pycuda
import pycuda.driver as drv

# Report the software versions in play. The `=` specifier inside each f-string
# echoes the expression text next to its value, so the output is self-labelling.
print(f'{pycuda.VERSION_TEXT=}')
# Per the PyCUDA docs: the CUDA version PyCUDA was compiled against, as a tuple.
print(f'{drv.get_version()=}')
# Per the PyCUDA docs: the installed CUDA driver version, as a single integer.
print(f'{drv.get_driver_version()=}')

# Initialise the CUDA driver API. The PyCUDA docs require this before other
# driver calls, so it must run before the device queries in the cells below.
drv.init()

pycuda.VERSION_TEXT='2021.1'
drv.get_version()=(11, 5, 0)
drv.get_driver_version()=11050


In [2]:
# Enumerate the visible CUDA devices and print theoretical peak figures for each.
device_count = drv.Device.count()
print("%d device(s) found." % device_count)

# cuda cores per SM, keyed by compute capability "major.minor"
cores_per_sm = {
    '-1' : -1,  # "Graphics Device"
    '3.0': 192, # Kepler
    '3.2': 192, # Kepler
    '3.5': 192, # Kepler
    '3.7': 192, # Kepler
    '5.0': 128, # Maxwell
    '5.2': 128, # Maxwell
    '5.3': 128, # Maxwell
    '6.0': 64,  # Pascal
    '6.1': 128, # Pascal
    '6.2': 128, # Pascal
    '7.0': 64,  # Volta
    '7.2': 64,  # Xavier
    '7.5': 64,  # Turing
    '8.0': 64,  # Ampere
    '8.6': 128  # Ampere
}

# Max CUDA cores ops per cycle per core
# FP64, FP32, FP16, INT8
# NOTE(review): the 8.6 row repeats the 8.0 rates; 8.6 parts are believed to
# have much lower FP64 (and different FP16) throughput than 8.0 -- verify
# against NVIDIA's published throughput tables before trusting 8.6 figures.
fp_ops = {
    '7.0':  (1,    2, 8, 8),
    '7.5':  (1/16, 2, 8, 8),
    '8.0':  (1,    2, 8, 8),
    '8.6':  (1,    2, 8, 8)
}
# Max Tensor cores ops per cycle per SM
# FP64, FP32, FP16, INT8, INT4, INT1
# NOTE(review): the 8.6 row is also a copy of 8.0 -- confirm. The "FP32"
# column on Ampere presumably refers to the TF32 tensor-core format -- confirm.
tc_ops = {
    '7.0': (0,      0, 1024,    0,    0,     0),
    '7.5': (0,      0, 1024, 2048, 4096, 16384),
    '8.0': (128, 1024, 2048, 4096, 8192, 32768),
    '8.6': (128, 1024, 2048, 4096, 8192, 32768)
    # Ampere sparse (0, 2048, 4096, 8192, 16384, 0) ?
}


def _peak_tflops(ops_table, arch, column, clk_khz, units):
    """Return a peak tera-ops/s figure formatted to width 6, or 'n/a'.

    ops_table -- fp_ops or tc_ops (ops per cycle per unit, keyed by arch)
    arch      -- compute capability string such as '8.0'
    column    -- index of the precision inside the table row
    clk_khz   -- core clock in kHz, as reported by CLOCK_RATE
    units     -- number of cores (fp_ops) or SMs (tc_ops); None if unknown

    'n/a' is returned when the architecture is missing from the table or the
    unit count is unknown, so one unlisted GPU cannot abort the whole listing.
    """
    row = ops_table.get(arch)
    if row is None or units is None:
        return "n/a".rjust(6)
    # ops/cycle * kHz * units gives kilo-ops/s; 1e-9 rescales to tera-ops/s.
    return f"{row[column] * clk_khz * units * 1e-9:6.2f}"


for ordinal in range(device_count):
    dev = drv.Device(ordinal)
    attributes = dev.get_attributes()
    sms = attributes[drv.device_attribute.MULTIPROCESSOR_COUNT]
    mcr = attributes[drv.device_attribute.MEMORY_CLOCK_RATE]        # kHz
    bus = attributes[drv.device_attribute.GLOBAL_MEMORY_BUS_WIDTH]  # bits
    l2s = attributes[drv.device_attribute.L2_CACHE_SIZE]            # bytes
    clk = attributes[drv.device_attribute.CLOCK_RATE]               # kHz
    cap = dev.compute_capability()
    arch = f'{cap[0]}.{cap[1]}'
    # Look the architecture up with .get: a compute capability missing from
    # the table yields None here instead of raising KeyError.
    per_sm = cores_per_sm.get(arch)
    cores = per_sm * sms if per_sm is not None else None

    print("Device #%d: %s" % (ordinal, dev.name()))
    # Bandwidth = memory clock (kHz -> Hz) * bus width (bits) * 2 / 8 bits per
    # byte, scaled to GB/s. The factor 2 assumes double-data-rate memory.
    print(f"  Arch: {arch}"
          f", Mem: {dev.total_memory()//(1<<20)} MB, {mcr*1000*bus*2*1e-9/8:.0f} GBps")

    print(f"  {sms} SMs, {cores if cores is not None else 'n/a'} cores, {l2s / (1<<20)} MB L2"
          f", {clk * 1e-6} GHz")
    # CUDA-core peaks scale with the total core count ...
    print(f"     TF/s:"
          f"    {_peak_tflops(fp_ops, arch, 0, clk, cores)} FP64"
          f"    {_peak_tflops(fp_ops, arch, 1, clk, cores)} FP32"
          f"    {_peak_tflops(fp_ops, arch, 2, clk, cores)} FP16")
    # ... while tensor-core peaks are tabulated per SM.
    print(f"  TC TF/s:"
          f"    {_peak_tflops(tc_ops, arch, 0, clk, sms)} FP64"
          f"    {_peak_tflops(tc_ops, arch, 1, clk, sms)} FP32"
          f"    {_peak_tflops(tc_ops, arch, 2, clk, sms)} FP16")
    print()

5 device(s) found.
Device #0: NVIDIA A100-SXM-80GB
  Arch: 8.0, Mem: 81251 MB, 2039 GBps
  108 SMs, 6912 cores, 40.0 MB L2, 1.41 GHz
     TF/s:      9.75 FP64     19.49 FP32     77.97 FP16
  TC TF/s:     19.49 FP64    155.93 FP32    311.87 FP16

Device #1: NVIDIA A100-SXM-80GB
  Arch: 8.0, Mem: 81251 MB, 2039 GBps
  108 SMs, 6912 cores, 40.0 MB L2, 1.41 GHz
     TF/s:      9.75 FP64     19.49 FP32     77.97 FP16
  TC TF/s:     19.49 FP64    155.93 FP32    311.87 FP16

Device #2: NVIDIA A100-SXM-80GB
  Arch: 8.0, Mem: 81251 MB, 2039 GBps
  108 SMs, 6912 cores, 40.0 MB L2, 1.41 GHz
     TF/s:      9.75 FP64     19.49 FP32     77.97 FP16
  TC TF/s:     19.49 FP64    155.93 FP32    311.87 FP16

Device #3: NVIDIA A100-SXM-80GB
  Arch: 8.0, Mem: 81251 MB, 2039 GBps
  108 SMs, 6912 cores, 40.0 MB L2, 1.41 GHz
     TF/s:      9.75 FP64     19.49 FP32     77.97 FP16
  TC TF/s:     19.49 FP64    155.93 FP32    311.87 FP16

Device #4: NVIDIA DGX Display
  Arch: 7.5, Mem: 3911 MB, 160 GBps
  14 SM

In [3]:
# Print a small table with the current load and memory utilization of each GPU.
# TODO: is there a better way to check this? (alternatives such as NVML
# bindings or `nvidia-smi` have not been evaluated here)
import GPUtil
print("Utilization")
GPUtil.showUtilization()

Utilization
| ID | GPU | MEM |
------------------
|  0 |  0% |  0% |
|  1 |  0% |  0% |
|  2 |  0% |  0% |
|  3 |  0% |  0% |
|  4 |  0% |  0% |


In [4]:
# select device
# Ordinal of the GPU to work on (device #1 in the listing printed earlier).
dev_id = 1
# Create a CUDA context on that device; per the PyCUDA docs the new context
# also becomes the current one.
# NOTE(review): the context is never popped/detached in the visible cells, and
# re-running this cell creates another one -- confirm cleanup happens later.
ctx = drv.Device(dev_id).make_context()
# Ask the driver which device backs the current context (expected: dev_id).
dev = drv.Context.get_device()
# Free and total device memory for the current context, in bytes.
free_bytes, total_bytes = drv.mem_get_info()
print(f'Device {dev_id} {dev.name()}')
# Values are integer-divided by 1024, so the "kB" figures are really KiB.
print(f'Total Mem: {total_bytes//(1024)} kB, Free: {free_bytes//(1024)} kB')

Device 1 NVIDIA A100-SXM-80GB
Total Mem: 83201216 kB, Free: 82776768 kB


In [5]:
# other stuff
def device_details(_dev):
    """Print every attribute of a device, one ``name: value`` line each.

    ``_dev`` must provide ``get_attributes()`` returning a mapping of
    attribute -> value. Each attribute is converted with ``str()`` and the
    resulting (name, value) pairs are printed in ascending sorted order,
    indented by two spaces. Nothing is returned.
    """
    named_values = sorted(
        (str(attribute), setting)
        for attribute, setting in _dev.get_attributes().items()
    )

    for label, setting in named_values:
        print(f"  {label}: {setting}")
        
#device_details(dev)