In [1]:
import os
from datetime import datetime

host = os.uname()[1]
dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
print(f"{host}, ", dt_string)

blackwell,  07/01/2022 15:05:52


In [2]:
import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
import skcuda.cublas as cublas

import cuda_util

print()
print(f'{pycuda.VERSION_TEXT=}')
print(f'{drv.get_version()=}')
print(f'{drv.get_driver_version()=}')
print()

dev_id = 1

seed = 1234
np.random.seed(seed)
    
ctx = drv.Device(dev_id).make_context()
dev = drv.Context.get_device()
cuda_util.query_dev(drv, dev_id)

libcublas.so
<CDLL '/home/krocki/nv/cuBLAS/build/src/libcublas.so', handle 2e43ab0 at 0x7f17e1b5bac0>
<CDLL '/home/krocki/nv/cuBLAS/build/src/libcublas.so', handle 2e43ab0 at 0x7f17e1b5bac0>
11901

pycuda.VERSION_TEXT='2021.1'
drv.get_version()=(11, 5, 0)
drv.get_driver_version()=11050

Device #1: NVIDIA A100-SXM-80GB
  Arch: 8.0, Mem: 81251 MB, 2039 GBps
  108 SMs, 6912 cores, 40.0 MB L2, 1.41 GHz
     TF/s:      9.75 FP64     19.49 FP32     77.97 FP16
  TC TF/s:     19.49 FP64    155.93 FP32    311.87 FP16



In [3]:
def mmul(handle, M, N, K, repeat, dtype):

    A = np.array(np.random.rand(M, K), dtype=dtype, order='F').astype(dtype)
    B = np.array(np.random.rand(K, N), dtype=dtype, order='F').astype(dtype)
    
    #A = np.ones((M, K), dtype=dtype, order='F')
    #B = np.ones((K, N), dtype=dtype, order='F')
    
    #A = np.zeros((M, K), dtype=dtype, order='F')
    #B = np.zeros((K, N), dtype=dtype, order='F')
    
    d_A = gpuarray.to_gpu(A)
    d_B = gpuarray.to_gpu(B)

    m, k = d_A.shape
    k, n = d_B.shape

    d_C = gpuarray.empty((m, n), dtype=dtype, order='F')

    alpha = dtype(1)
    beta  = dtype(0)
    
    start = drv.Event()
    end   = drv.Event()

    # cublas<T>gemm(handle,
    # transa, transb, m, n, k, alpha,
    # A, lda,
    # B, ldb, beta,
    # C, ldc)
    
    xgemm = {
        np.float32: cublas.cublasSgemm,
        np.float64: cublas.cublasDgemm
    }
    
    gemm_gflop = 1e-9 * m * n * k * 2
    
    start.record()
    for i in range(repeat):
        xgemm[dtype](handle,
                     transa='n', transb='n',
                     m=m, n=n, k=k, alpha=alpha,
                     A=d_A.ptr, lda=m,
                     B=d_B.ptr, ldb=k, beta=beta,
                     C=d_C.ptr, ldc=m)
        
    end.record()
    end.synchronize()
    gemm_time = end.time_since(start) / (repeat * 1e3)

    d_C = d_C.reshape(d_C.shape, order = 'F')

    C = np.dot(A, B).astype(dtype)

    err = np.linalg.norm(d_C.get() - C)

    gflop_per_sec = gemm_gflop/gemm_time
    
    print(f'{err=:12g}, {m=:5}, {n=:5}, {k=:5}'
          f', time={gemm_time:.4f} ms'
          f', {gflop_per_sec:10.3f} GF/s')
    
    return gflop_per_sec

In [4]:
"""
test tensor core math
"""

import ctypes

"""
/*Enum for default math mode/tensor operation*/
typedef enum {
  CUBLAS_DEFAULT_MATH = 0,

  /* deprecated, same effect as using CUBLAS_COMPUTE_32F_FAST_16F, will be removed in a future release */
  CUBLAS_TENSOR_OP_MATH = 1,

  /* same as using matching _PEDANTIC compute type when using cublas<T>routine calls or cublasEx() calls with
     cudaDataType as compute type */
  CUBLAS_PEDANTIC_MATH = 2,

  /* allow accelerating single precision routines using TF32 tensor cores */
  CUBLAS_TF32_TENSOR_OP_MATH = 3,

  /* flag to force any reductons to use the accumulator type and not output type in case of mixed precision routines
     with lower size output type */
  CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION = 16,
} cublasMath_t;

typedef enum {
  CUBLAS_COMPUTE_16F = 64,           /* half - default */
  CUBLAS_COMPUTE_16F_PEDANTIC = 65,  /* half - pedantic */
  CUBLAS_COMPUTE_32F = 68,           /* float - default */
  CUBLAS_COMPUTE_32F_PEDANTIC = 69,  /* float - pedantic */
  CUBLAS_COMPUTE_32F_FAST_16F = 74,  /* float - fast, allows down-converting inputs to half or TF32 */
  CUBLAS_COMPUTE_32F_FAST_16BF = 75, /* float - fast, allows down-converting inputs to bfloat16 or TF32 */
  CUBLAS_COMPUTE_32F_FAST_TF32 = 77, /* float - fast, allows down-converting inputs to TF32 */
  CUBLAS_COMPUTE_64F = 70,           /* double - default */
  CUBLAS_COMPUTE_64F_PEDANTIC = 71,  /* double - pedantic */
  CUBLAS_COMPUTE_32I = 72,           /* signed 32-bit int - default */
  CUBLAS_COMPUTE_32I_PEDANTIC = 73,  /* signed 32-bit int - pedantic */
} cublasComputeType_t;
"""
CUBLAS_MATH_MODE = {
    'CUBLAS_DEFAULT_MATH': 0,
    'CUBLAS_TENSOR_OP_MATH': 1,
    'CUBLAS_PEDANTIC_MATH': 2,
    'CUBLAS_TF32_TENSOR_OP_MATH': 3,
    'CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION': 16
}

"""
cublasStatus_t cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode)
"""
def cublasSetMathMode(handle, lib, mode):
    status = lib._libcublas.cublasSetMathMode(handle, mode)
    lib.cublasCheckStatus(status)

"""
cublasStatus_t cublasGetMathMode(cublasHandle_t handle, cublasMath_t *mode)
"""
def cublasGetMathMode(handle, lib):
    mode = ctypes.c_int()
    status = lib._libcublas.cublasGetMathMode(handle, ctypes.byref(mode))
    lib.cublasCheckStatus(status)
    return mode.value

"""
cublasStatus_t cublasLoggerConfigure(
    int             logIsOn,
    int             logToStdOut,
    int             logToStdErr,
    const char*     logFileName)
"""
def cublasLoggerConfigure(lib, logIsOn, logToStdOut, logToStdErr, logFileName):
    status = lib._libcublas.cublasGetMathMode(logIsOn, logToStdOut, logToStdErr, logFileName)
    lib.cublasCheckStatus(status)

In [5]:
results = {}
# square
sizes = [64, 128, 256, 512, 1024, 2048, 4096, 8192]
precisions = [np.float64, np.float32]
cublas_modes = [
             'CUBLAS_DEFAULT_MATH',
             'CUBLAS_TENSOR_OP_MATH',
             'CUBLAS_PEDANTIC_MATH',
             'CUBLAS_TF32_TENSOR_OP_MATH',
             'CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION']

In [None]:
for dtype in precisions:
    results[dtype] = {}
    print(f'\n{dtype=}\n-----------')
    for mode in cublas_modes:

        handle = cublas.cublasCreate()
        cublasSetMathMode(handle, cublas, CUBLAS_MATH_MODE[mode])
        print(f'{mode=}, {cublasGetMathMode(handle, cublas)=}\n')

        results[dtype][mode] = []
        for i in sizes:
            M = N = K = i
            flops = mmul(handle, M, N, K, repeat=10, dtype=dtype)
            results[dtype][mode].append(flops)

        cublas.cublasDestroy(handle)


dtype=<class 'numpy.float64'>
-----------
mode='CUBLAS_DEFAULT_MATH', cublasGetMathMode(handle, cublas)=0

err= 2.39149e-13, m=   64, n=   64, k=   64, time=0.0000 ms,     14.423 GF/s
err=           0, m=  128, n=  128, k=  128, time=0.0000 ms,    349.153 GF/s
err=           0, m=  256, n=  256, k=  256, time=0.0000 ms,   1343.123 GF/s
err= 3.95789e-11, m=  512, n=  512, k=  512, time=0.0000 ms,   8758.204 GF/s
err= 2.17734e-10, m= 1024, n= 1024, k= 1024, time=0.0001 ms,  14772.577 GF/s
err= 1.19272e-09, m= 2048, n= 2048, k= 2048, time=0.0010 ms,  16631.740 GF/s
err= 6.63736e-09, m= 4096, n= 4096, k= 4096, time=0.0081 ms,  16920.425 GF/s
err= 3.72743e-08, m= 8192, n= 8192, k= 8192, time=0.0676 ms,  16276.475 GF/s
mode='CUBLAS_TENSOR_OP_MATH', cublasGetMathMode(handle, cublas)=1

err= 2.35486e-13, m=   64, n=   64, k=   64, time=0.0000 ms,     19.845 GF/s
err=           0, m=  128, n=  128, k=  128, time=0.0000 ms,    615.073 GF/s
err=           0, m=  256, n=  256, k=  256, time=0.000

In [None]:
print(results)
results_1 = results

In [None]:
# randn
for dtype in precisions:
    results[dtype] = {}
    print(f'\n{dtype=}\n-----------')
    for mode in cublas_modes:

        handle = cublas.cublasCreate()
        cublasSetMathMode(handle, cublas, CUBLAS_MATH_MODE[mode])
        print(f'{mode=}, {cublasGetMathMode(handle, cublas)=}\n')

        results[dtype][mode] = []
        for i in sizes:
            M = N = K = i
            flops = mmul(handle, M, N, K, repeat=10, dtype=dtype)
            results[dtype][mode].append(flops)

        cublas.cublasDestroy(handle)

In [None]:
# zeros
for dtype in precisions:
    results[dtype] = {}
    print(f'\n{dtype=}\n-----------')
    for mode in cublas_modes:

        handle = cublas.cublasCreate()
        cublasSetMathMode(handle, cublas, CUBLAS_MATH_MODE[mode])
        print(f'{mode=}, {cublasGetMathMode(handle, cublas)=}\n')

        results[dtype][mode] = []
        for i in sizes:
            M = N = K = i
            flops = mmul(handle, M, N, K, repeat=10, dtype=dtype)
            results[dtype][mode].append(flops)

        cublas.cublasDestroy(handle)