# Check hardware

GPU: NVIDIA RTX A6000

CPU: AMD Ryzen 5 3600

In [1]:
import numpy as np
import pandas as pd
import time
import torch
from tqdm.notebook import tqdm 

In [4]:
# Display the installed PyTorch version so the timings below can be tied to a release.
torch.__version__

'1.9.0'

In [7]:
# Shell out to nvidia-smi to show the GPU model, driver/CUDA versions and current memory use.
!nvidia-smi

Tue Sep 28 14:36:12 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.91.03    Driver Version: 460.91.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  RTX A6000           On   | 00000000:09:00.0 Off |                  Off |
| 30%   31C    P8     9W / 300W |   4742MiB / 48682MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

# Matmul test

In [2]:
# Benchmark settings read by the timing loops.
NUM_TEST = 100   # timed iterations that are averaged for each configuration
WARM_UP = 10     # leading iterations that are executed but left out of the average
BATCH_SIZE = 4   # not read by the matmul test; presumably for later cells -- TODO confirm

Matrix-multiplication time with TF32 enabled vs. plain FP32 on the A6000

In [3]:
def benchmark_matmul(lhs, rhs, allow_tf32, num_test=NUM_TEST, warm_up=WARM_UP):
    """Time ``lhs @ rhs`` on the GPU with TF32 either enabled or disabled.

    Args:
        lhs: Left operand, a CUDA tensor.
        rhs: Right operand, a CUDA tensor.
        allow_tf32: Value written to both the matmul and the cuDNN TF32 flags
            before timing starts.
        num_test: Number of timed iterations (must be >= 1).
        warm_up: Number of leading iterations that run but are not recorded.

    Returns:
        A ``(durations_ms, product)`` tuple: the per-iteration wall-clock times
        in milliseconds for the timed iterations, and the result of the last
        multiplication.

    Raises:
        ValueError: If ``num_test`` is smaller than 1, since no average could
            be computed from an empty list of timings.

    Note:
        The two ``allow_tf32`` backend flags are global and stay at the value
        set here after the function returns.
    """
    if num_test < 1:
        raise ValueError('num_test must be at least 1')

    # Set the flags explicitly instead of relying on defaults: the default of the
    # matmul flag depends on the PyTorch release (True in 1.9, False from 1.12 on).
    torch.backends.cuda.matmul.allow_tf32 = allow_tf32
    torch.backends.cudnn.allow_tf32 = allow_tf32

    durations_ms = []
    product = None
    for step in range(num_test + warm_up):
        # CUDA kernels launch asynchronously, so synchronise on both sides of
        # the timed region to measure the matmul itself and not just the launch.
        torch.cuda.synchronize()
        start = time.perf_counter()  # monotonic, high-resolution interval clock
        product = lhs @ rhs
        torch.cuda.synchronize()
        end = time.perf_counter()
        if step >= warm_up:
            durations_ms.append((end - start) * 1000)
    return durations_ms, product


# Double-precision operands are created first; the benchmark runs on their
# single-precision copies.
a_full = torch.randn(10240, 10240, dtype=torch.double, device='cuda')
b_full = torch.randn(10240, 10240, dtype=torch.double, device='cuda')

a = a_full.float()
b = b_full.float()

# Matmul with TF32 allowed.
tf32_durations, ab_tf32 = benchmark_matmul(a, b, allow_tf32=True)
print(f'TF32 average inference time : {sum(tf32_durations)/len(tf32_durations)}ms')

# Same matmul with TF32 disabled, i.e. full FP32. The result is kept under its
# own name so the TF32 product above is not overwritten.
fp32_durations, ab_fp32 = benchmark_matmul(a, b, allow_tf32=False)
print(f'FP32 average inference time : {sum(fp32_durations)/len(fp32_durations)}ms')

TF32 average inference time : 38.884220123291016ms
FP32 average inference time : 106.4377236366272ms
