## PyTorch performance experiments

In [None]:
# Shell escape: run the `gpustat` command-line tool (must be installed and on
# PATH) to record which GPUs are visible before any benchmark is started.
!gpustat

In [None]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from time import perf_counter
from collections import defaultdict

###  Performance scaling

Conclusions
- data type `double` is slower than `float`
- calculations on a powerful GPU, such as an A100, run `60` times faster than on the CPU

In [None]:
# Benchmark defaults: edge length of the square input, element type, and the
# device the computation is placed on.
N = 1000
dtype = torch.float
# Use the first CUDA device when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# device = torch.device('cpu')  # uncomment to force the CPU

In [None]:
def kernel(inp):
    """Return the sum of 100 evaluations of ``torch.sin(inp)``.

    The sine is deliberately recomputed on every pass so that the function
    represents a fixed amount of element-wise work for the timing runs.
    """
    acc = 0
    for _ in range(100):
        term = torch.sin(inp)
        # First pass turns the integer 0 into a tensor; later passes add in place.
        acc += term
    return acc

def run(device=device, dtype=dtype, size=N):
    """Time one forward and backward pass of ``kernel`` on an identity matrix.

    Parameters
    ----------
    device : torch.device or str
        Where the computation runs, e.g. ``'cpu'`` or ``'cuda:0'``.
    dtype : torch.dtype
        Element type of the input matrix.
    size : int
        Edge length of the square input; NumPy integers are accepted.

    Returns
    -------
    float
        Elapsed seconds according to ``perf_counter``. The interval covers
        building the input on the host, copying it to ``device``, the forward
        pass and the backward pass.
    """
    target = torch.device(device)
    on_gpu = target.type == "cuda"
    if on_gpu:
        # CUDA work is queued asynchronously: drain anything still pending so
        # it is not billed to this measurement.
        torch.cuda.synchronize(target)
    start = perf_counter()
    inp = torch.eye(int(size), requires_grad=True, dtype=dtype)
    inp2 = inp.to(target)
    out = kernel(inp2)
    # Only one backward pass is made, so the graph does not need to be retained.
    out.backward(torch.ones_like(out))
    if on_gpu:
        # Make sure the device has really finished before the clock is read.
        torch.cuda.synchronize(target)
    stop = perf_counter()
    return stop - start

# %time run()

In [None]:
# Sweep the problem size over powers of two (2 ... 4096) and record one timing
# per device/dtype combination. Column names are consumed by the plotting cell.
df = defaultdict(list)
has_gpu = torch.cuda.is_available()
if has_gpu:
    # CUDA is initialised lazily; pay that one-off cost here so it does not
    # distort the measurement for the smallest size.
    run('cuda:0', torch.float, 2)
# Exact integer powers of two, built without a float -> int conversion.
for size in (2 ** exponent for exponent in range(1, 13)):
    print(f"size: {size}")
    df['size'].append(size)
    df['cpu-float'].append(run('cpu', torch.float, size))
    df['cpu-double'].append(run('cpu', torch.double, size))
    # Without a CUDA device the GPU columns are filled with NaN instead of
    # crashing, so the downstream DataFrame keeps the same columns.
    df['gpu-float'].append(run('cuda:0', torch.float, size) if has_gpu else float('nan'))
    df['gpu-double'].append(run('cuda:0', torch.double, size) if has_gpu else float('nan'))

In [None]:
# Turn the collected timings into a table and plot elapsed time against size.
df = pd.DataFrame(df)
ax = df.plot(x='size', y=['cpu-float', 'gpu-float', 'cpu-double', 'gpu-double'])
ax.set_ylabel('elapsed time (s)')  # perf_counter reports seconds

# Speed-up is CPU time divided by GPU time: values above 1 mean the GPU is faster.
df['speedup-float'] = df['cpu-float']/df['gpu-float']
df['speedup-double'] = df['cpu-double']/df['gpu-double']
ax = df.plot.bar(x='size', y=['speedup-double', 'speedup-float'])
ax.axhline(1.0, ls='--', c='k')  # break-even line
ax.set_ylabel('GPU speed-up over CPU')
df.tail()

### Torch script (jit)

Conclusions:
- not much performance improvement is observed using `torch.jit.script`
- no multi-processing is observed in `PyTorch` itself
- but multi-processing is possible using a `Dask` client
- Dask cannot handle a scripted function (pickle error)
- Dask with a GPU client is slow, and multiple workers consume more memory

In [None]:
# Deliberately not decorated with @torch.jit.script: this eager version is the
# baseline that the scripted wrapper is timed against.
def kernel(x, y, device: torch.device, n_iter: int = 10000):
    """Accumulate ``sin(x + y)`` or ``cos(x - y)`` over ``n_iter`` passes.

    On every pass the maxima of ``x`` and ``y`` are compared; the sine term is
    added when ``x.max()`` is larger, the cosine term otherwise. The comparison
    and the trigonometric term are recomputed each pass on purpose, as this
    function is a synthetic workload for timing.

    Parameters
    ----------
    x, y : torch.Tensor
        Input tensors; the result has the shape and dtype of ``x``.
    device : torch.device
        Device on which the accumulator is allocated.
    n_iter : int, optional
        Number of passes (default 10000).

    Returns
    -------
    torch.Tensor
        The accumulated sum.
    """
    # Start from zeros: torch.empty would leave uninitialised memory in the
    # accumulator and make the result meaningless. Allocating directly with the
    # input dtype on the target device also avoids an extra copy.
    r = torch.zeros(x.shape, dtype=x.dtype, device=device)
    for _ in range(n_iter):
        if x.max() > y.max():
            r = r + torch.sin(x + y)
        else:
            r = r + torch.cos(x - y)
    return r

# TorchScript counterpart of `kernel`: a thin scripted wrapper that forwards
# its arguments unchanged, so the eager and scripted versions can be timed
# side by side with the same call signature.
@torch.jit.script
def kernel_jit(x, y, device: torch.device):
    return kernel(x, y, device)

# Confirm that the decorator produced a scripted object rather than a plain
# Python function.
print(type(kernel_jit))  # torch.jit.ScriptFunction

# See the compiled graph as Python code
print(kernel_jit.code)

In [None]:
def run(device, dtype, size, kernel):
    """Time one forward and backward pass of ``kernel`` on two random vectors.

    Parameters
    ----------
    device : torch.device or str
        Where the computation runs, e.g. ``'cpu'`` or ``'cuda:0'``.
    dtype : torch.dtype
        Element type of the random inputs.
    size : int
        Number of elements in each input vector.
    kernel : callable
        Called as ``kernel(x, y, device)`` with both inputs already on
        ``device``; must return a tensor that supports ``backward``.

    Returns
    -------
    float
        Elapsed seconds according to ``perf_counter``. The interval covers
        creating the inputs on the host, copying them to ``device``, the
        forward pass and the backward pass.
    """
    # Normalise once so that kernels annotated with `device: torch.device`
    # always receive a real torch.device, never a plain string.
    target = torch.device(device)
    on_gpu = target.type == "cuda"
    if on_gpu:
        # CUDA work is queued asynchronously: drain anything still pending so
        # it is not billed to this measurement.
        torch.cuda.synchronize(target)
    start = perf_counter()
    inp1 = torch.rand(size, requires_grad=True, dtype=dtype)
    inp2 = torch.rand(size, requires_grad=True, dtype=dtype)
    out = kernel(inp1.to(target), inp2.to(target), target)
    # Seed the backward pass with ones created next to `out`, instead of
    # building them on the host and copying them over inside the timed region.
    # Only one backward pass is made, so the graph is not retained.
    out.backward(torch.ones_like(out))
    if on_gpu:
        # Make sure the device has really finished before the clock is read.
        torch.cuda.synchronize(target)
    stop = perf_counter()
    return stop - start

In [None]:
# Repeat the eager and scripted kernels ten times on each device and record
# the elapsed time of every attempt.
df = defaultdict(list)
size = 1000
has_gpu = torch.cuda.is_available()
# A named loop variable is used because `_` is IPython's "last output" name.
for i in range(10):
    print(i)
    df['attempt'].append(i + 1)
    df['cpu'].append(run('cpu', torch.double, size, kernel))
    df['cpu-jit'].append(run('cpu', torch.double, size, kernel_jit))
    # Without a CUDA device the GPU columns are filled with NaN instead of
    # crashing, so the downstream DataFrame keeps the same columns.
    df['gpu'].append(run('cuda:0', torch.double, size, kernel) if has_gpu else float('nan'))
    df['gpu-jit'].append(run('cuda:0', torch.double, size, kernel_jit) if has_gpu else float('nan'))

In [None]:
# Tabulate the attempts, plot elapsed time per attempt, and show the mean
# timing of each variant.
df = pd.DataFrame(df)
# The axis label needs a plot to attach to; labelling through the returned
# axes avoids creating a stray empty figure.
ax = df.plot(x='attempt')
ax.set_ylabel('elapsed time (s)')  # perf_counter reports seconds
# Average the timing columns only; the attempt counter is not a measurement.
df.drop(columns='attempt').mean()

#### Dask client (multi-process)

In [None]:
from dask.distributed import Client, fire_and_forget
from dask_cuda import LocalCUDACluster

# CPU setup: four single-threaded worker processes, a 3GB memory limit, and
# the dashboard served on port 8791.
client = Client(
    n_workers=4,
    threads_per_worker=1,
    processes=True,
    memory_limit='3GB',
    dashboard_address=':8791',
)

# GPU alternative, kept for reference: swap the Client above for a client
# attached to a LocalCUDACluster.
# cluster = LocalCUDACluster(n_workers=1, threads_per_worker=1, dashboard_address=':8791',
#                               memory_limit="auto",
#                               device_memory_limit="auto", # memory spilling
#                               #rmm_pool_size="5GB",
#                               #rmm_managed_memory=True,
#                               #silence_logs=False,
#                               local_directory="/tmp/", 
#                               #enable_nvlink=True,
#                               ) # See https://docs.rapids.ai/api/dask-cuda/nightly/api.html
# client = Client(cluster)

# Render the client summary in the notebook output.
display(client)

#### Future

In [None]:
# Settings for the Dask experiment: CPU tensors of 1000 double-precision values.
size = 1000
device = torch.device('cpu')
dtype = torch.double

# Submit 100 independent kernel evaluations to the cluster. The results are
# never collected: fire_and_forget tells Dask to run each task even though no
# reference to its future is kept.
for _ in range(100):
    inp1, inp2 = (
        torch.rand(size, requires_grad=True, dtype=dtype) for _ in range(2)
    )
    future = client.submit(kernel, inp1.to(device), inp2.to(device), device)
    fire_and_forget(future)

#### Delay

In [None]:
import torch
from dask import delayed

# @torch.jit.script  (left disabled; `fun` is a plain Python function here)
def fun(x: torch.Tensor) -> torch.Tensor:
    """Identity function: return ``x`` unchanged.

    Serves as the smallest possible payload for the ``delayed`` wrappers below.
    """
    return x

# Minimal reproducer for a Dask problem. According to the inline notes, the
# wrapper built with pure=False works while the one built with pure=True
# fails. NOTE(review): the failure presumably appears when `fn` is called on a
# tensor (see the commented-out call in the next cell) -- confirm and record
# the traceback.
# fn = delayed(fun, pure=False)  # works
fn = delayed(fun, pure=True)  # causes error

In [None]:
# fn(torch.rand(size, requires_grad=True, dtype=dtype)).compute()