## PyTorch performance experiments

In [None]:
!gpustat

In [None]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from time import perf_counter
from collections import defaultdict

###  Performance scaling

Observations:
- data type `double` is slower than `float`
- calculations on a powerful GPU, such as an A100, run `60` times faster

In [None]:
# Default problem size: `run` uses it as the side length of its identity-matrix input.
N = 1000
# Default floating-point precision of the benchmark tensors.
dtype = torch.float
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device('cpu')

In [None]:
def kernel(inp):
    """Return the sum of 100 separate evaluations of ``torch.sin(inp)``.

    The sine is recomputed for every term rather than scaled once, which
    keeps the operation count of a 100-step accumulation loop.
    """
    return sum(torch.sin(inp) for _ in range(100))

def run(device=device, dtype=dtype, size=N):
    """Time one forward and backward pass of `kernel` on an identity matrix.

    The measured interval covers creating the `size` x `size` input on the
    host, copying it to `device`, evaluating `kernel` and back-propagating
    a tensor of ones.

    Args:
        device: Target device (string or `torch.device`).
        dtype: Floating-point dtype of the input tensor.
        size: Side length of the identity matrix.

    Returns:
        Elapsed wall-clock time in seconds.
    """
    on_gpu = torch.device(device).type == "cuda"
    if on_gpu:
        # Drain previously queued GPU work so it is not billed to this run.
        torch.cuda.synchronize(device)
    start = perf_counter()
    inp = torch.eye(size, requires_grad=True, dtype=dtype)
    inp2 = inp.to(device)
    out = kernel(inp2)
    # The graph is used for a single backward pass, so it need not be retained.
    out.backward(torch.ones_like(inp2))
    if on_gpu:
        # CUDA kernels launch asynchronously; wait for them to finish before
        # stopping the clock, otherwise only the launch overhead is measured.
        torch.cuda.synchronize(device)
    stop = perf_counter()
    return stop - start

# %time run()

In [None]:
df = defaultdict(list)
# (column label, device, dtype) for every configuration timed at each size.
benchmark_cases = (
    ('cpu-float', 'cpu', torch.float),
    ('gpu-float', 'cuda:0', torch.float),
    ('cpu-double', 'cpu', torch.double),
    ('gpu-double', 'cuda:0', torch.double),
)
# Matrix sizes 2, 4, ..., 4096 (powers of two).
for size in np.logspace(1, 12, 12, base=2, dtype=np.int32):
    print(f"size: {size}")
    df['size'].append(size)
    for label, case_device, case_dtype in benchmark_cases:
        df[label].append(run(case_device, case_dtype, size))

In [None]:
df = pd.DataFrame(df)
# Use the axes returned by pandas instead of the implicit pyplot "current axes".
ax = df.plot(x='size', y=['cpu-float', 'gpu-float', 'cpu-double', 'gpu-double'])
ax.set_ylabel('elapsed time (s)')

# Speed-up is CPU time divided by GPU time, so values above 1 mean the GPU is faster.
df['speedup-float'] = df['cpu-float'] / df['gpu-float']
df['speedup-double'] = df['cpu-double'] / df['gpu-double']
ax = df.plot.bar(x='size', y=['speedup-double', 'speedup-float'])
ax.axhline(1.0, ls='--', c='k')  # break-even line: CPU and GPU equally fast
ax.set_ylabel('GPU speed-up over CPU')
df.tail()

### Torch script (jit)

Observations:
- not much performance improvement is observed using `torch.jit.script`
- no multi-processing is observed from `PyTorch` itself
- but multi-processing is possible using a `Dask` client
- Dask cannot handle a scripted function (pickle error)
- Dask with a GPU client is slow, and multiple workers consume more memory

In [None]:
# @torch.jit.script
def kernel(x, y, device: torch.device, n_iter: int = 10000):
    """Accumulate `n_iter` terms of sin(x + y) or cos(x - y).

    On every iteration the maxima of `x` and `y` are compared: when
    ``x.max() > y.max()`` the term ``sin(x + y)`` is added, otherwise
    ``cos(x - y)``.

    Args:
        x: First input tensor.
        y: Second input tensor.
        device: Device on which the accumulator is allocated.
        n_iter: Number of accumulation steps (defaults to 10000).

    Returns:
        The accumulated tensor, with the dtype of `x`.
    """
    # Start from zeros in the inputs' dtype: `torch.empty` returns
    # uninitialized memory, so accumulating onto it gives arbitrary results.
    r = torch.zeros(x.shape, dtype=x.dtype, device=device)
    for _ in range(n_iter):
        if x.max() > y.max():
            r = r + torch.sin(x + y)
        else:
            r = r + torch.cos(x - y)
    return r

# Scripted wrapper that forwards its arguments unchanged to `kernel`.
@torch.jit.script
def kernel_jit(x, y, device: torch.device):
    return kernel(x, y, device)

# Show the type of the object produced by the decorator.
print(type(kernel_jit))  # torch.jit.ScriptFunction

# See the compiled graph as Python code
print(kernel_jit.code)

In [None]:
def run(device, dtype, size, kernel):
    """Time one forward and backward pass of `kernel` on two random vectors.

    The measured interval covers creating both inputs on the host, copying
    them to `device`, calling ``kernel(inp1, inp2, device)`` and
    back-propagating a tensor of ones.

    Args:
        device: Target device (string or `torch.device`); also forwarded
            to `kernel` as its third argument.
        dtype: Floating-point dtype of the inputs.
        size: Length of each random input vector.
        kernel: Callable taking ``(tensor, tensor, device)`` and returning a
            tensor shaped like the inputs.

    Returns:
        Elapsed wall-clock time in seconds.
    """
    on_gpu = torch.device(device).type == "cuda"
    if on_gpu:
        # Drain previously queued GPU work so it is not billed to this run.
        torch.cuda.synchronize(device)
    start = perf_counter()
    inp1 = torch.rand(size, requires_grad=True, dtype=dtype)
    inp2 = torch.rand(size, requires_grad=True, dtype=dtype)
    out = kernel(inp1.to(device), inp2.to(device), device)
    # The graph is used for a single backward pass, so it need not be retained.
    out.backward(torch.ones_like(inp2).to(device))
    if on_gpu:
        # CUDA kernels launch asynchronously; wait for them to finish before
        # stopping the clock, otherwise only the launch overhead is measured.
        torch.cuda.synchronize(device)
    stop = perf_counter()
    return stop - start

In [None]:
df = defaultdict(list)
size = 1000
# A named loop variable is used instead of `_`: assigning to `_` in a notebook
# shadows IPython's "last output" variable.
for attempt in range(10):
    print(attempt)
    df['attempt'].append(attempt + 1)
    df['cpu'].append(run('cpu', torch.double, size, kernel))
    df['gpu'].append(run('cuda:0', torch.double, size, kernel))
    df['cpu-jit'].append(run('cpu', torch.double, size, kernel_jit))
    df['gpu-jit'].append(run('cuda:0', torch.double, size, kernel_jit))

In [None]:
df = pd.DataFrame(df)
# Plot and label together: a bare `plt.ylabel` with the plot commented out
# only produces an empty, labelled figure.
ax = df.plot(x='attempt')
ax.set_ylabel('elapsed time (s)')
# Average the timings only; the mean of the attempt index carries no information.
df.drop(columns='attempt').mean()

#### Dask client (multi-process)

In [None]:
from dask.distributed import Client, fire_and_forget
from dask_cuda import LocalCUDACluster

# Start a local Dask cluster with 4 single-threaded worker processes and a
# dashboard on port 8791.
# NOTE(review): `memory_limit` is presumably applied per worker — confirm.
client = Client(memory_limit='3GB', n_workers=4, processes=True, threads_per_worker=1, dashboard_address=':8791')

# Alternative GPU-backed cluster, currently disabled:
# cluster = LocalCUDACluster(n_workers=1, threads_per_worker=1, dashboard_address=':8791',
#                               memory_limit="auto",
#                               device_memory_limit="auto", # memory spilling
#                               #rmm_pool_size="5GB",
#                               #rmm_managed_memory=True,
#                               #silence_logs=False,
#                               local_directory="/tmp/", 
#                               #enable_nvlink=True,
#                               ) # See https://docs.rapids.ai/api/dask-cuda/nightly/api.html
# client = Client(cluster)

# Render the client summary in the notebook output.
display(client)

#### Future: tensors

In [None]:
# Notebook-level settings for this experiment (these rebind `size`, `device` and `dtype`).
size=1000
device = torch.device('cpu')
dtype = torch.double

# Submit 100 independent `kernel` calls to the Dask client, each with fresh random inputs.
for _ in range(100):
    inp1 = torch.rand(size, requires_grad=True, dtype=dtype)
    inp2 = torch.rand(size, requires_grad=True, dtype=dtype)
    future = client.submit(kernel, inp1.to(device), inp2.to(device), device)
    # The future is handed to `fire_and_forget`; its result is not gathered in this cell.
    fire_and_forget(future)

#### Future: structures

Observations:
- Dask cannot directly parallelize a kernel that takes a generic input class such as Structure.
- As a solution, the kernel's inputs have to be translated into arrays or tensors.
- Also, defining a Kernel class that takes care of unnecessary inputs is very useful and makes for an elegant design.

In [None]:
import sys
sys.path.append('../')

import torchip as tp
from torchip import logger
from torchip.datasets import RunnerStructureDataset, ToStructure
from torchip.potentials import NeuralNetworkPotential

tp.device.DEVICE = "cpu"

import torch
import time
from pathlib import Path

In [None]:
# Directory with the example input files, relative to this notebook.
potdir = Path("../examples/LJ")

# Load the structure dataset from `input.data` and keep one structure for quick experiments.
# NOTE(review): `persist=True` presumably caches the loaded structures — confirm.
structures = RunnerStructureDataset(Path(potdir, "input.data"), persist=True) 
structure0 = structures[4]

# Build the potential from `input.nn` and pick the descriptor and scaler stored under key "Ne".
nnp = NeuralNetworkPotential(Path(potdir, "input.nn"))
descriptor = nnp.descriptor["Ne"]
scaler = nnp.scaler["Ne"]

In [None]:
# structure0.calculate_distance(aid=0, neighbors=1, detach=False, return_diff=True)

In [None]:
from torch import Tensor

class Box:
    """Simulation box described by a lattice tensor.

    Only the first three diagonal entries of the lattice are used, i.e. the
    box lengths along the first three coordinate axes.
    """

    def __init__(self, lattice):
        # Lattice tensor; indexed as lattice[i, i] for i in 0..2.
        self.lattice = lattice

    @staticmethod
    def _apply_pbc(dx, lat):
        """Wrap the first three components of `dx` into the box.

        A component larger than half the box length is shifted down by one
        box length; a component smaller than minus half the box length is
        shifted up by one box length.

        Args:
            dx: Tensor of differences; its last axis holds the components.
            lat: Lattice tensor whose diagonal gives the box lengths.

        Returns:
            A new tensor with the wrapped components. The input is left
            untouched.
        """
        # Work on a copy: writing into `dx` itself would silently modify the
        # caller's tensor and raises for leaf tensors that require grad.
        wrapped = dx.clone()
        for i in range(3):
            l = lat[i, i]
            component = wrapped[..., i]
            component = torch.where(component > 0.5 * l, component - l, component)
            wrapped[..., i] = torch.where(component < -0.5 * l, component + l, component)
        return wrapped

    def apply_pbc(self, dx):
        """Wrap `dx` using this box's lattice; see `_apply_pbc`."""
        return Box._apply_pbc(dx, self.lattice)
        

class Structure_:
    """Distance calculation expressed purely in terms of tensors."""

    @staticmethod
    def _calculate_distance(
            pos: Tensor,
            aid: int,
            lat: Tensor = None,
            detach: bool = False,
            neighbors = None,
            difference: bool = False
        ) -> Tensor: # TODO: also tuple?
        """
        Calculate the distances between atom `aid` and other atoms.

        Args:
            pos: Tensor of atomic positions, indexed by atom along the first axis.
            aid: Index of the reference atom.
            lat: Optional lattice tensor; when given, the difference vectors
                are wrapped with `Box._apply_pbc`.
            detach: If True, the neighbor positions are detached from the
                autograd graph (the reference position `pos[aid]` is not).
            neighbors: Optional index (int, sequence or tensor) selecting the
                atoms to measure against; None selects every atom.
            difference: If True, also return the difference vectors.

        Returns:
            The distance tensor, or the tuple ``(distance, dx)`` when
            `difference` is True.

        TODO: input pbc flag, using default pbc from global configuration
        TODO: also see torch.cdist
        """
        x = pos.detach() if detach else pos
        # Compare against None instead of relying on truthiness: index 0 is a
        # valid neighbor, and a multi-element tensor index has no truth value.
        x = x[neighbors] if neighbors is not None else x
        x = torch.unsqueeze(x, dim=0) if x.ndim == 1 else x  # for when neighbors index is only a number
        dx = pos[aid] - x  # FIXME: pos[aid] stays attached even when detach=True

        # Apply PBC along x, y, and z directions if lattice info is provided
        if lat is not None:
            dx = Box._apply_pbc(dx, lat)  # using broadcasting

        # Calculate distance from dx tensor
        distance = torch.linalg.vector_norm(dx, dim=1)

        return (distance, dx) if difference else distance


In [None]:
# from torchip.structure.box import Box

class Kernel:
    """Callable that bundles the helper functions used by one task.

    Args:
        func: Callable applied repeatedly to both tensor inputs.
        dist: Distance callable invoked as ``dist(x, aid=0, neighbors=1)``,
            or a falsy value to skip the distance step.
        pbc: Callable invoked as ``pbc(dx, lat)`` when a lattice is supplied.
        repeat: Number of times `func` is applied to each input
            (defaults to 10000).
    """

    def __init__(self, func, dist, pbc, repeat=10000):
        self.func = func
        self.dist = dist
        self.pbc = pbc
        self.repeat = repeat

    def __call__(self, x, at, dtype=None, device=None, emap=None, lat=None):
        """Run the synthetic workload on one set of inputs.

        Args:
            x: Tensor passed to `func` and to `dist`.
            at: Tensor passed to `func`; its first entry is used as the key
                for the `emap` lookup.
            dtype: Accepted for call compatibility; unused.
            device: Accepted for call compatibility; unused.
            emap: Optional mapping looked up with ``int(at[0])``.
            lat: Optional lattice tensor forwarded to `pbc`.

        Returns:
            None.
        """
        for _ in range(self.repeat):
            self.func(x)
            self.func(at)
        if emap:
            emap[int(at[0])]  # exercise the lookup; the value is not used
        dx = None
        if self.dist:
            dx = self.dist(x, aid=0, neighbors=1)
            print(dx)
        # `lat` is a tensor, so test it against None (its truth value is
        # ambiguous), and apply PBC only when a difference tensor exists.
        if lat is not None and dx is not None:
            self.pbc(dx, lat)
        # time.sleep(0.1)

# Bind the helper functions once; NOTE: this rebinds the notebook-level name `kernel`.
kernel = Kernel(torch.max, dist=Structure_._calculate_distance, pbc=Box._apply_pbc)
for structure in structures:
    # Tensor inputs are passed positionally, everything else as keyword arguments.
    tensors = [structure.position, structure.atype]
    params = {
        'dtype': torch.double, 
        'device': 'cpu', 
        'emap': structure.element_map.atype_to_element,
        'lat': structure.box.lattice if structure.box else None,
    }
    # client.scatter(tensors, broadcast=True)  
    # One task per structure; the future is not gathered in this cell.
    future = client.submit(kernel, *tensors, **params)
    fire_and_forget(future)

#### Delay

In [None]:
import torch
from dask import delayed

# @torch.jit.script
def fun(x: torch.Tensor) -> torch.Tensor:
    """Return the input tensor unchanged (minimal payload for `delayed`)."""
    return x

# Wrap `fun` with dask.delayed; the two variants differ only in the `pure` flag.
# fn = delayed(fun, pure=False)  # works
fn = delayed(fun, pure=True)  # causes error

In [None]:
# fn(torch.rand(size, requires_grad=True, dtype=dtype)).compute()

## Torch scalar

Observations:
- the torch scalar is slightly faster than the generic Python scalar in some cases.
- this difference is expected to grow with the number of scalar-tensor operations, particularly on the GPU

In [None]:
import torch

In [None]:
# Length of the input vector for the scalar benchmark.
size=10000
# NOTE: a CUDA device is requested unconditionally; there is no CPU fallback here.
device = torch.device('cuda')
dtype = torch.double

In [None]:
# Random input vector created directly on `device`.
x = torch.rand(size, dtype=dtype, device=device, requires_grad=True)
# The same scalar value 4 in two forms: a 0-dim tensor living on `device` ...
st = torch.tensor(4, dtype=dtype, device=device )
# ... and a plain Python int.
s  = 4

In [None]:
def kernel(x, s):
    """Repeat a scalar-times-tensor update 5000 times and return the result.

    Each step evaluates ``s*x`` once per `torch.where` branch; both branches
    hold the same expression, so the mask never changes the value.
    """
    for _step in range(5000):
        below_half = torch.sin(x) < 0.5
        x = torch.where(below_half, s*x, s*x)
    return x

In [None]:
def profile(kernel, *args, **kwargs):
    """Run ``kernel(*args, **kwargs)`` under the PyTorch profiler and print a summary.

    CPU and CUDA activities are both requested. The printed table contains
    every row (``row_limit=-1``), sorted by "self_cuda_time_total".
    """
    traced_activities = [
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ]
    with torch.profiler.profile(activities=traced_activities) as prof:
        kernel(*args, **kwargs)

    summary = prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1)
    print(summary)

In [None]:
# %%time
profile(kernel, x, st)

In [None]:
# %%time
# profile(kernel, x, s)

## Pin memory

A relevant link: https://spell.ml/blog/pytorch-training-tricks-YAnJqBEAACkARhgD

Observations:
- not much difference when the `pin_memory` flag is activated
- multi-processing has to be activated with at least 4 workers
- pinned memory should be taken into account when host-to-device memory copies are the computational bottleneck

In [None]:
import torchvision, torch, time
import numpy as np
 
# Toggle to compare pinned and pageable host memory in the DataLoader.
pin_memory = False

# NOTE(review): 4098 may be a typo for 4096 — confirm.
batch_size = 4098 # bigger memory transfers to make their cost more noticeable
n_workers = 4 # parallel workers to free up the main thread and reduce data decoding overhead

train_dataset = torchvision.datasets.CIFAR10(
    root='cifar10_pytorch',
    download=True,
    transform=torchvision.transforms.ToTensor()
)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    pin_memory=pin_memory,
    num_workers=n_workers
)
print('pin_memory:', pin_memory)
times = []
n_runs = 5

def work():
    """Emulate the CPU work done for each batch."""
    time.sleep(0.01)

for i in range(n_runs):
    # `epoch_start` rather than `st`, which names a tensor earlier in the notebook;
    # perf_counter is a monotonic clock meant for measuring intervals.
    epoch_start = time.perf_counter()
    for bx, by in train_dataloader:
        bx, by = bx.cuda(non_blocking=pin_memory), by.cuda(non_blocking=pin_memory)
        work()
    # With non_blocking=True the copies may still be in flight; wait for them
    # so that the transfer cost is included in the measured time.
    torch.cuda.synchronize()
    times.append(time.perf_counter() - epoch_start)
print('average time:', np.mean(times))