In [1]:
import time
import torch
torch.set_num_threads(1)
from clebsch_gordan import get_real_clebsch_gordan, ClebschGordan
from sparse_accumulation_plain_torch import sparse_accumulation_loops, sparse_accumulation_index_add
import sparse_accumulation, sparse_accumulation_active_dim_first,  sparse_accumulation_active_dim_middle
import numpy as np

In [2]:
L_MAX = 8
clebsch = ClebschGordan(L_MAX).precomputed_
indices = get_real_clebsch_gordan(clebsch[L_MAX, L_MAX, L_MAX], L_MAX, L_MAX, L_MAX)

In [3]:
m1_aligned, m2_aligned = [], []
multipliers, mu_aligned = [], []
for mu in range(0, 2 * L_MAX + 1):
    for el in indices[mu]:
        m1, m2, multiplier = el
        m1_aligned.append(m1)
        m2_aligned.append(m2)
        multipliers.append(multiplier)
        mu_aligned.append(mu)
m1_aligned = torch.LongTensor(m1_aligned)
m2_aligned = torch.LongTensor(m2_aligned)
mu_aligned = torch.LongTensor(mu_aligned)
multipliers = torch.FloatTensor(multipliers)

In [4]:
def get_input(BATCH_SIZE, N_FEATURES, active_dim, device):
    if active_dim == 0:
        X1 = torch.randn(2 * L_MAX + 1, BATCH_SIZE, N_FEATURES, device = device)
        X2 = torch.randn(2 * L_MAX + 1, BATCH_SIZE, N_FEATURES, device = device)
    
    if active_dim == 1:
        X1 = torch.randn(BATCH_SIZE, 2 * L_MAX + 1, N_FEATURES, device = device)
        X2 = torch.randn(BATCH_SIZE, 2 * L_MAX + 1, N_FEATURES, device = device)
        
    if active_dim == 2:
        X1 = torch.randn(BATCH_SIZE, N_FEATURES, 2 * L_MAX + 1, device = device)
        X2 = torch.randn(BATCH_SIZE, N_FEATURES, 2 * L_MAX + 1, device = device)   
   
        
    if (active_dim != 0) and (active_dim != 2) and (active_dim != 1):
        raise ValueError("active dim should be one of 0, 1, 2")
        
    return X1, X2

def benchmark_forward_cpu(BATCH_SIZE, N_FEATURES, active_dim, function, n_trials):
    X1, X2 = get_input(BATCH_SIZE, N_FEATURES, active_dim, 'cpu')
    times = []
            
    for _ in range(n_trials):
        begin = time.time() 
        output = function(X1, X2, mu_aligned, 2 * L_MAX + 1, m1_aligned, m2_aligned, multipliers)
        times.append(time.time() - begin)
    return times


def benchmark_forward_gpu(BATCH_SIZE, N_FEATURES, active_dim, function, n_trials):
    X1, X2 = get_input(BATCH_SIZE, N_FEATURES, active_dim, 'cuda')
    times = []
    torch.cuda.synchronize('cuda')
    starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(
        enable_timing=True)
    
    for _ in range(n_trials):
        starter.record()
        output = function(X1, X2, mu_aligned, 2 * L_MAX + 1, m1_aligned, m2_aligned, multipliers)
        ender.record()
        torch.cuda.synchronize('cuda')
        delta_time = starter.elapsed_time(ender)
        times.append(delta_time / 1000.0)        
    return times


def benchmark_backward_cpu(BATCH_SIZE, N_FEATURES, active_dim, function, n_trials):
    X1, X2 = get_input(BATCH_SIZE, N_FEATURES, active_dim, 'cpu')
        
    X1.requires_grad = True
    X2.requires_grad = True
    times = []
    for _ in range(n_trials):
        begin = time.time()
        output = function(X1, X2, mu_aligned, 2 * L_MAX + 1, m1_aligned, m2_aligned, multipliers)
        output.backward(gradient=torch.ones_like(output))
        times.append(time.time() - begin)
    return np.array(times)

def benchmark_backward_gpu(BATCH_SIZE, N_FEATURES, active_dim, function, n_trials):
    X1, X2 = get_input(BATCH_SIZE, N_FEATURES, active_dim, 'cuda')
        
    X1.requires_grad = True
    X2.requires_grad = True
    times = []
    
    torch.cuda.synchronize('cuda')
    starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(
        enable_timing=True)    
    
    for _ in range(n_trials):
        starter.record()
        output = function(X1, X2, mu_aligned, 2 * L_MAX + 1, m1_aligned, m2_aligned, multipliers)
        output.backward(gradient=torch.ones_like(output))
        ender.record()
        torch.cuda.synchronize('cuda')
        delta_time = starter.elapsed_time(ender)
        times.append(delta_time / 1000.0)  
    return np.array(times)

In [5]:
def get_func_fixed_dim(func, active_dim):
    def func_fixed_dim(*args):
        return func(*args, active_dim = active_dim)
    return func_fixed_dim

# cpu

In [6]:
BATCH_SIZE = 1000
N_FEATURES = 100

times = benchmark_forward_cpu(BATCH_SIZE, N_FEATURES, 0, 
                          get_func_fixed_dim(sparse_accumulation_loops, 0), 10)
print("python loops; active dim 0; forward: ", np.mean(times[1:]))
times = benchmark_forward_cpu(BATCH_SIZE, N_FEATURES, 0, 
                          get_func_fixed_dim(sparse_accumulation_index_add, 0), 10)
print("torch index_add_; active dim 0; forward: ", np.mean(times[1:]))
times = benchmark_forward_cpu(BATCH_SIZE, N_FEATURES, 0,
                          sparse_accumulation_active_dim_first.SparseAccumulationActiveDimFirst.apply, 10)
print("cpp; active dim 0; forward: ", np.mean(times[1:]))

print()
times = benchmark_forward_cpu(BATCH_SIZE, N_FEATURES, 1, 
                          get_func_fixed_dim(sparse_accumulation_loops, 1), 10)
print("python loops; active dim 1; forward: ", np.mean(times[1:]))
times = benchmark_forward_cpu(BATCH_SIZE, N_FEATURES, 1, 
                          get_func_fixed_dim(sparse_accumulation_index_add, 1), 10)
print("torch index_add_; active dim 1; forward: ", np.mean(times[1:]))
times = benchmark_forward_cpu(BATCH_SIZE, N_FEATURES, 1,
                          sparse_accumulation_active_dim_middle.SparseAccumulationActiveDimMiddle.apply, 10)
print("cpp; active dim 1; forward: ", np.mean(times[1:]))

print()
times = benchmark_forward_cpu(BATCH_SIZE, N_FEATURES, 2, get_func_fixed_dim(sparse_accumulation_loops, 2), 10)
print("python loops; active dim 2; forward: ", np.mean(times[1:]))
times = benchmark_forward_cpu(BATCH_SIZE, N_FEATURES, 2, get_func_fixed_dim(sparse_accumulation_index_add, 2), 10)
print("torch index_add_; active dim 2; forward: ", np.mean(times[1:]))
times = benchmark_forward_cpu(BATCH_SIZE, N_FEATURES, 2, sparse_accumulation.SparseAccumulation.apply, 10)
print("cpp; active dim 2; forward: ", np.mean(times[1:]))



python loops; active dim 0; forward:  0.018164555231730144
torch index_add_; active dim 0; forward:  0.1441103087531196
cpp; active dim 0; forward:  0.018260796864827473

python loops; active dim 1; forward:  0.02665122350056966
torch index_add_; active dim 1; forward:  0.16138855616251627
cpp; active dim 1; forward:  0.020249154832628038

python loops; active dim 2; forward:  0.11550572183397081
torch index_add_; active dim 2; forward:  0.3303320672776964
cpp; active dim 2; forward:  0.07571329010857476


In [7]:
times = benchmark_backward_cpu(BATCH_SIZE, N_FEATURES, 0, 
                           get_func_fixed_dim(sparse_accumulation_loops, 0), 10)
print("python loops; active dim 0; backward: ", np.mean(times[1:]))
times = benchmark_backward_cpu(BATCH_SIZE, N_FEATURES, 0, 
                           get_func_fixed_dim(sparse_accumulation_index_add, 0), 10)
print("torch index_add_; active dim 0; backward: ", np.mean(times[1:]))
times = benchmark_backward_cpu(BATCH_SIZE, N_FEATURES, 0,
                           sparse_accumulation_active_dim_first.SparseAccumulationActiveDimFirst.apply, 10)
print("cpp; active dim 0; backward: ", np.mean(times[1:]))

print()

times = benchmark_backward_cpu(BATCH_SIZE, N_FEATURES, 1, 
                           get_func_fixed_dim(sparse_accumulation_loops, 1), 10)
print("python loops; active dim 1; backward: ", np.mean(times[1:]))
times = benchmark_backward_cpu(BATCH_SIZE, N_FEATURES, 1, 
                           get_func_fixed_dim(sparse_accumulation_index_add, 1), 10)
print("torch index_add_; active dim 1; backward: ", np.mean(times[1:]))
times = benchmark_backward_cpu(BATCH_SIZE, N_FEATURES, 1,
                           sparse_accumulation_active_dim_middle.SparseAccumulationActiveDimMiddle.apply, 10)
print("cpp; active dim 1; backward: ", np.mean(times[1:]))


print()
times = benchmark_backward_cpu(BATCH_SIZE, N_FEATURES, 2, 
                           get_func_fixed_dim(sparse_accumulation_index_add, 2), 10)
print("python loops; active dim 2; backward: ", np.mean(times[1:]))
times = benchmark_backward_cpu(BATCH_SIZE, N_FEATURES, 2, 
                           get_func_fixed_dim(sparse_accumulation_index_add, 2), 10)
print("torch index_add_; active dim 2; backward: ", np.mean(times[1:]))
times = benchmark_backward_cpu(BATCH_SIZE, N_FEATURES, 2, sparse_accumulation.SparseAccumulation.apply, 10)
print("cpp; active dim 2; backward: ", np.mean(times[1:]))





python loops; active dim 0; backward:  1.151155710220337
torch index_add_; active dim 0; backward:  0.28337030940585667
cpp; active dim 0; backward:  0.04540822241041395

python loops; active dim 1; backward:  1.5681321620941162
torch index_add_; active dim 1; backward:  0.28336673312717015
cpp; active dim 1; backward:  0.04901160134209527

python loops; active dim 2; backward:  0.6322691175672743
torch index_add_; active dim 2; backward:  0.6292437977261014
cpp; active dim 2; backward:  0.12261679437425402


# CUDA

In [8]:
m1_aligned = m1_aligned.cuda() 
m2_aligned = m2_aligned.cuda()
mu_aligned = mu_aligned.cuda()
multipliers = multipliers.cuda()

In [9]:
BATCH_SIZE = 1000
N_FEATURES = 100

times = benchmark_forward_gpu(BATCH_SIZE, N_FEATURES, 0, 
                          get_func_fixed_dim(sparse_accumulation_loops, 0), 10)
print("python loops; active dim 0; forward; cuda: ", np.mean(times[1:]))
times = benchmark_forward_gpu(BATCH_SIZE, N_FEATURES, 0, 
                          get_func_fixed_dim(sparse_accumulation_index_add, 0), 10)
print("torch index_add_; active dim 0; forward; cuda: ", np.mean(times[1:]))
'''times = benchmark_forward(BATCH_SIZE, N_FEATURES, 0,
                          sparse_accumulation_active_dim_first.SparseAccumulationActiveDimFirst.apply, 10)
print("cpp; active dim 0; forward: ", np.mean(times[1:]))'''

print()
times = benchmark_forward_gpu(BATCH_SIZE, N_FEATURES, 1, 
                          get_func_fixed_dim(sparse_accumulation_loops, 1), 10)
print("python loops; active dim 1; forward; cuda: ", np.mean(times[1:]))
times = benchmark_forward_gpu(BATCH_SIZE, N_FEATURES, 1, 
                          get_func_fixed_dim(sparse_accumulation_index_add, 1), 10)
print("torch index_add_; active dim 1; forward; cuda: ", np.mean(times[1:]))
'''times = benchmark_forward(BATCH_SIZE, N_FEATURES, 1,
                          sparse_accumulation_active_dim_middle.SparseAccumulationActiveDimMiddle.apply, 10)
print("cpp; active dim 1; forward: ", np.mean(times[1:]))'''

print()
times = benchmark_forward_gpu(BATCH_SIZE, N_FEATURES, 2, 
                          get_func_fixed_dim(sparse_accumulation_loops, 2), 10)
print("python loops; active dim 2; forward; cuda: ", np.mean(times[1:]))
times = benchmark_forward_gpu(BATCH_SIZE, N_FEATURES, 2,
                          get_func_fixed_dim(sparse_accumulation_index_add, 2), 10)
print("torch index_add_; active dim 2; forward; cuda: ", np.mean(times[1:]))
'''times = benchmark_forward(BATCH_SIZE, N_FEATURES, 2, sparse_accumulation.SparseAccumulation.apply, 10)
print("cpp; active dim 2; forward: ", np.mean(times[1:]))'''



python loops; active dim 0; forward; cuda:  0.01946451907687717
torch index_add_; active dim 0; forward; cuda:  0.0023149653010898167

python loops; active dim 1; forward; cuda:  0.02190663104587131
torch index_add_; active dim 1; forward; cuda:  0.0021864213148752847

python loops; active dim 2; forward; cuda:  0.03215370983547635
torch index_add_; active dim 2; forward; cuda:  0.002321347581015693


'times = benchmark_forward(BATCH_SIZE, N_FEATURES, 2, sparse_accumulation.SparseAccumulation.apply, 10)\nprint("cpp; active dim 2; forward: ", np.mean(times[1:]))'

In [10]:
times = benchmark_backward_gpu(BATCH_SIZE, N_FEATURES, 0, 
                           get_func_fixed_dim(sparse_accumulation_loops, 0), 10)
print("python loops; active dim 0; backward; cuda: ", np.mean(times[1:]))
times = benchmark_backward_gpu(BATCH_SIZE, N_FEATURES, 0, 
                           get_func_fixed_dim(sparse_accumulation_index_add, 0), 10)
print("torch index_add_; active dim 0; backward; cuda: ", np.mean(times[1:]))
'''times = benchmark_backward(BATCH_SIZE, N_FEATURES, 0,
                           sparse_accumulation_active_dim_first.SparseAccumulationActiveDimFirst.apply, 10)
print("cpp; active dim 0; backward: ", np.mean(times[1:]))'''

print()

times = benchmark_backward_gpu(BATCH_SIZE, N_FEATURES, 1, 
                           get_func_fixed_dim(sparse_accumulation_loops, 1), 10)
print("python loops; active dim 1; backward; cuda: ", np.mean(times[1:]))
times = benchmark_backward_gpu(BATCH_SIZE, N_FEATURES, 1, 
                           get_func_fixed_dim(sparse_accumulation_index_add, 1), 10)
print("torch index_add_; active dim 1; backward; cuda: ", np.mean(times[1:]))
'''times = benchmark_backward(BATCH_SIZE, N_FEATURES, 1,
                           sparse_accumulation_active_dim_middle.SparseAccumulationActiveDimMiddle.apply, 10)
print("cpp; active dim 1; backward: ", np.mean(times[1:]))'''


print()
times = benchmark_backward_gpu(BATCH_SIZE, N_FEATURES, 2, 
                           get_func_fixed_dim(sparse_accumulation_index_add, 2), 10)
print("python loops; active dim 2; backward; cuda: ", np.mean(times[1:]))
times = benchmark_backward_gpu(BATCH_SIZE, N_FEATURES, 2, 
                           get_func_fixed_dim(sparse_accumulation_index_add, 2), 10)
print("torch index_add_; active dim 2; backward; cuda: ", np.mean(times[1:]))
'''times = benchmark_backward(BATCH_SIZE, N_FEATURES, 2, sparse_accumulation.SparseAccumulation.apply, 10)
print("cpp; active dim 2; backward: ", np.mean(times[1:]))'''





python loops; active dim 0; backward; cuda:  0.10589237891303167
torch index_add_; active dim 0; backward; cuda:  0.0051896283891465925

python loops; active dim 1; backward; cuda:  0.13309243265787762
torch index_add_; active dim 1; backward; cuda:  0.0052041279474894205

python loops; active dim 2; backward; cuda:  0.040250610351562494
torch index_add_; active dim 2; backward; cuda:  0.04206680509779189


'times = benchmark_backward(BATCH_SIZE, N_FEATURES, 2, sparse_accumulation.SparseAccumulation.apply, 10)\nprint("cpp; active dim 2; backward: ", np.mean(times[1:]))'