In [1]:
import torch
import torch.nn as nn

In [2]:
# Problem size shared by the MoE variants constructed in the following cells.
batch_size = 10
input_dim = 1024
output_dim = 1024
num_experts = 16

# Seed torch's global generators so the random input below (and anything else
# drawn from them afterwards) is repeatable across Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

# Allocate the input directly on the target device instead of building it on
# the host and copying it over. Falls back to CPU so this cell still runs on a
# machine without a GPU; the later cells call .cuda() and still require CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x = torch.randn(batch_size, input_dim, device=device)

In [3]:
# Variant 1 (module `moe_v1_basic`). The profiling cell further down refers to
# this instance as `dense_moe`.
from moe_v1_basic import MoE

moe1 = MoE(input_dim, output_dim, num_experts)
moe1 = moe1.cuda()

# One forward pass over the shared input; show the result shape and values.
output = moe1(x)
(output.shape, output)

(torch.Size([10, 1024]),
 tensor([[-0.2058,  0.0986,  0.1500,  ...,  0.0827, -0.3624,  0.2859],
         [-0.1283, -0.0538,  0.1200,  ...,  0.0201, -0.3653, -0.0725],
         [ 0.3320, -0.1554,  0.0751,  ...,  0.2465, -0.0369, -0.1451],
         ...,
         [ 0.2430,  0.3744,  0.1355,  ..., -0.0760,  0.1077,  0.0744],
         [-0.2925, -0.0213,  0.1528,  ..., -0.0856, -0.0333,  0.0052],
         [ 0.0328, -0.1962, -0.1069,  ...,  0.0845,  0.0808,  0.0367]],
        device='cuda:0', grad_fn=<SumBackward1>))

In [4]:
# Variant 2 (module `moe_v2_toSparse`). Note that this import rebinds the name
# `MoE` that the previous cell imported from a different module.
from moe_v2_toSparse import MoE

moe2 = MoE(input_dim, output_dim, num_experts)
moe2 = moe2.cuda()

# One forward pass over the shared input; show the result shape and values.
output = moe2(x)
(output.shape, output)

(torch.Size([10, 1024]),
 tensor([[ 0.4440,  0.2380,  0.1422,  ...,  0.0591, -0.1180,  0.1894],
         [ 1.4021,  0.1073, -0.6160,  ..., -0.4645, -0.0168,  0.0389],
         [ 0.2732, -0.0975, -0.3064,  ...,  1.7107,  0.7109,  0.4052],
         ...,
         [-0.5975,  0.5013, -0.0268,  ..., -1.1373,  0.1182, -0.0534],
         [-0.0066,  0.2181,  0.2037,  ..., -0.4815,  0.6203, -0.1041],
         [-0.2771, -0.6388,  0.1856,  ...,  0.9925, -0.0381,  0.1130]],
        device='cuda:0', grad_fn=<StackBackward0>))

In [7]:
# Variant 3 (module `moe_v2_toSparse2`), which takes an extra `topk` argument.
# The profiling cell further down refers to this instance as `sparse_moe`.
from moe_v2_toSparse2 import MoE

topk = 2

moe3 = MoE(input_dim, output_dim, num_experts, topk)
moe3 = moe3.cuda()

# One forward pass over the shared input; show the result shape and values.
output = moe3(x)
(output.shape, output)

(torch.Size([10, 1024]),
 tensor([[-0.0875, -0.0932, -0.0056,  ...,  0.1254, -0.0085,  0.2034],
         [-0.1713,  0.0212,  0.0275,  ...,  0.0050, -0.1049,  0.1096],
         [ 0.0215,  0.0418,  0.0030,  ..., -0.0907,  0.0037,  0.0565],
         ...,
         [ 0.0845, -0.1321,  0.0945,  ...,  0.1759,  0.1503, -0.0669],
         [ 0.0948,  0.0117, -0.1164,  ...,  0.1025, -0.0624, -0.1522],
         [-0.0511,  0.1141, -0.1445,  ...,  0.0728, -0.0342,  0.0680]],
        device='cuda:0', grad_fn=<SumBackward1>))

In [9]:
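# %%timeit
# Disabled after the timing shown below was recorded; uncomment the magic and
# the call to re-measure. CUDA kernels launch asynchronously, so consider
# adding torch.cuda.synchronize() after the call when timing GPU work.
# moe1(x)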

170 μs ± 2.87 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [10]:
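# %%timeit
# Disabled after the timing shown below was recorded; uncomment the magic and
# the call to re-measure. CUDA kernels launch asynchronously, so consider
# adding torch.cuda.synchronize() after the call when timing GPU work.
# moe3(x)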

234 μs ± 612 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [7]:
from torch.profiler import profile, record_function, ProfilerActivity

def profile_model(model, input_tensor, profiler, label):
    """Run a single forward call of ``model`` inside a labelled profiler range.

    Args:
        model: Callable invoked as ``model(input_tensor)``.
        input_tensor: Argument forwarded unchanged to ``model``.
        profiler: Not used by this function; present in the signature only.
        label: Name given to the ``record_function`` range that wraps the call.

    Returns:
        Whatever ``model(input_tensor)`` returns.
    """
    labelled_range = record_function(label)
    with labelled_range:
        return model(input_tensor)

In [8]:
dense_moe = moe1.cuda()
sparse_moe = moe3.cuda()
x = x.cuda()

# Profiler schedule: skip `wait_steps`, discard `warmup_steps`, then record
# `active_steps`. The loop length is derived from these values so the schedule
# and the loop cannot drift apart.
wait_steps, warmup_steps, active_steps = 1, 1, 3

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=torch.profiler.schedule(
        wait=wait_steps,
        warmup=warmup_steps,
        active=active_steps
    ),
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./log/profiler"),
    record_shapes=True,
    profile_memory=True,
    with_stack=True
) as prof:
    for step in range(wait_steps + warmup_steps + active_steps):
        # Run the workload on every step, including the wait and warmup steps.
        # The schedule decides which steps are recorded; executing the models
        # during the unrecorded steps is what actually warms things up, so
        # one-off start-up costs do not land in the first recorded step.
        profile_model(dense_moe, x, prof, "DenseMoE Forward")
        profile_model(sparse_moe, x, prof, "SparseMoE Forward")
        # Tell the profiler that one step has finished.
        prof.step()

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=50))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                      SparseMoE Forward         0.00%       0.000us         0.00%       0.000us       0.000us       1.449ms        42.22%       1.449ms     482.839us           0 b           0 b           0 b           0 

In [1]:
!tensorboard --logdir=./log/profiler/

TensorFlow installation not found - running with reduced feature set.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.18.0 at http://localhost:6006/ (Press CTRL+C to quit)
^C
