In [6]:
import torch, os, sys
from torch import nn

from drct.archs.DRCT_arch import DRCT  # ✅ 여기만 바뀜

# Instantiate DRCT with every constructor argument spelled out explicitly.
# The grouping below follows the argument names only; see DRCT_arch for the
# exact meaning of each option.
model = DRCT(
    # image / patch arguments
    img_size=64,
    patch_size=1,
    in_chans=3,
    # transformer arguments (depths and num_heads are both length-6 tuples)
    embed_dim=180,
    depths=(6,) * 6,
    num_heads=(6,) * 6,
    window_size=16,
    mlp_ratio=2.0,
    qkv_bias=True,
    qk_scale=None,
    norm_layer=nn.LayerNorm,
    ape=False,
    patch_norm=True,
    # remaining architecture arguments
    compress_ratio=3,
    squeeze_factor=30,
    conv_scale=0.01,
    overlap_ratio=0.5,
    resi_connection="1conv",
    gc=32,
    # drop_* rates and checkpointing flag
    drop_rate=0.0,
    attn_drop_rate=0.0,
    drop_path_rate=0.1,
    use_checkpoint=False,
    # upsampling / output arguments
    upscale=4,
    img_range=1.0,
    upsampler="pixelshuffle",
)

# Only reached when the constructor above returned without raising.
print("model ok")


>>> patch_size = 1 img_size = 64 patches_resolution = [64, 64]
model ok


In [7]:
from drct.archs.DRCT_arch import RDG
# Stand-alone RDG instance that the profiling cells below operate on.
# NOTE(review): dim / window_size / mlp_ratio / gc / patch_size / img_size use
# the same values as the DRCT constructor call earlier in this notebook.
rdg = RDG(
    # size-related arguments
    dim=180,
    input_resolution=(64, 64),
    patch_size=1,
    img_size=64,
    # attention-related arguments
    depth=0,
    num_heads=6,
    window_size=16,
    shift_size=8,
    qkv_bias=True,
    qk_scale=None,
    # MLP / normalisation / gc
    mlp_ratio=2.0,
    norm_layer=torch.nn.LayerNorm,
    gc=32,
    # all drop rates are zero for this instance
    drop=0.0,
    attn_drop=0.0,
    drop_path=0.0,
)


In [12]:
import torch
from torch.profiler import profile, record_function, ProfilerActivity

def profiler_rdg_ops(rdg, B=2, H=64, W=64, C=180, iters=12, warmup=2):
    """Profile training steps of ``rdg`` on CUDA and print the per-op table.

    The module is moved to the GPU and put in train mode, then driven with a
    random ``(B, H * W, C)`` input and the ``(H, W)`` size tuple. Each step
    runs forward, ``out.mean().backward()`` and an AdamW update (lr=1e-4), so
    the parameters of ``rdg`` are modified in place.

    Args:
        rdg: Module called as ``rdg(x, (H, W))``.
        B: Batch size of the random input.
        H: First entry of the size tuple passed to ``rdg``.
        W: Second entry of the size tuple passed to ``rdg``.
        C: Last dimension of the random input.
        iters: Number of profiled training steps.
        warmup: Number of un-profiled steps executed first.

    Returns:
        None. The aggregated profiler table (sorted by ``cuda_time_total``,
        at most 200 rows) is printed to stdout.
    """
    rdg = rdg.cuda().train()
    x = torch.randn(B, H * W, C, device="cuda")
    xsize = (H, W)
    opt = torch.optim.AdamW(rdg.parameters(), lr=1e-4)

    # Warm-up: the first few steps can be distorted by one-time kernel
    # preparation / caching, so run them outside the profiler.
    for _ in range(warmup):
        opt.zero_grad(set_to_none=True)
        out = rdg(x, xsize)
        loss = out.mean()
        loss.backward()
        opt.step()
    # Make sure the warm-up work has finished before profiling starts.
    torch.cuda.synchronize()

    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        record_shapes=True,
        profile_memory=True,
        with_stack=False,
    ) as prof:
        for i in range(iters):
            opt.zero_grad(set_to_none=True)
            # Every iteration gets its own label, so each one appears as a
            # separate row in the aggregated table.
            with record_function(f"RDG_FORWARD/iter_{i+1}"):
                out = rdg(x, xsize)
            with record_function(f"RDG_BACKWARD/iter_{i+1}"):
                loss = out.mean()
                loss.backward()
            opt.step()
        # Wait for all queued GPU work while the profiler is still recording.
        torch.cuda.synchronize()

    # key_averages() groups events by name, so the totals of ordinary ops are
    # accumulated over all `iters` profiled steps; divide by `iters` for a
    # per-step figure.
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=200))


In [13]:
profiler_rdg_ops(rdg, B=2, H=64, W=64, C=180)


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
    autograd::engine::evaluate_function: AddmmBackward0         0.51%       3.179ms         5.36%      33.500ms     139.583us       0.000us         0.00%     103.783ms     432.429us           0 b           0 b      -2.37 Gb      -4.53 G

In [14]:
import torch
from torch.profiler import profile, record_function, ProfilerActivity

def profiler_rdg_ops(rdg, B=2, H=64, W=64, C=180, warmup=2, iters=5):
    """Profile training steps of ``rdg`` on CUDA and print only tagged ranges.

    The module is moved to the GPU and put in train mode, then driven with a
    random ``(B, H * W, C)`` input and the ``(H, W)`` size tuple. Each step
    runs forward, ``out.mean().backward()`` and an AdamW update (lr=1e-4), so
    the parameters of ``rdg`` are modified in place.

    Only aggregated events whose key contains ``"RDG/"``, ``"RDG_FORWARD"`` or
    ``"RDG_BACKWARD"`` are reported, sorted by total device time (descending,
    at most 50 rows). Totals are summed over all ``iters`` profiled steps.

    Args:
        rdg: Module called as ``rdg(x, (H, W))``.
        B: Batch size of the random input.
        H: First entry of the size tuple passed to ``rdg``.
        W: Second entry of the size tuple passed to ``rdg``.
        C: Last dimension of the random input.
        warmup: Number of un-profiled steps executed first.
        iters: Number of profiled training steps.

    Returns:
        None. The report is printed to stdout.
    """

    def _device_time_us(event):
        # Newer PyTorch releases expose `device_time_total` and deprecate
        # `cuda_time_total`; older releases only have the latter.
        total = getattr(event, "device_time_total", None)
        return event.cuda_time_total if total is None else total

    rdg = rdg.cuda().train()
    x = torch.randn(B, H * W, C, device="cuda")
    xsize = (H, W)
    opt = torch.optim.AdamW(rdg.parameters(), lr=1e-4)

    # Warm-up: keeps one-time kernel caching / tuning out of the measurement.
    for _ in range(warmup):
        opt.zero_grad(set_to_none=True)
        out = rdg(x, xsize)
        out.mean().backward()
        opt.step()
    torch.cuda.synchronize()

    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        record_shapes=True,
        profile_memory=False,
        with_stack=False,
    ) as prof:
        for _ in range(iters):
            opt.zero_grad(set_to_none=True)
            with record_function("RDG_FORWARD"):
                out = rdg(x, xsize)
            with record_function("RDG_BACKWARD"):
                loss = out.mean()
                loss.backward()
            opt.step()
        # Wait for all queued GPU work while the profiler is still recording.
        torch.cuda.synchronize()

    # For the complete per-op view, print
    # prof.key_averages().table(sort_by="cuda_time_total", row_limit=200).

    # Keep only the user-labelled ranges.
    markers = ("RDG/", "RDG_FORWARD", "RDG_BACKWARD")
    tagged = [e for e in prof.key_averages() if any(m in e.key for m in markers)]
    tagged.sort(key=_device_time_us, reverse=True)

    # NOTE(review): in this notebook's recorded output RDG_BACKWARD shows
    # almost no CUDA time next to a large CPU time. Presumably the backward
    # kernels are launched from the autograd engine's own thread and are not
    # attributed to this range - confirm before treating cuda_total of
    # RDG_BACKWARD as the GPU cost of the backward pass.
    print("\n===== TAGGED EVENTS (RDG/*) sorted by cuda_time_total =====")
    for e in tagged[:50]:
        print(f"{e.key:25s} | cuda_total={_device_time_us(e)/1000:.3f} ms | cpu_total={e.cpu_time_total/1000:.3f} ms | calls={e.count}")


In [15]:
profiler_rdg_ops(rdg, B=2, H=64, W=64, C=180)



===== TAGGED EVENTS (RDG/*) sorted by cuda_time_total =====
RDG_FORWARD               | cuda_total=59.142 ms | cpu_total=60.054 ms | calls=5
RDG_BACKWARD              | cuda_total=0.097 ms | cpu_total=88.134 ms | calls=5
