In [34]:
import torch
import fvcore
import basicsr
import sys

print("python:", sys.version)
print("torch:", torch.__version__)
print("fvcore:", fvcore.__version__)
print("basicsr:", basicsr.__version__ if hasattr(basicsr, "__version__") else "unknown")
print("cuda available:", torch.cuda.is_available())
print("cuda:", torch.version.cuda)
print("device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu")


python: 3.8.20 (default, Oct  3 2024, 15:24:27) 
[GCC 11.2.0]
torch: 1.12.1
fvcore: 0.1.5.post20221221
basicsr: 1.3.4.9
cuda available: True
cuda: 11.3
device: NVIDIA GeForce RTX 2080 Ti


In [1]:
import torch
from torch import nn

from drct.archs.DrctPlusCA_arch import DRCTPLUSCA   # ✅ 여기!


model = DRCTPLUSCA(
    img_size=64,
    patch_size=1,
    in_chans=3,
    embed_dim=180,
    depths=(6, 6, 6, 6,6,6),
    num_heads=(6, 6, 6, 6,6,6),
    window_size=16,
    compress_ratio=3,
    squeeze_factor=30,
    conv_scale=0.01,
    overlap_ratio=0.5,
    mlp_ratio=2.,
    qkv_bias=True,
    qk_scale=None,
    drop_rate=0.,
    attn_drop_rate=0.,
    drop_path_rate=0.1,
    norm_layer=nn.LayerNorm,
    ape=False,
    patch_norm=True,
    use_checkpoint=False,
    upscale=4,
    img_range=1.,
    upsampler='pixelshuffle',
    resi_connection='1conv',
    gc=32
)


  from .autonotebook import tqdm as notebook_tqdm
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


>>> patch_size = 1 img_size = 64 patches_resolution = [64, 64]


In [2]:
from fvcore.nn import FlopCountAnalysis, parameter_count_table

# ✅ 모델 입력(더미 입력) 크기 정의
input_tensor = torch.randn(1, 3, 64, 64)

# ✅ FLOPs(부동소수점 연산량) 계산
# flop_count = FlopCountAnalysis(model, input_tensor)
# flops = flop_count.total()

# ✅ 파라미터 수 계산
# - parameter_count_table(model): 레이어별 파라미터 수를 보기 좋게 표 형태로 출력
# - params: 전체 파라미터 수(직접 합산)
params_table = parameter_count_table(model)
params = sum(p.numel() for p in model.parameters())

# ✅ Multi-Adds(MACs) 및 forward 횟수 설정
# - 관례적으로 1 MAC ≈ 2 FLOPs(곱 1 + 덧셈 1)로 보는 경우가 많아서 FLOPs/2로 계산
# multi_adds = flops / 2
forward_passes = 1  # 더미 입력으로 forward를 몇 번 계산했는지(여기서는 1번)

# ✅ 결과 출력
print(params_table)
print(f"Params: {params}")
# print(f"FLOPs: {flops}")
# print(f"Multi-Adds: {multi_adds}")
print(f"Forward Passes: {forward_passes}")


| name                             | #elements or shape   |
|:---------------------------------|:---------------------|
| model                            | 25.3M                |
|  conv_first                      |  5.0K                |
|   conv_first.weight              |   (180, 3, 3, 3)     |
|   conv_first.bias                |   (180,)             |
|  patch_embed                     |  0.4K                |
|   patch_embed.norm               |   0.4K               |
|    patch_embed.norm.weight       |    (180,)            |
|    patch_embed.norm.bias         |    (180,)            |
|  layers                          |  24.6M               |
|   layers.0                       |   4.1M               |
|    layers.0.swin1                |    0.5M              |
|    layers.0.adjust1              |    5.8K              |
|    layers.0.swin2                |    0.6M              |
|    layers.0.adjust2              |    6.8K              |
|    layers.0.swin3                |    

In [5]:
import torch, os, sys
from torch import nn

from drct.archs.DRCT_arch import DRCT  # ✅ 여기만 바뀜

model = DRCT(
    img_size=64,
    patch_size=1,
    in_chans=3,
    embed_dim=180,
    depths=(6, 6, 6, 6,6,6),
    num_heads=(6, 6, 6, 6,6,6),
    window_size=16,
    compress_ratio=3,
    squeeze_factor=30,
    conv_scale=0.01,
    overlap_ratio=0.5,
    mlp_ratio=2.,
    qkv_bias=True,
    qk_scale=None,
    drop_rate=0.,
    attn_drop_rate=0.,
    drop_path_rate=0.1,
    norm_layer=nn.LayerNorm,
    ape=False,
    patch_norm=True,
    use_checkpoint=False,
    upscale=4,
    img_range=1.,
    upsampler='pixelshuffle',
    resi_connection='1conv',
    gc=32
)

print("model ok")


>>> patch_size = 1 img_size = 64 patches_resolution = [64, 64]
model ok


In [6]:
from fvcore.nn import FlopCountAnalysis, parameter_count_table

# ✅ 모델 입력(더미 입력) 크기 정의
input_tensor = torch.randn(1, 3, 64, 64)

# ✅ FLOPs(부동소수점 연산량) 계산
# flop_count = FlopCountAnalysis(model, input_tensor)
# flops = flop_count.total()

# ✅ 파라미터 수 계산
# - parameter_count_table(model): 레이어별 파라미터 수를 보기 좋게 표 형태로 출력
# - params: 전체 파라미터 수(직접 합산)
params_table = parameter_count_table(model)
params = sum(p.numel() for p in model.parameters())

# ✅ Multi-Adds(MACs) 및 forward 횟수 설정
# - 관례적으로 1 MAC ≈ 2 FLOPs(곱 1 + 덧셈 1)로 보는 경우가 많아서 FLOPs/2로 계산
# multi_adds = flops / 2
forward_passes = 1  # 더미 입력으로 forward를 몇 번 계산했는지(여기서는 1번)

# ✅ 결과 출력
print(params_table)
print(f"Params: {params}")
# print(f"FLOPs: {flops}")
# print(f"Multi-Adds: {multi_adds}")
print(f"Forward Passes: {forward_passes}")


| name                             | #elements or shape   |
|:---------------------------------|:---------------------|
| model                            | 14.1M                |
|  conv_first                      |  5.0K                |
|   conv_first.weight              |   (180, 3, 3, 3)     |
|   conv_first.bias                |   (180,)             |
|  patch_embed                     |  0.4K                |
|   patch_embed.norm               |   0.4K               |
|    patch_embed.norm.weight       |    (180,)            |
|    patch_embed.norm.bias         |    (180,)            |
|  layers                          |  13.4M               |
|   layers.0                       |   2.2M               |
|    layers.0.swin1                |    0.3M              |
|    layers.0.adjust1              |    5.8K              |
|    layers.0.swin2                |    0.4M              |
|    layers.0.adjust2              |    6.8K              |
|    layers.0.swin3                |    

In [45]:
import copy
import torch
from fvcore.nn import FlopCountAnalysis, parameter_count_table

# ✅ (추천) FLOPs는 CPU에서: trace OOM/이상동작 줄어듦
model_eval = copy.deepcopy(model).cpu().eval()
input_tensor = torch.randn(1, 3, 64, 64)  # CPU tensor

with torch.no_grad():
    flop_count = FlopCountAnalysis(model_eval, input_tensor)

    # ✅ 먼저 total()을 호출해야 분석 결과가 생성됨
    flops = flop_count.total()

    # ✅ 이제부터 unsupported/uncalled 조회 가능
    print("unsupported_ops:", flop_count.unsupported_ops())
    print("uncalled_modules:", flop_count.uncalled_modules())
    print("FLOPs total:", flops)

# ✅ 파라미터 수 (CPU 모델 기준)
params_table = parameter_count_table(model_eval)
params = sum(p.numel() for p in model_eval.parameters())

# ✅ Multi-Adds(MACs)
multi_adds = flops / 2
forward_passes = 1

# ✅ 결과 출력
print(params_table)
print(f"Params: {params}")
print(f"FLOPs: {flops}")
print(f"Multi-Adds: {multi_adds}")
print(f"Forward Passes: {forward_passes}")


>>> x_size: (tensor(64), tensor(64)) token L: tensor(4096) expected(H*W): tensor(4096)


RuntimeError: Given normalized_shape=[180], expected input with shape [*, 180], but got input of size[1, 180, 4096]

In [49]:
from fvcore.nn import FlopCountAnalysis, parameter_count_table

# ✅ 모델 입력(더미 입력) 크기 정의
input_tensor = torch.randn(1, 3, 64, 64)

# ✅ FLOPs(부동소수점 연산량) 계산
# flop_count = FlopCountAnalysis(model, input_tensor)
# flops = flop_count.total()

# ✅ 파라미터 수 계산
# - parameter_count_table(model): 레이어별 파라미터 수를 보기 좋게 표 형태로 출력
# - params: 전체 파라미터 수(직접 합산)
params_table = parameter_count_table(model)
params = sum(p.numel() for p in model.parameters())

# ✅ Multi-Adds(MACs) 및 forward 횟수 설정
# - 관례적으로 1 MAC ≈ 2 FLOPs(곱 1 + 덧셈 1)로 보는 경우가 많아서 FLOPs/2로 계산
# multi_adds = flops / 2
forward_passes = 1  # 더미 입력으로 forward를 몇 번 계산했는지(여기서는 1번)

# ✅ 결과 출력
print(params_table)
print(f"Params: {params}")
# print(f"FLOPs: {flops}")
# print(f"Multi-Adds: {multi_adds}")
print(f"Forward Passes: {forward_passes}")


| name                             | #elements or shape   |
|:---------------------------------|:---------------------|
| model                            | 4.4M                 |
|  conv_first                      |  2.7K                |
|   conv_first.weight              |   (96, 3, 3, 3)      |
|   conv_first.bias                |   (96,)              |
|  patch_embed                     |  0.2K                |
|   patch_embed.norm               |   0.2K               |
|    patch_embed.norm.weight       |    (96,)             |
|    patch_embed.norm.bias         |    (96,)             |
|  layers                          |  3.9M                |
|   layers.0                       |   1.0M               |
|    layers.0.swin1                |    76.1K             |
|    layers.0.adjust1              |    3.1K              |
|    layers.0.swin2                |    0.1M              |
|    layers.0.adjust2              |    4.1K              |
|    layers.0.swin3                |    

In [47]:
import torch
import pandas as pd

def layer_param_breakdown(model, topk=50):
    rows = []
    for name, p in model.named_parameters():
        rows.append({
            "param_name": name,
            "module": name.rsplit(".", 1)[0] if "." in name else name,
            "shape": list(p.shape),
            "num_params": p.numel(),
            "trainable": p.requires_grad
        })
    df = pd.DataFrame(rows)

    # module 단위로 합치기
    by_module = (df.groupby("module", as_index=False)["num_params"]
                   .sum()
                   .sort_values("num_params", ascending=False))
    total = by_module["num_params"].sum()
    by_module["percent"] = (by_module["num_params"] / total * 100).round(2)

    print(f"Total params: {total:,}")
    print(by_module.head(topk).to_string(index=False))
    return by_module, df

by_module, by_param = layer_param_breakdown(model, topk=80)

answer = input("파일명 suffix를 입력 (예: _v1, _small) : ").strip()
suffix = answer if answer else ""
by_module.to_csv(f"param_by_module{suffix}.csv", index=False, encoding="utf-8-sig")
by_param.to_csv(f"param_by_param{suffix}.csv", index=False, encoding="utf-8-sig")

Total params: 14,139,579
                  module  num_params  percent
         conv_after_body      291780     2.06
 layers.3.swin5.attn.qkv      285516     2.02
 layers.0.swin5.attn.qkv      285516     2.02
 layers.5.swin5.attn.qkv      285516     2.02
 layers.1.swin5.attn.qkv      285516     2.02
 layers.4.swin5.attn.qkv      285516     2.02
 layers.2.swin5.attn.qkv      285516     2.02
 layers.1.swin4.attn.qkv      229356     1.62
 layers.2.swin4.attn.qkv      229356     1.62
 layers.3.swin4.attn.qkv      229356     1.62
 layers.4.swin4.attn.qkv      229356     1.62
 layers.5.swin4.attn.qkv      229356     1.62
 layers.0.swin4.attn.qkv      229356     1.62
 layers.0.swin3.attn.qkv      179340     1.27
 layers.3.swin3.attn.qkv      179340     1.27
 layers.4.swin3.attn.qkv      179340     1.27
 layers.2.swin3.attn.qkv      179340     1.27
 layers.5.swin3.attn.qkv      179340     1.27
 layers.1.swin3.attn.qkv      179340     1.27
              upsample.0      147712     1.04
         

In [4]:
import torch

print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
print("CUDA device name:", torch.cuda.get_device_name(0))


CUDA available: True
CUDA device count: 5
CUDA device name: NVIDIA A100-SXM4-40GB


In [4]:
import torch
import torchvision.models as models
from fvcore.nn import FlopCountAnalysis, parameter_count_table
import os 
os.environ['CUDA_VISIBLE_DEVICES']='4'
# 加载模型
# 定义输入大小
input_tensor = torch.randn(1, 3, 64, 64).cuda()  # 将输入张量移动到 GPU

# 将模型移动到 GPU
model = model.cuda()

# 计算 FLOPs
flop_count = FlopCountAnalysis(model, input_tensor)
flops = flop_count.total()

# 计算参数量
params_table = parameter_count_table(model)
params = sum(p.numel() for p in model.parameters())

# 计算 Multi-Adds 和前向传播次数
multi_adds = flops / 2
forward_passes = 1

# 输出结果
print(params_table)
print(f"Params: {params}")
print(f"FLOPs: {flops}")
print(f"Multi-Adds: {multi_adds}")
print(f"Forward Passes: {forward_passes}")

# 释放 GPU 内存
del input_tensor, model
torch.cuda.empty_cache()


  x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
  qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
  x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
  attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)


RuntimeError: CUDA out of memory. Tried to allocate 24.00 MiB (GPU 0; 3.81 GiB total capacity; 2.59 GiB already allocated; 8.44 MiB free; 2.74 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF