<a href="https://colab.research.google.com/github/kimdododo/festival-analysis-pipeline1/blob/main/mfu_1%EC%9D%BC%EC%B0%A8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys, torch, platform
print("Python:", sys.version)
print("PyTorch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
PyTorch: 2.8.0+cu126
CUDA available: True
GPU: NVIDIA A100-SXM4-40GB


In [2]:
pip install calflops

Collecting calflops
  Downloading calflops-0.3.2-py3-none-any.whl.metadata (28 kB)
Downloading calflops-0.3.2-py3-none-any.whl (29 kB)
Installing collected packages: calflops
Successfully installed calflops-0.3.2


In [3]:
import torch
from calflops import calculate_flops

def parse_flop_string(s):
    return float(s.strip().split()[0])

model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', weights='ResNet18_Weights.DEFAULT')

flops, macs, params = calculate_flops(
    model=model,
    input_shape=(1, 3, 224, 224),
    print_results=False
)

print(f"FLOPs: {parse_flop_string(flops):.2f}G, MACs: {parse_flop_string(macs):.2f}G")

Downloading: "https://github.com/pytorch/vision/zipball/v0.10.0" to /root/.cache/torch/hub/v0.10.0.zip
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 191MB/s]


FLOPs: 3.64G, MACs: 1.81G


In [4]:
import torch, time, os

print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    os.system("nvidia-smi | head -n 20")

CUDA available: True
GPU: NVIDIA A100-SXM4-40GB


In [5]:
import torch.nn as nn

class SimpleCNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 32, 3, stride=2, padding=1)
        self.fc = nn.Linear(32 * 112 * 112, 10)
    def forward(self, x):
        x = torch.relu(self.conv(x))
        x = x.view(x.size(0), -1)
        return self.fc(x)

model = SimpleCNN().cuda().eval()

In [6]:
# FLOPs = 2 * H * W * Cin * Cout * Kh * Kw
H, W, Cin, Cout, Kh, Kw = 224, 224, 3, 32, 3, 3
flops_conv = 2 * H/2 * W/2 * Cin * Cout * Kh * Kw  # stride=2 → H/2,W/2
flops_fc = 2 * (32 * 112 * 112) * 10
total_flops = flops_conv + flops_fc
print(f"총 FLOPs: {total_flops/1e9:.3f} GFLOPs")

총 FLOPs: 0.030 GFLOPs


In [7]:
x = torch.randn(32, 3, 224, 224).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
y = torch.randint(0, 10, (32,)).cuda()

torch.cuda.synchronize()
start = time.time()

for _ in range(50):  # 50 iterations
    optimizer.zero_grad()
    out = model(x)
    loss = criterion(out, y)
    loss.backward()
    optimizer.step()

torch.cuda.synchronize()
end = time.time()
train_time = (end - start) / 50
print(f"평균 반복당 학습 시간: {train_time:.4f} 초")

평균 반복당 학습 시간: 0.0221 초


In [8]:
gpu_theoretical_flops = 19.5e12  # A100 기준 (FP32)
mfu = (total_flops / train_time) / gpu_theoretical_flops * 100





print(f"🔹 MFU(Model FLOPs Utilization): {mfu:.2f}%")

🔹 MFU(Model FLOPs Utilization): 0.01%


In [9]:
!nvidia-smi --query-gpu=utilization.gpu,utilization.memory,memory.used --format=csv


utilization.gpu [%], utilization.memory [%], memory.used [MiB]
21 %, 14 %, 809 MiB


In [10]:
import torch
import torch.nn as nn

# FC (Linear) 레이어 정의
N_in, N_out = 4, 3
fc = nn.Linear(N_in, N_out, bias=False)

# 입력 데이터 (배치 크기 1)
x = torch.randn(1, N_in)
y = fc(x)

print("입력 크기:", x.shape)
print("출력 크기:", y.shape)

# FLOPs 계산
flops = 2 * N_in * N_out
print(f"이론적 FLOPs: {flops} 회 연산 (곱셈 + 덧셈 포함)")

입력 크기: torch.Size([1, 4])
출력 크기: torch.Size([1, 3])
이론적 FLOPs: 24 회 연산 (곱셈 + 덧셈 포함)


In [11]:
import torch
import torch.nn as nn
import numpy as np

# 입력 및 커널 정의
x = torch.randn(1, 3, 4, 4)   # (배치, 채널, 높이, 너비)
conv = nn.Conv2d(in_channels=3, out_channels=1, kernel_size=3, stride=1, padding=0)

# 연산 수행
y = conv(x)
print("출력 크기:", y.shape)

# FLOPs 계산 공식 적용
H_out, W_out = y.shape[2], y.shape[3]
K_h, K_w = conv.kernel_size
C_in, C_out = conv.in_channels, conv.out_channels

flops = 2 * H_out * W_out * K_h * K_w * C_in * C_out
print(f"이론적 FLOPs: {flops} 회 연산")

출력 크기: torch.Size([1, 1, 2, 2])
이론적 FLOPs: 216 회 연산


In [12]:
import torch
import torch.nn as nn

# 간단한 모델 정의
model = nn.Sequential(
    nn.Conv2d(3, 16, 3, stride=1, padding=1),
    nn.ReLU(),
    nn.Conv2d(16, 32, 3, stride=1, padding=1),
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(32 * 32 * 32, 10)
)

x = torch.randn(1, 3, 32, 32)

# MACs 계산 예시
H_out, W_out = 32, 32
K_h, K_w = 3, 3
C_in, C_out = 3, 16

conv1_macs = H_out * W_out * K_h * K_w * C_in * C_out
print(f"Conv1 MACs: {conv1_macs / 1e6:.2f} MMACs")

Conv1 MACs: 0.44 MMACs


In [13]:
import torch
import torch.profiler as profiler

In [14]:
model = torch.nn.Linear(1024, 1024).cuda()
input_data = torch.randn(16, 1024).cuda()

In [15]:
# [2] 프로파일링용 기본 설정값
batch_size = 16
iterations = 50   # 총 반복 횟수

In [16]:

# [3] torch.profiler로 CPU/GPU 시간 측정
with profiler.profile(
    activities=[profiler.ProfilerActivity.CPU, profiler.ProfilerActivity.CUDA],
    record_shapes=True
) as prof:
    for i in range(iterations):
        output = model(input_data)
        loss = output.sum()
        loss.backward()
        torch.cuda.synchronize()   # GPU 연산 완료 대기 (정확한 시간 측정용)

In [17]:


# [4] 프로파일링 결과에서 평균 수행시간 계산
events = prof.key_averages()   # 개별 연산별 집계
total_cpu_time = sum([e.self_cpu_time_total for e in events]) / 1e6  # (ms → s)
time_per_iter = total_cpu_time / iterations
print(f"🕒 반복 1회당 평균 수행시간: {time_per_iter:.6f} 초")

🕒 반복 1회당 평균 수행시간: 0.001547 초


In [18]:
# [5] 처리량(Throughput) 계산
throughput = batch_size / time_per_iter
print(f"처리량(Throughput): {throughput:.2f} 샘플/초")

처리량(Throughput): 10343.19 샘플/초


In [19]:
flops_per_sample = 1e9  # 예시: 1 GFLOP / 샘플
total_flops = flops_per_sample * batch_size

In [20]:
gpu_peak_flops = 19.5 * 1e12  # FP32 기준

In [21]:
mfu = (total_flops / time_per_iter) / gpu_peak_flops
print(f"🔥 MFU (Model FLOPs Utilization): {mfu:.2%}")

🔥 MFU (Model FLOPs Utilization): 53.04%


In [22]:
import time
import torch
import torch.nn as nn

device = torch.device('cuda')
torch.backends.cudnn.benchmark = True  # conv 가속

class SimpleCNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 32, 3, stride=2, padding=1)
        self.fc = nn.Linear(32 * 112 * 112, 10)
    def forward(self, x):
        x = torch.relu(self.conv(x))
        x = x.view(x.size(0), -1)
        return self.fc(x)

model = SimpleCNN().to(device).train()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# ---- FLOPs per-sample (forward) ----
H, W, Cin, Cout, Kh, Kw = 224, 224, 3, 32, 3, 3
Hout, Wout = H//2, W//2  # stride=2
flops_conv_fwd = 2 * Hout * Wout * Cin * Cout * Kh * Kw
flops_fc_fwd   = 2 * (32 * 112 * 112) * 10
fwd_flops_per_sample = flops_conv_fwd + flops_fc_fwd

batch_size = 32
x = torch.randn(batch_size, 3, 224, 224, device=device)
y = torch.randint(0, 10, (batch_size,), device=device)

# ---- warmup ----
for _ in range(10):
    optimizer.zero_grad(set_to_none=True)
    loss = criterion(model(x), y)
    loss.backward()
    optimizer.step()
torch.cuda.synchronize()

# ---- timing ----
iters = 50
t0 = time.time()
for _ in range(iters):
    optimizer.zero_grad(set_to_none=True)
    out = model(x)
    loss = criterion(out, y)
    loss.backward()
    optimizer.step()
torch.cuda.synchronize()
t1 = time.time()

iter_time = (t1 - t0) / iters

# ---- FLOPs per-iter (train) ≈ 3x forward ----
iter_flops = batch_size * fwd_flops_per_sample * 3

# ---- GPU peak FLOPs: 정밀도에 맞춰 설정 ----
# FP32 예시: A100 ≈ 19.5e12 FLOPs/s
gpu_peak_flops = 19.5e12

mfu = (iter_flops / iter_time) / gpu_peak_flops * 100.0
print(f"평균 반복당 시간: {iter_time:.4f}s")
print(f"MFU ≈ {mfu:.2f}%  (FP32 기준)")


평균 반복당 시간: 0.0013s
MFU ≈ 11.04%  (FP32 기준)
