<a href="https://colab.research.google.com/github/kimdododo/festival-analysis-pipeline1/blob/main/mfu2%EC%9D%BC%EC%B0%A8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch torchvision
!pip install thop
!pip install fvcore
!pip install ptflops
!pip install torchprofile

Collecting thop
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl.metadata (2.7 kB)
Downloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Installing collected packages: thop
Successfully installed thop-0.1.1.post2209072238
Collecting fvcore
  Downloading fvcore-0.1.5.post20221221.tar.gz (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting yacs>=0.1.6 (from fvcore)
  Downloading yacs-0.1.8-py3-none-any.whl.metadata (639 bytes)
Collecting iopath>=0.1.7 (from fvcore)
  Downloading iopath-0.1.10.tar.gz (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting portalocker (from iopath>=0.1.7->fvcore)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading yacs-0.1.8-py3-no

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import time
from typing import Dict, List, Tuple
import matplotlib.pyplot as plt

In [3]:
def get_gpu_info():
    """GPU 정보 및 이론적 최대 FLOPS 확인"""
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9

        # GPU별 이론적 TFLOPS (대략적인 값)
        theoretical_tflops = {
            'A100': 312,  # FP16 Tensor Core
            'V100': 125,  # FP16 Tensor Core
            'T4': 65,     # FP16 Tensor Core
            'P100': 21.2, # FP16
        }

        print(f"GPU: {gpu_name}")
        print(f"Memory: {gpu_memory:.2f} GB")

        for gpu_model, tflops in theoretical_tflops.items():
            if gpu_model in gpu_name:
                print(f"Theoretical Peak Performance: {tflops} TFLOPS (FP16)")
                return tflops * 1e12  # Convert to FLOPS

    return None

peak_flops = get_gpu_info()
peak_flops

In [4]:
class FLOPsCalculator:
    """각 레이어별 FLOPs를 수동으로 계산하는 클래스"""

    @staticmethod
    def conv2d_flops(in_channels, out_channels, kernel_size, input_size, stride=1, padding=0):
        """Conv2D 레이어의 FLOPs 계산

        FLOPs = 2 × K² × C_in × C_out × H_out × W_out
        """
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)
        if isinstance(stride, int):
            stride = (stride, stride)
        if isinstance(padding, int):
            padding = (padding, padding)

        h_out = (input_size[0] + 2 * padding[0] - kernel_size[0]) // stride[0] + 1
        w_out = (input_size[1] + 2 * padding[1] - kernel_size[1]) // stride[1] + 1

        # 곱셈과 덧셈 연산
        multiplications = kernel_size[0] * kernel_size[1] * in_channels * out_channels * h_out * w_out
        additions = (kernel_size[0] * kernel_size[1] * in_channels - 1) * out_channels * h_out * w_out

        # Bias 추가 (optional)
        bias_additions = out_channels * h_out * w_out

        total_flops = multiplications + additions + bias_additions

        return total_flops, (h_out, w_out)

    @staticmethod
    def linear_flops(in_features, out_features, batch_size=1):
        """Linear 레이어의 FLOPs 계산

        FLOPs = 2 × in_features × out_features × batch_size
        """
        multiplications = in_features * out_features * batch_size
        additions = (in_features - 1) * out_features * batch_size
        bias_additions = out_features * batch_size

        return multiplications + additions + bias_additions

    @staticmethod
    def attention_flops(seq_len, d_model, num_heads, batch_size=1):
        """Multi-Head Attention의 FLOPs 계산

        Q, K, V projection + Attention scores + Output projection
        """
        d_head = d_model // num_heads

        # Q, K, V projections
        qkv_flops = 3 * FLOPsCalculator.linear_flops(d_model, d_model, batch_size * seq_len)

        # Attention scores: Q @ K^T
        attention_scores = 2 * batch_size * num_heads * seq_len * seq_len * d_head

        # Softmax (approximated as seq_len operations per position)
        softmax_flops = batch_size * num_heads * seq_len * seq_len * 5  # rough approximation

        # Attention @ V
        attention_output = 2 * batch_size * num_heads * seq_len * seq_len * d_head

        # Output projection
        output_projection = FLOPsCalculator.linear_flops(d_model, d_model, batch_size * seq_len)

        total_flops = qkv_flops + attention_scores + softmax_flops + attention_output + output_projection

        return total_flops

In [5]:
# 간단한 CNN 모델 정의
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(256 * 4 * 4, 512)
        self.fc2 = nn.Linear(512, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = x.view(-1, 256 * 4 * 4)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# FLOPs 계산
def calculate_model_flops(model, input_size=(32, 32)):
    """모델의 총 FLOPs 계산"""
    calculator = FLOPsCalculator()
    total_flops = 0
    layer_flops = {}

    # Conv1: 3 -> 64
    flops, output_size = calculator.conv2d_flops(3, 64, 3, input_size, padding=1)
    layer_flops['conv1'] = flops
    total_flops += flops
    output_size = (output_size[0]//2, output_size[1]//2)  # After pooling

    # Conv2: 64 -> 128
    flops, output_size = calculator.conv2d_flops(64, 128, 3, output_size, padding=1)
    layer_flops['conv2'] = flops
    total_flops += flops
    output_size = (output_size[0]//2, output_size[1]//2)  # After pooling

    # Conv3: 128 -> 256
    flops, output_size = calculator.conv2d_flops(128, 256, 3, output_size, padding=1)
    layer_flops['conv3'] = flops
    total_flops += flops
    output_size = (output_size[0]//2, output_size[1]//2)  # After pooling

    # FC1: 256*4*4 -> 512
    flops = calculator.linear_flops(256 * 4 * 4, 512)
    layer_flops['fc1'] = flops
    total_flops += flops

    # FC2: 512 -> 10
    flops = calculator.linear_flops(512, 10)
    layer_flops['fc2'] = flops
    total_flops += flops

    return total_flops, layer_flops

model = SimpleCNN()
total_flops, layer_flops = calculate_model_flops(model)

print(f"Total FLOPs: {total_flops:,}")
print("\nLayer-wise FLOPs:")
for layer, flops in layer_flops.items():
    print(f"  {layer}: {flops:,} ({flops/total_flops*100:.2f}%)")

Total FLOPs: 83,240,960

Layer-wise FLOPs:
  conv1: 3,538,944 (4.25%)
  conv2: 37,748,736 (45.35%)
  conv3: 37,748,736 (45.35%)
  fc1: 4,194,304 (5.04%)
  fc2: 10,240 (0.01%)


In [6]:
from thop import profile, clever_format

def profile_with_thop(model, input_size=(1, 3, 32, 32)):
    """THOP을 사용한 FLOPs 프로파일링"""
    input_tensor = torch.randn(input_size)

    # FLOPs와 Parameters 계산
    flops, params = profile(model, inputs=(input_tensor,))

    # 읽기 쉬운 형태로 변환
    flops, params = clever_format([flops, params], "%.3f")

    print(f"Model FLOPs: {flops}")
    print(f"Model Parameters: {params}")

    return flops, params

# 모델 프로파일링
model = SimpleCNN()
flops, params = profile_with_thop(model)

[INFO] Register count_convNd() for <class 'torch.nn.modules.conv.Conv2d'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.pooling.MaxPool2d'>.
[INFO] Register count_linear() for <class 'torch.nn.modules.linear.Linear'>.
Model FLOPs: 41.620M
Model Parameters: 2.474M


In [7]:
from fvcore.nn import FlopCountAnalysis, parameter_count

def profile_with_fvcore(model, input_size=(1, 3, 32, 32)):
    """FVCore를 사용한 상세 FLOPs 분석"""
    input_tensor = torch.randn(input_size)

    # FLOPs 분석
    flops = FlopCountAnalysis(model, input_tensor)

    # 총 FLOPs
    total_flops = flops.total()

    # 레이어별 FLOPs
    layer_flops = flops.by_module()

    # 연산 타입별 FLOPs
    op_flops = flops.by_operator()

    print(f"Total FLOPs: {total_flops:,}")
    print("\nFLOPs by Layer:")
    for name, flops_count in layer_flops.items():
        if flops_count > 0:
            print(f"  {name}: {flops_count:,}")

    print("\nFLOPs by Operation Type:")
    for op, flops_count in op_flops.items():
        if flops_count > 0:
            print(f"  {op}: {flops_count:,}")

    return total_flops, layer_flops, op_flops

# FVCore로 분석
total_flops, layer_flops, op_flops = profile_with_fvcore(model)



Total FLOPs: 41,620,480

FLOPs by Layer:
  : 41,620,480
  conv1: 1,769,472
  conv2: 18,874,368
  conv3: 18,874,368
  fc1: 2,097,152
  fc2: 5,120

FLOPs by Operation Type:
  conv: 39,518,208
  linear: 2,102,272


In [8]:
class LayerProfiler:
    """Hook을 사용한 레이어별 상세 프로파일링"""

    def __init__(self):
        self.layer_stats = {}

    def hook_fn(self, module, input, output, name):
        """각 레이어의 입출력 shape 및 FLOPs 기록"""
        input_shape = input[0].shape if isinstance(input, tuple) else input.shape
        output_shape = output.shape if hasattr(output, 'shape') else output[0].shape

        self.layer_stats[name] = {
            'input_shape': input_shape,
            'output_shape': output_shape,
            'module_type': module.__class__.__name__
        }

        # 간단한 FLOPs 추정
        if isinstance(module, nn.Conv2d):
            flops = self._conv_flops(module, output_shape)
            self.layer_stats[name]['flops'] = flops
        elif isinstance(module, nn.Linear):
            flops = self._linear_flops(module, output_shape)
            self.layer_stats[name]['flops'] = flops

    def _conv_flops(self, module, output_shape):
        batch_size = output_shape[0]
        out_h, out_w = output_shape[2], output_shape[3]
        kernel_h, kernel_w = module.kernel_size
        in_channels = module.in_channels
        out_channels = module.out_channels

        return 2 * batch_size * out_h * out_w * in_channels * out_channels * kernel_h * kernel_w

    def _linear_flops(self, module, output_shape):
        batch_size = output_shape[0]
        return 2 * batch_size * module.in_features * module.out_features

    def profile_model(self, model, input_tensor):
        """모델 전체 프로파일링"""
        handles = []

        # 각 레이어에 hook 등록
        for name, module in model.named_modules():
            if len(list(module.children())) == 0:  # Leaf modules only
                handle = module.register_forward_hook(
                    lambda m, i, o, n=name: self.hook_fn(m, i, o, n)
                )
                handles.append(handle)

        # Forward pass
        with torch.no_grad():
            _ = model(input_tensor)

        # Hook 제거
        for handle in handles:
            handle.remove()

        return self.layer_stats

# 프로파일링 실행
profiler = LayerProfiler()
input_tensor = torch.randn(1, 3, 32, 32)
layer_stats = profiler.profile_model(model, input_tensor)

print("Layer-wise Statistics:")
for name, stats in layer_stats.items():
    print(f"\n{name} ({stats['module_type']}):")
    print(f"  Input shape: {stats['input_shape']}")
    print(f"  Output shape: {stats['output_shape']}")
    if 'flops' in stats:
        print(f"  FLOPs: {stats['flops']:,}")

Layer-wise Statistics:

conv1 (Conv2d):
  Input shape: torch.Size([1, 3, 32, 32])
  Output shape: torch.Size([1, 64, 32, 32])
  FLOPs: 3,538,944

pool (MaxPool2d):
  Input shape: torch.Size([1, 256, 8, 8])
  Output shape: torch.Size([1, 256, 4, 4])

conv2 (Conv2d):
  Input shape: torch.Size([1, 64, 16, 16])
  Output shape: torch.Size([1, 128, 16, 16])
  FLOPs: 37,748,736

conv3 (Conv2d):
  Input shape: torch.Size([1, 128, 8, 8])
  Output shape: torch.Size([1, 256, 8, 8])
  FLOPs: 37,748,736

fc1 (Linear):
  Input shape: torch.Size([1, 4096])
  Output shape: torch.Size([1, 512])
  FLOPs: 4,194,304

fc2 (Linear):
  Input shape: torch.Size([1, 512])
  Output shape: torch.Size([1, 10])
  FLOPs: 10,240


In [9]:
def measure_throughput(model, batch_size, input_size=(3, 32, 32), num_iterations=100):
    """모델의 실제 처리량(throughput) 측정"""
    model = model.cuda()
    model.eval()

    # Warm-up
    dummy_input = torch.randn(batch_size, *input_size).cuda()
    for _ in range(10):
        _ = model(dummy_input)

    torch.cuda.synchronize()

    # 실제 측정
    start_time = time.time()

    for _ in range(num_iterations):
        with torch.no_grad():
            _ = model(dummy_input)

    torch.cuda.synchronize()
    end_time = time.time()

    # 처리량 계산
    elapsed_time = end_time - start_time
    throughput = (batch_size * num_iterations) / elapsed_time

    return throughput, elapsed_time

# 다양한 배치 크기로 테스트
batch_sizes = [1, 8, 16, 32, 64, 128]
throughputs = []

for bs in batch_sizes:
    try:
        throughput, elapsed_time = measure_throughput(model, bs)
        throughputs.append(throughput)
        print(f"Batch size {bs}: {throughput:.2f} samples/sec")
    except RuntimeError as e:
        print(f"Batch size {bs}: OOM")
        throughputs.append(0)

Batch size 1: OOM
Batch size 8: OOM
Batch size 16: OOM
Batch size 32: OOM
Batch size 64: OOM
Batch size 128: OOM


In [10]:
def calculate_mfu_fixed(model, batch_size, input_size=(3, 32, 32), peak_flops=None):
    """
    Model FLOPs Utilization 계산 (Device 오류 수정 버전)

    Args:
        model: PyTorch 모델
        batch_size: 배치 크기
        input_size: 입력 크기 (C, H, W)
        peak_flops: GPU의 이론적 최대 FLOPS

    Returns:
        dict: MFU 계산 결과
    """

    # 1. Device 확인 및 설정
    device = next(model.parameters()).device
    print(f"📍 모델 device: {device}")

    # 2. 입력 텐서를 모델과 같은 device에 생성
    input_tensor = torch.randn(batch_size, *input_size).to(device)
    print(f"📍 입력 텐서 device: {input_tensor.device}")

    # 3. FLOPs 계산
    try:
        from fvcore.nn import FlopCountAnalysis
        flops = FlopCountAnalysis(model, input_tensor).total()
    except Exception as e:
        print(f"⚠️ FVCore 에러: {e}")
        # 대체 방법 사용
        from thop import profile
        flops, _ = profile(model, inputs=(input_tensor,), verbose=False)

    # 4. 실제 처리량 측정 (수정된 버전)
    throughput, elapsed_time = measure_throughput_fixed(model, batch_size, input_size)

    # 5. 실제 FLOPS 계산
    actual_flops_per_sec = flops * throughput / batch_size

    # 6. MFU 계산
    if peak_flops:
        mfu = (actual_flops_per_sec / peak_flops) * 100
    else:
        mfu = None

    results = {
        'batch_size': batch_size,
        'model_flops': flops,
        'throughput': throughput,
        'actual_flops_per_sec': actual_flops_per_sec,
        'mfu_percentage': mfu,
        'device': str(device)
    }

    return results

def measure_throughput_fixed(model, batch_size, input_size=(3, 32, 32), num_iterations=100):
    """
    모델의 실제 처리량(throughput) 측정 (Device 오류 수정 버전)
    """
    # 모델의 device 확인
    device = next(model.parameters()).device

    # 모델을 eval 모드로 설정
    model.eval()

    # Warm-up (중요!)
    print("🔥 Warming up...")
    dummy_input = torch.randn(batch_size, *input_size).to(device)  # device 지정

    with torch.no_grad():
        for _ in range(10):
            _ = model(dummy_input)

    # GPU 동기화
    if device.type == 'cuda':
        torch.cuda.synchronize()

    # 실제 측정
    print("📊 Measuring throughput...")
    start_time = time.time()

    with torch.no_grad():
        for _ in range(num_iterations):
            _ = model(dummy_input)

    # GPU 동기화
    if device.type == 'cuda':
        torch.cuda.synchronize()

    end_time = time.time()

    # 처리량 계산
    elapsed_time = end_time - start_time
    throughput = (batch_size * num_iterations) / elapsed_time

    print(f"✅ 측정 완료: {throughput:.2f} samples/sec")

    return throughput, elapsed_time

# ====================================
# 🎯 실행 예제
# ====================================

# 모델 생성 및 GPU 이동
model = SimpleCNN()

# GPU 사용 가능 여부 확인
if torch.cuda.is_available():
    model = model.cuda()
    print("✅ GPU 사용 중")
else:
    print("⚠️ CPU 사용 중 (GPU 권장)")

# MFU 측정 (수정된 함수 사용)
results = calculate_mfu_fixed(model, batch_size=32, peak_flops=peak_flops)

# 결과 출력
print("\n" + "="*50)
print("📊 MFU 측정 결과")
print("="*50)
print(f"Device: {results['device']}")
print(f"Model FLOPs: {results['model_flops']:,}")
print(f"Throughput: {results['throughput']:.2f} samples/sec")
print(f"Actual FLOPS: {results['actual_flops_per_sec']:.2e}")
if results['mfu_percentage']:
    print(f"MFU: {results['mfu_percentage']:.2f}%")
else:
    print("MFU: N/A (peak_flops not provided)")

⚠️ CPU 사용 중 (GPU 권장)
📍 모델 device: cpu
📍 입력 텐서 device: cpu




🔥 Warming up...
📊 Measuring throughput...
✅ 측정 완료: 350.42 samples/sec

📊 MFU 측정 결과
Device: cpu
Model FLOPs: 1,331,855,360
Throughput: 350.42 samples/sec
Actual FLOPS: 1.46e+10
MFU: N/A (peak_flops not provided)


In [11]:
def analyze_mfu_vs_batch_size(model, batch_sizes, peak_flops=None):
    """배치 크기에 따른 MFU 변화 분석"""
    results = []

    for bs in batch_sizes:
        try:
            result = calculate_mfu_fixed(model, bs, peak_flops=peak_flops)
            results.append(result)
            print(f"Batch {bs}: MFU = {result['mfu_percentage']:.2f}%" if result['mfu_percentage'] else f"Batch {bs}: Completed")
        except RuntimeError:
            print(f"Batch {bs}: OOM")

    return results

# 분석 실행
batch_sizes = [1, 4, 8, 16, 32, 64]
mfu_results = analyze_mfu_vs_batch_size(model, batch_sizes, peak_flops)

# 시각화
if mfu_results and any(r['mfu_percentage'] for r in mfu_results):
    plt.figure(figsize=(10, 6))
    valid_results = [r for r in mfu_results if r['mfu_percentage']]

    batch_sizes_plot = [r['batch_size'] for r in valid_results]
    mfu_values = [r['mfu_percentage'] for r in valid_results]

    plt.plot(batch_sizes_plot, mfu_values, 'o-', linewidth=2, markersize=8)
    plt.xlabel('Batch Size')
    plt.ylabel('MFU (%)')
    plt.title('Model FLOPs Utilization vs Batch Size')
    plt.grid(True, alpha=0.3)
    plt.xscale('log', base=2)
    plt.show()



📍 모델 device: cpu
📍 입력 텐서 device: cpu
🔥 Warming up...
📊 Measuring throughput...




✅ 측정 완료: 290.91 samples/sec
Batch 1: Completed
📍 모델 device: cpu
📍 입력 텐서 device: cpu
🔥 Warming up...
📊 Measuring throughput...




✅ 측정 완료: 362.13 samples/sec
Batch 4: Completed
📍 모델 device: cpu
📍 입력 텐서 device: cpu
🔥 Warming up...
📊 Measuring throughput...




✅ 측정 완료: 386.78 samples/sec
Batch 8: Completed
📍 모델 device: cpu
📍 입력 텐서 device: cpu
🔥 Warming up...
📊 Measuring throughput...




✅ 측정 완료: 348.15 samples/sec
Batch 16: Completed
📍 모델 device: cpu
📍 입력 텐서 device: cpu
🔥 Warming up...
📊 Measuring throughput...
✅ 측정 완료: 369.10 samples/sec
Batch 32: Completed
📍 모델 device: cpu
📍 입력 텐서 device: cpu




🔥 Warming up...
📊 Measuring throughput...
✅ 측정 완료: 419.00 samples/sec
Batch 64: Completed
