In [1]:
import torch
import torch.nn as nn
import time

from einops import rearrange
from einops.layers.torch import Rearrange

# GPU 사용 가능 여부 확인
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 실험을 위한 임의의 텐서 생성
N, C, H, W = 32, 128, 56, 56  # 배치 크기, 채널 수, 높이, 너비
x = torch.randn(N, C, H, W, device=device)

# 1x1 Conv2d를 사용하는 경우
class Conv1x1(nn.Module):
    def __init__(self, channels):
        super(Conv1x1, self).__init__()
        self.conv = nn.Conv2d(channels, channels, kernel_size=1)
        self.rearrange = Rearrange('b c h w -> b h w c')

    def forward(self, x):
        N, C, H, W = x.shape
        x = self.conv(x)
        # x = x.permute(0,2,3,1).reshape(-1, C)
        x = self.rearrange(x)
        return x

# Linear를 사용하는 경우
class LinearConv(nn.Module):
    def __init__(self, channels):
        super(LinearConv, self).__init__()
        self.linear = nn.Linear(channels, channels)

    def forward(self, x):
        N, C, H, W = x.shape
        # x = x.permute(0, 2, 3, 1).reshape(-1, C)  # (N, H, W, C)
        x = x.reshape(-1, C)
        x = self.linear(x)
        # x = x.view(N, H, W, C).permute(0, 3, 1, 2)  # (N, C, H, W)
        return x

# 메모리 및 시간 측정 함수
def measure_time_and_memory(model, input):
    torch.cuda.synchronize()  # 현재 모든 스트림이 끝날 때까지 기다립니다
    start_time = time.time()
    start_memory = torch.cuda.memory_allocated()
    
    output = model(input)
    print(output.shape)
    
    torch.cuda.synchronize()
    end_time = time.time()
    end_memory = torch.cuda.memory_allocated()
    
    time_elapsed = end_time - start_time
    memory_used = end_memory - start_memory
    return time_elapsed, memory_used

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 모델 및 데이터를 GPU로 이동
conv1x1_model = Conv1x1(C).to(device)
linear_model = LinearConv(C).to(device)
x = x.to(device)

# 실험 실행
time_conv, memory_conv = measure_time_and_memory(conv1x1_model, x)
time_linear, memory_linear = measure_time_and_memory(linear_model, x)

print(f"Conv1x1 - Time: {time_conv:.6f}s, Memory: {memory_conv} bytes")
print(f"Linear - Time: {time_linear:.6f}s, Memory: {memory_linear} bytes")

torch.Size([32, 56, 56, 128])
torch.Size([100352, 128])
Conv1x1 - Time: 0.138421s, Memory: 52428800 bytes
Linear - Time: 0.002460s, Memory: 60948480 bytes


In [3]:
# 모델 및 데이터를 GPU로 이동
conv1x1_model = Conv1x1(C).to(device)
linear_model = LinearConv(C).to(device)
x = x.to(device)

# 실험 실행
time_conv, memory_conv = measure_time_and_memory(conv1x1_model, x)
time_linear, memory_linear = measure_time_and_memory(linear_model, x)

print(f"Conv1x1 - Time: {time_conv:.6f}s, Memory: {memory_conv} bytes")
print(f"Linear - Time: {time_linear:.6f}s, Memory: {memory_linear} bytes")

torch.Size([32, 56, 56, 128])
torch.Size([100352, 128])
Conv1x1 - Time: 0.000477s, Memory: 52428800 bytes
Linear - Time: 0.000560s, Memory: 52428800 bytes


In [4]:
# 모델 정의
class Conv1x1(nn.Module):
    def __init__(self, channels):
        super(Conv1x1, self).__init__()
        self.conv = nn.Conv2d(channels, channels, kernel_size=1)

    def forward(self, x):
        return self.conv(x)

class LinearConv(nn.Module):
    def __init__(self, channels):
        super(LinearConv, self).__init__()
        self.linear = nn.Linear(channels, channels)

    def forward(self, x):
        N, C, H, W = x.shape
        x = x.permute(0, 2, 3, 1).reshape(-1, C)
        x = self.linear(x)
        x = x.view(N, H, W, C).permute(0, 3, 1, 2)
        return x

# 모델 초기화 및 동일한 가중치 설정
def initialize_models(channels):
    conv1x1 = Conv1x1(channels).to(device)
    linear = LinearConv(channels).to(device)

    # Conv1x1 모델의 가중치를 추출
    conv_weights = conv1x1.conv.weight.data
    conv_bias = conv1x1.conv.bias.data

    # Linear 모델의 가중치에 Conv1x1 모델의 가중치를 적용
    linear.linear.weight.data = conv_weights.squeeze().view(channels, channels)
    linear.linear.bias.data = conv_bias

    return conv1x1, linear

# 작은 텐서 생성 및 초기화
channels = 3  # 예를 들어 RGB 채널
small_tensor = torch.randn(1, channels, 224, 224, device=device) 

# 모델 및 데이터 초기화
conv1x1_model, linear_model = initialize_models(channels)

# 두 모델의 출력 비교
with torch.no_grad():  # 그래디언트 계산을 하지 않음
    conv1x1_output = conv1x1_model(small_tensor)
    linear_output = linear_model(small_tensor)

# 출력 비교
print("Conv1x1 output:\n", conv1x1_output[0][0][0][:8])
print("Linear output:\n", linear_output[0][0][0][:8])

Conv1x1 output:
 tensor([ 0.0235,  0.1586,  0.1314, -0.1478,  0.3107,  0.2606, -0.1566, -0.3747],
       device='cuda:0')
Linear output:
 tensor([ 0.0235,  0.1586,  0.1314, -0.1478,  0.3107,  0.2606, -0.1566, -0.3747],
       device='cuda:0')


In [5]:
import time

# 반복 실행을 위한 함수 정의
def repeat_experiment(model, input, repeats=1000):
    times = []
    for _ in range(repeats):
        start_time = time.time()
        model(input)
        torch.cuda.synchronize()  # GPU 연산이 완료될 때까지 기다림
        end_time = time.time()
        times.append(end_time - start_time)
    return times

# 모델 초기화
conv1x1_model, linear_model = initialize_models(C)

# 반복 실험
conv_times = repeat_experiment(conv1x1_model, x)
linear_times = repeat_experiment(linear_model, x)

# 평균 시간 계산
avg_time_conv = sum(conv_times) / len(conv_times)
avg_time_linear = sum(linear_times) / len(linear_times)

print(avg_time_conv, avg_time_linear)
print(f'conv 1x1이 linear에 비해 {avg_time_linear/avg_time_conv:.2f}배 빠릅니다')

0.00018616652488708497 0.0005698230266571045
conv 1x1이 linear에 비해 3.06배 빠릅니다
