<a href="https://colab.research.google.com/github/hail-members/distributed-deep-learning/blob/main/inference_optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn.functional as F
import time

# Helper function to measure time taken by any operation (with averaging)
def measure_time(func, trials=5, *args):
    """Call ``func(*args)`` repeatedly and report the mean wall-clock time.

    Args:
        func: Callable to benchmark.
        trials: Number of timed calls to make; must be at least 1.
        *args: Positional arguments forwarded unchanged to every call.

    Returns:
        A tuple ``(result, avg_time)`` where ``result`` is the value returned
        by the last call and ``avg_time`` is the mean elapsed time per call in
        seconds.

    Raises:
        ValueError: If ``trials`` is less than 1 (there would be no result to
            return and no meaningful average).
    """
    if trials < 1:
        raise ValueError(f"trials must be at least 1, got {trials}")

    total_time = 0.0
    result = None
    for _ in range(trials):
        # perf_counter() is monotonic and high resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        result = func(*args)
        total_time += time.perf_counter() - start_time
    return result, total_time / trials

# Im2Col Convolution function (Manual Matrix Multiplication)
def im2col_convolution(input_tensor, weight, kernel_size, stride=1, padding=0):
    """Compute a 2-D convolution as im2col followed by one matrix multiply.

    Args:
        input_tensor: Input of shape (N, C_in, H, W).
        weight: Filters of shape (C_out, C_in, kH, kW).
        kernel_size: Kernel size as an int or a (kH, kW) pair.
        stride: Stride as an int or a (sH, sW) pair.
        padding: Zero padding as an int or a (pH, pW) pair.

    Returns:
        Tensor of shape (N, C_out, H_out, W_out).
    """
    def as_pair(value):
        # F.unfold accepts either an int or a pair; normalise so the output
        # size arithmetic below works for both forms.
        return (value, value) if isinstance(value, int) else tuple(value)

    kernel_h, kernel_w = as_pair(kernel_size)
    stride_h, stride_w = as_pair(stride)
    pad_h, pad_w = as_pair(padding)

    # Unfold the input into columns (im2col): one column per sliding window
    input_unfolded = F.unfold(
        input_tensor,
        kernel_size=(kernel_h, kernel_w),
        stride=(stride_h, stride_w),
        padding=(pad_h, pad_w),
    )

    # Flatten each filter into a row; reshape (not view) so that
    # non-contiguous weights are accepted as well
    weight_flattened = weight.reshape(weight.size(0), -1)

    # One matrix multiplication per batch element (broadcast over the batch)
    output = torch.matmul(weight_flattened, input_unfolded)

    # Spatial size of the result, per dimension
    output_height = (input_tensor.size(2) + 2 * pad_h - kernel_h) // stride_h + 1
    output_width = (input_tensor.size(3) + 2 * pad_w - kernel_w) // stride_w + 1

    # Fold the window axis back into (height, width)
    return output.view(input_tensor.size(0), weight.size(0), output_height, output_width)

# Standard Convolution (Manual Matrix Multiplication)
def manual_convolution(input_tensor, weight, kernel_size, stride=1, padding=0):
    """Compute a 2-D convolution by unfolding the input and multiplying matrices.

    The computation is unfold + matmul, i.e. the same arithmetic as the
    im2col path it is benchmarked against.

    Args:
        input_tensor: Input of shape (N, C_in, H, W).
        weight: Filters of shape (C_out, C_in, kH, kW).
        kernel_size: Kernel size as an int or a (kH, kW) pair.
        stride: Stride as an int or a (sH, sW) pair.
        padding: Zero padding as an int or a (pH, pW) pair.

    Returns:
        Tensor of shape (N, C_out, H_out, W_out).
    """
    # Normalise int-or-pair arguments so the size arithmetic accepts
    # everything F.unfold accepts.
    kernel_hw, stride_hw, padding_hw = (
        (value, value) if isinstance(value, int) else tuple(value)
        for value in (kernel_size, stride, padding)
    )

    # Unfold input into columns, one per sliding window
    input_unfolded = F.unfold(
        input_tensor, kernel_size=kernel_hw, stride=stride_hw, padding=padding_hw
    )

    # reshape (not view) also handles non-contiguous weights
    weight_flattened = weight.reshape(weight.size(0), -1)
    output = torch.matmul(weight_flattened, input_unfolded)

    # Output height and width, computed per spatial dimension (dims 2 and 3)
    output_height, output_width = (
        (input_tensor.size(dim) + 2 * pad - kernel) // step + 1
        for dim, kernel, step, pad in zip((2, 3), kernel_hw, stride_hw, padding_hw)
    )

    # Reshape back into (N, C_out, H_out, W_out)
    return output.view(input_tensor.size(0), weight.size(0), output_height, output_width)

# Benchmark input: a batch of 10 feature maps with 512 channels and 28x28 spatial size
input_tensor = torch.randn(10, 512, 28, 28)
# Convolution weight: 1024 output channels, 512 input channels, 3x3 kernel
weight = torch.randn(1024, 512, 3, 3)

# Single source of truth for the trial count, so the measurement calls and the
# printed report cannot drift apart
num_trials = 5

# Measure the average time taken for Im2Col Convolution
_, im2col_time = measure_time(im2col_convolution, num_trials, input_tensor, weight, (3, 3))

# Measure the average time taken for Manual Convolution
_, manual_time = measure_time(manual_convolution, num_trials, input_tensor, weight, (3, 3))

# Output the result
print(f"Im2Col Convolution Time (Average over {num_trials} trials): {im2col_time:.6f} seconds")
print(f"Manual Convolution Time (Average over {num_trials} trials): {manual_time:.6f} seconds")


Im2Col Convolution Time (Average over 5 trials): 1.034854 seconds
Manual Convolution Time (Average over 5 trials): 1.128107 seconds


In-place depth-wise conv

In [None]:
import torch
import torch.nn as nn
import time

# Helper function to measure time taken by any operation
def measure_time(func, *args, trials=5):
    """Return the mean wall-clock time of ``func(*args)`` over several calls.

    Args:
        func: Callable to benchmark; its return value is discarded.
        *args: Positional arguments forwarded unchanged to every call.
        trials: Number of timed calls to make; must be at least 1.

    Returns:
        Mean elapsed time per call, in seconds.

    Raises:
        ValueError: If ``trials`` is less than 1 (no meaningful average).
    """
    if trials < 1:
        raise ValueError(f"trials must be at least 1, got {trials}")

    total_time = 0.0
    for _ in range(trials):
        # perf_counter() is monotonic and high resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        func(*args)
        total_time += time.perf_counter() - start_time
    return total_time / trials

# Depth-wise convolution with in-place operations
class InplaceDepthwiseConv(nn.Module):
    """Depth-wise convolution whose result is written back into the input tensor.

    Args:
        in_channels: Number of input channels; also used as the number of
            output channels and as ``groups``, so each channel has its own
            filter.
        kernel_size: Kernel size passed to ``nn.Conv2d``.
        padding: Padding passed to ``nn.Conv2d``.

    Note:
        ``forward`` mutates its argument. ``copy_`` needs the convolution
        output to be broadcastable to the input's shape, so choose
        ``kernel_size``/``padding`` that preserve the spatial size (for
        example kernel 3 with padding 1).
    """

    def __init__(self, in_channels, kernel_size, padding=1):
        super().__init__()
        self.depthwise_conv = nn.Conv2d(
            in_channels,
            in_channels,
            kernel_size=kernel_size,
            padding=padding,
            groups=in_channels,
        )

    def forward(self, x):
        """Convolve ``x``, overwrite ``x`` with the result, and return ``x``."""
        # In-place operation that overwrites the input tensor directly.
        # Tensor.copy_ copies its argument INTO the tensor it is called on, so
        # the input must be the receiver; with the operands swapped the
        # convolution result would be overwritten by the input and lost.
        x.copy_(self.depthwise_conv(x))
        return x

# Standard Depth-wise convolution (without in-place operation)
class StandardDepthwiseConv(nn.Module):
    """Plain depth-wise convolution that returns a new output tensor.

    Args:
        in_channels: Number of input channels; also used as the number of
            output channels and as ``groups``.
        kernel_size: Kernel size passed to ``nn.Conv2d``.
        padding: Padding passed to ``nn.Conv2d``.
    """

    def __init__(self, in_channels, kernel_size, padding=1):
        super().__init__()
        # groups == in_channels gives every channel its own filter
        conv_options = dict(kernel_size=kernel_size, padding=padding, groups=in_channels)
        self.depthwise_conv = nn.Conv2d(in_channels, in_channels, **conv_options)

    def forward(self, x):
        """Apply the depth-wise convolution to ``x`` and return the result."""
        return self.depthwise_conv(x)
# Input tensor: a batch of 200 feature maps with 512 channels and 28x28 spatial size
input_tensor = torch.randn(200, 512, 28, 28)

# Initialize both in-place and standard depth-wise convolution models
inplace_conv = InplaceDepthwiseConv(in_channels=512, kernel_size=3)
standard_conv = StandardDepthwiseConv(in_channels=512, kernel_size=3)

# Measure the time for in-place depth-wise convolution.
# The module itself is called (not .forward) so that any registered hooks run.
inplace_time = measure_time(inplace_conv, input_tensor)
print(f"In-place Depth-wise Convolution Time: {inplace_time:.6f} seconds")

# Measure the time for standard depth-wise convolution
standard_time = measure_time(standard_conv, input_tensor)
print(f"Standard Depth-wise Convolution Time: {standard_time:.6f} seconds")

print("차이가 별로 없다! 왜냐하면 이건 메모리를 덜쓰는거기 때문에...")


In-place Depth-wise Convolution Time: 0.698361 seconds
Standard Depth-wise Convolution Time: 0.816289 seconds
차이가 별로 없다! 왜냐하면 이건 메모리를 덜쓰는거기 때문에...




입력 텐서 $x$가 주어졌을 때,

1. **Depth-wise Convolution**:
   $$
   x_{\text{conv}} = W_{\text{depthwise}} * x
   $$

2. **In-place 연산** (결과를 별도의 출력 버퍼가 아니라 입력 $x$의 메모리에 덮어쓴다):
   $$
   x \leftarrow W_{\text{depthwise}} * x \\
   x_{\text{result}} = x
   $$



In [None]:
# Helper that measures GPU memory usage
def measure_gpu_memory(func, *args, trials=5):
    """Run ``func(*args)`` ``trials`` times and report CUDA memory counters.

    Args:
        func: Callable to run; its return value is discarded.
        *args: Positional arguments forwarded unchanged to every call.
        trials: Number of calls to make.

    Returns:
        A tuple ``(current_memory, peak_memory)`` holding the values of
        ``torch.cuda.memory_allocated()`` and
        ``torch.cuda.max_memory_allocated()`` read after the last call. Both
        are totals for the device, not deltas caused by ``func`` alone.
    """
    # Forget the previous peak, then release cached blocks before measuring
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.empty_cache()

    for _trial in range(trials):
        func(*args)

    # Read the current usage first, then the peak seen since the reset
    allocated_now = torch.cuda.memory_allocated()
    allocated_peak = torch.cuda.max_memory_allocated()
    return allocated_now, allocated_peak

# Create the input tensor and both models on the GPU
input_tensor = torch.randn(1, 3, 28, 28).cuda()
inplace_conv = InplaceDepthwiseConv(in_channels=3, kernel_size=3).cuda()
standard_conv = StandardDepthwiseConv(in_channels=3, kernel_size=3).cuda()

# Measure GPU memory usage for in-place depth-wise convolution.
# The module itself is called (not .forward) so that any registered hooks run.
current_memory, peak_memory = measure_gpu_memory(inplace_conv, input_tensor)
print(f"In-place Depth-wise Convolution GPU Memory Usage: Current: {current_memory / 10**6:.4f} MB, Peak: {peak_memory / 10**6:.4f} MB")

# Measure GPU memory usage for standard depth-wise convolution
current_memory, peak_memory = measure_gpu_memory(standard_conv, input_tensor)
print(f"Standard Depth-wise Convolution GPU Memory Usage: Current: {current_memory / 10**6:.4f} MB, Peak: {peak_memory / 10**6:.4f} MB")

In-place Depth-wise Convolution GPU Memory Usage: Current: 321.2426 MB, Peak: 321.2524 MB
Standard Depth-wise Convolution GPU Memory Usage: Current: 321.2426 MB, Peak: 321.2524 MB


왜? `nn.Conv2d`는 두 경우 모두 결과를 담을 출력 텐서를 새로 할당하고, in-place 버전은 그 뒤에 `copy_`만 추가로 수행하기 때문에 메모리 차이가 나지 않는다.