In [1]:
using BenchmarkTools

In [2]:
function Convolution_2d_v1(input, kernel, bias; padding=false)
    input_height, input_width = size(input)
    kernel_height, kernel_width = size(kernel)

    if padding
        tmp = zeros(input_height+2*kernel_height-2, input_width+2*kernel_width-2)
        for i in 1:input_height
            for j in 1:input_width
                tmp[i+kernel_height-1, j+kernel_width-1] = input[i,j]
            end
        end
        input = tmp
        input_height, input_width = size(input)
    end


    output_height = input_height - kernel_height + 1
    output_width = input_width - kernel_width + 1    
    output = zeros(output_height, output_width)

    for i in 1:output_height
        for j in 1:output_width
            patch = input[i:i+kernel_height-1, j:j+kernel_width-1]
            output[i, j] = sum(patch .* kernel) .+ bias
        end
    end
    return output
end

Convolution_2d_v1 (generic function with 1 method)

In [3]:
function Convolution_2d_v2(input, kernel; bias=0., padding=false)
    input_rows, input_columns = size(input)
    kernel_height, kernel_width = size(kernel)

    if padding
        padded_input = zeros(Float32, input_rows + 2*kernel_height - 2, input_columns + 2*kernel_width - 2)
        padded_input[kernel_height:end-kernel_height+1, kernel_width:end-kernel_width+1] .= input
        input_rows, input_columns = size(padded_input)
        input = padded_input
    end

    output_rows = input_rows - kernel_height + 1
    output_columns = input_columns - kernel_width + 1
    output = zeros(Float32, output_rows, output_columns)

    for c in 1:output_columns
        for r in 1:output_rows
            patch = @view input[r:r+kernel_height-1, c:c+kernel_width-1]
            output[r, c] = sum(patch .* kernel) + bias
        end
    end
    return output
end

Convolution_2d_v2 (generic function with 1 method)

In [4]:
function Convolution_2d_v3(input, kernel; bias=0., padding=false)
    input_rows, input_columns = size(input)
    kernel_height, kernel_width = size(kernel)

    if padding
        padded_input = zeros(Float32, input_rows + 2*kernel_height - 2, input_columns + 2*kernel_width - 2)
        padded_input[kernel_height:end-kernel_height+1, kernel_width:end-kernel_width+1] .= input
        input_rows, input_columns = size(padded_input)
        input = padded_input
    end

    output_rows = input_rows - kernel_height + 1
    output_columns = input_columns - kernel_width + 1
    output = zeros(Float32, output_rows, output_columns)
    sumret = zeros(size(kernel))
    for c in 1:output_columns
        for r in 1:output_rows
            patch = @view input[r:r+kernel_height-1, c:c+kernel_width-1]
            sumret .= patch .* kernel
            output[r, c] = sum(sumret) + bias
            sumret .= 0.0
        end
    end
    return output
end

Convolution_2d_v3 (generic function with 1 method)

In [5]:
function Convolution_2d_v4(input, kernel; bias=0.0f0, padding=false)
    input_rows, input_columns = size(input)
    kernel_height, kernel_width = size(kernel)

    if padding
        padded_input = zeros(Float32, input_rows + 2*kernel_height - 2, input_columns + 2*kernel_width - 2)
        padded_input[kernel_height:end-kernel_height+1, kernel_width:end-kernel_width+1] .= input
        input_rows, input_columns = size(padded_input)
        input = padded_input
    end

    output_rows = input_rows - kernel_height + 1
    output_columns = input_columns - kernel_width + 1
    output = zeros(Float32, output_rows, output_columns)
    sumret = zeros(Float32, size(kernel))
    for c in 1:output_columns
        for r in 1:output_rows
            patch = @view input[r:r+kernel_height-1, c:c+kernel_width-1]
            sumret .= @views (patch .* kernel)
            output[r, c] = sum(@views sumret) .+ bias
            sumret .= 0.0
        end
    end
    
    return output
end

Convolution_2d_v4 (generic function with 1 method)

In [19]:
function Convolution_2d_v5(input, kernel; bias=0.0f0, padding=false)
  input_rows, input_columns = size(input)
  kernel_height, kernel_width = size(kernel)

  output_rows = input_rows - kernel_height + 1
  output_columns = input_columns - kernel_width + 1
  output = zeros(Float32, output_rows, output_columns)
  sumret = zeros(Float32, size(kernel))
  for c in 1:kernel_width
      for r in 1:kernel_height
          @views output .+= input[r:r+input_rows-kernel_height, c:c+input_columns-kernel_width] .* kernel[r,c]
      end
  end
  output .+= bias
  return output
end

Convolution_2d_v5 (generic function with 1 method)

In [7]:
test_input = rand(Float32, 28,28);
test_kernel = rand(Float32, 3, 3);
test_bias = Float32(1);

In [8]:
@benchmark Convolution_2d_v1(test_input, test_kernel, 0)

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m64.400 μs[22m[39m … [35m 1.615 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 93.75%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m66.300 μs              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m69.590 μs[22m[39m ± [32m32.739 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m2.07% ±  4.48%

  [39m▃[39m█[34m▇[39m[39m▅[39m▆[39m▄[32m▂[39m[39m▁[39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m█[39m█[34m█[39m[39m█[39m█[

In [9]:
@benchmark Convolution_2d_v2(test_input, test_kernel; bias=test_bias)

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m40.900 μs[22m[39m … [35m811.300 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 89.74%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m42.000 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m44.565 μs[22m[39m ± [32m 21.829 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m1.62% ±  3.32%

  [39m█[34m█[39m[39m▅[39m▅[32m▅[39m[39m▃[39m▁[39m▁[39m▂[39m▃[39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m█[34m█[39m[39m█[39m

In [10]:
@benchmark Convolution_2d_v3(test_input, test_kernel; bias=test_bias)

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m28.400 μs[22m[39m … [35m560.400 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m28.700 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m30.530 μs[22m[39m ± [32m 11.055 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [34m█[39m[39m▅[32m▄[39m[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [34m█[39m[39m█[32m█[39m[

In [11]:
@benchmark Convolution_2d_v4(test_input, test_kernel; bias=test_bias)

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m27.500 μs[22m[39m … [35m209.600 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m27.800 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m29.634 μs[22m[39m ± [32m 10.372 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [34m█[39m[39m▅[32m▃[39m[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [34m█[39m[39m█[32m█[39m[

In [20]:
@benchmark Convolution_2d_v5(test_input, test_kernel; bias=test_bias)

BenchmarkTools.Trial: 10000 samples with 9 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m2.967 μs[22m[39m … [35m262.500 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 97.69%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m3.056 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m3.347 μs[22m[39m ± [32m  5.468 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m3.58% ±  2.18%

  [39m▄[39m█[34m█[39m[39m▆[39m▄[39m▃[39m▂[39m▂[39m▄[39m▃[32m▂[39m[39m▁[39m [39m [39m [39m [39m▂[39m▂[39m▃[39m▂[39m▂[39m▂[39m▁[39m▁[39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m█[39m█[34m█[39m[39m█[39m█

In [21]:
Convolution_2d_v4(test_input, test_kernel; bias=test_bias)

26×26 Matrix{Float32}:
 3.31616  3.54165  4.53655  5.78159  …  4.78418  4.34526  3.91278  3.11735
 3.47476  3.49686  4.54649  4.41374     5.13466  4.4259   4.12785  4.03863
 4.35098  4.59843  4.49485  4.18861     5.48328  5.0303   4.62464  4.18723
 4.68939  4.78303  4.43213  3.98371     5.09493  4.82136  4.25325  3.99564
 3.99688  4.76486  4.4531   3.85592     4.19429  4.71409  4.42041  3.97164
 4.05574  4.09084  4.30282  4.01568  …  4.66287  5.45297  4.34295  4.00846
 4.16601  3.6552   3.94224  4.29355     5.67953  5.59144  5.06318  4.03557
 4.87622  4.91784  4.42584  4.53048     5.63082  5.02013  4.56985  4.11635
 4.57603  4.20388  4.26631  3.96454     4.51522  4.68304  4.98626  4.4571
 4.41809  4.1104   4.22103  3.34295     3.58215  3.64115  4.79044  4.64659
 3.69469  3.74608  3.08522  3.35963  …  2.56192  3.61458  4.89258  4.44091
 4.25192  3.34576  3.24736  3.78192     3.04317  3.2199   3.86653  4.88782
 4.15657  3.97821  3.61835  3.90713     4.23523  3.97355  3.97491  3.89724
 4.

In [22]:
Convolution_2d_v5(test_input, test_kernel; bias=test_bias)

26×26 Matrix{Float32}:
 3.31616  3.54165  4.53655  5.78159  …  4.78418  4.34526  3.91278  3.11735
 3.47476  3.49686  4.54649  4.41374     5.13466  4.4259   4.12785  4.03863
 4.35098  4.59843  4.49485  4.18861     5.48328  5.0303   4.62464  4.18723
 4.68939  4.78303  4.43213  3.98371     5.09493  4.82136  4.25325  3.99564
 3.99688  4.76486  4.4531   3.85592     4.19429  4.71409  4.42041  3.97164
 4.05574  4.09084  4.30282  4.01568  …  4.66287  5.45297  4.34295  4.00846
 4.16601  3.6552   3.94224  4.29355     5.67953  5.59144  5.06318  4.03557
 4.87622  4.91784  4.42584  4.53048     5.63082  5.02013  4.56985  4.11635
 4.57603  4.20388  4.26631  3.96454     4.51522  4.68304  4.98626  4.4571
 4.41809  4.1104   4.22103  3.34295     3.58215  3.64115  4.79044  4.64659
 3.69469  3.74608  3.08522  3.35963  …  2.56192  3.61458  4.89258  4.44091
 4.25192  3.34576  3.24736  3.78192     3.04317  3.2199   3.86653  4.88782
 4.15657  3.97821  3.61835  3.90713     4.23523  3.97355  3.97491  3.89724
 4.