In [66]:
using BenchmarkTools

In [67]:
"""
    Convolution_2d(input, kernel; bias=0., padding=false)

Slide `kernel` over the 2-D `input` one element at a time (the kernel is not flipped) and
return a new `Float32` matrix that holds, for every window position, the sum of the
element-wise products plus `bias`.

# Keywords
- `bias`: scalar added to every output element.
- `padding`: when `true`, `input` is first copied into a zero-filled `Float32` matrix with
  `size(kernel) .- 1` extra rows/columns on every side before the kernel is applied.

# Returns
A `Float32` matrix with `rows - kernel_rows + 1` by `columns - kernel_columns + 1`
elements, where `rows`/`columns` refer to the (optionally padded) input.
"""
function Convolution_2d(input, kernel; bias=0., padding=false)
    nrows, ncols = size(input)
    kh, kw = size(kernel)

    if padding
        # Embed the input in the centre of a zero canvas; the border is kh-1 / kw-1 wide.
        canvas = zeros(Float32, nrows + 2 * (kh - 1), ncols + 2 * (kw - 1))
        canvas[kh:(end - kh + 1), kw:(end - kw + 1)] .= input
        nrows, ncols = size(canvas)
        input = canvas
    end

    out_rows = nrows - kh + 1
    out_cols = ncols - kw + 1
    output = zeros(Float32, out_rows, out_cols)
    products = zeros(size(kernel))  # Float64 scratch for the element-wise products

    # Column-outer / row-inner traversal: the row index varies fastest.
    for c in 1:out_cols, r in 1:out_rows
        window = view(input, r:(r + kh - 1), c:(c + kw - 1))
        products .= window .* kernel
        output[r, c] = sum(products) + bias
        fill!(products, 0.0)
    end
    return output
end

"""
    Convolution_2d!(ret, input, kernel; bias=0., padding=false)

In-place sliding-window product-sum of `kernel` over the 2-D `input` (the kernel is not
flipped, the window moves one element at a time). Each result, plus `bias`, is assigned
to `ret[r, c]`; previous contents of that region of `ret` are overwritten, not
accumulated.

# Arguments
- `ret`: destination matrix; it must provide at least
  `rows - kernel_rows + 1` by `columns - kernel_columns + 1` elements, where
  `rows`/`columns` refer to the (optionally padded) input.
- `input`: 2-D array to filter.
- `kernel`: 2-D array of weights.

# Keywords
- `bias`: scalar added to every output element.
- `padding`: when `true`, `input` is first copied into a zero-filled `Float32` matrix with
  `size(kernel) .- 1` extra rows/columns on every side before the kernel is applied.

# Returns
`ret`, following the convention that mutating functions return the mutated argument.
"""
function Convolution_2d!(ret, input, kernel; bias=0., padding=false)
    input_rows, input_columns = size(input)
    kernel_height, kernel_width = size(kernel)

    if padding
        padded_input = zeros(
            Float32, input_rows + 2*kernel_height - 2, input_columns + 2*kernel_width - 2
        )
        padded_input[kernel_height:end-kernel_height+1, kernel_width:end-kernel_width+1] .= input
        input_rows, input_columns = size(padded_input)
        input = padded_input
    end

    output_rows = input_rows - kernel_height + 1
    output_columns = input_columns - kernel_width + 1
    # Float64 scratch buffer; it is fully overwritten by `.=` for every window, so it
    # never needs to be reset between iterations.
    sumret = zeros(size(kernel))
    for c in 1:output_columns
        for r in 1:output_rows
            patch = @view input[r:r+kernel_height-1, c:c+kernel_width-1]
            sumret .= patch .* kernel
            ret[r, c] = sum(sumret) + bias
        end
    end
    return ret
end

Convolution_2d! (generic function with 1 method)

In [68]:
"""
    Conv_forward_v1(input, weights, bias)

Forward pass of an unpadded multi-channel 2-D convolution layer: for every output
channel `k`, sum `Convolution_2d(input[:, :, c], weights[:, :, c, k])` over all input
channels `c` and add `bias[k]` once.

Baseline variant: channel slices are copied, `Convolution_2d` allocates a new result for
every (output channel, input channel) pair, and `+=` rebuilds the output slice.

# Arguments
- `input`: array indexed as `(row, column, input channel)`.
- `weights`: array indexed as `(kernel row, kernel column, input channel, output channel)`.
- `bias`: indexable collection with one entry per output channel.

# Returns
A `Float32` array of size
`(rows - kernel_rows + 1, columns - kernel_columns + 1, output_channels)`.
"""
function Conv_forward_v1(input, weights, bias)
    input_height, input_width, input_channels = size(input)
    kernel_height, kernel_width, _, output_channels = size(weights)

    output_height = input_height - kernel_height + 1
    output_width = input_width - kernel_width + 1
    output = zeros(Float32, output_height, output_width, output_channels)

    for k in 1:output_channels
        # Seed the channel with its bias exactly once. Passing the bias to every
        # per-input-channel convolution would add it `input_channels` times.
        output[:, :, k] .= bias[k]
        for c in 1:input_channels
            output[:, :, k] += Convolution_2d(input[:, :, c], weights[:, :, c, k])
        end
    end

    return output
end

Conv_forward_v1 (generic function with 1 method)

In [69]:
"""
    Conv_forward_v2(input, weights, bias)

Forward pass of an unpadded multi-channel 2-D convolution layer: for every output
channel `k`, sum `Convolution_2d(input[:, :, c], weights[:, :, c, k])` over all input
channels `c` and add `bias[k]` once.

Variant 2: same as the baseline, but the per-channel result is accumulated with the
broadcast assignment `.+=` instead of `+=`.

# Arguments
- `input`: array indexed as `(row, column, input channel)`.
- `weights`: array indexed as `(kernel row, kernel column, input channel, output channel)`.
- `bias`: indexable collection with one entry per output channel.

# Returns
A `Float32` array of size
`(rows - kernel_rows + 1, columns - kernel_columns + 1, output_channels)`.
"""
function Conv_forward_v2(input, weights, bias)
    input_height, input_width, input_channels = size(input)
    kernel_height, kernel_width, _, output_channels = size(weights)

    output_height = input_height - kernel_height + 1
    output_width = input_width - kernel_width + 1
    output = zeros(Float32, output_height, output_width, output_channels)

    for k in 1:output_channels
        # Seed the channel with its bias exactly once. Passing the bias to every
        # per-input-channel convolution would add it `input_channels` times.
        output[:, :, k] .= bias[k]
        for c in 1:input_channels
            output[:, :, k] .+= Convolution_2d(input[:, :, c], weights[:, :, c, k])
        end
    end

    return output
end

Conv_forward_v2 (generic function with 1 method)

In [70]:
"""
    Conv_forward_v3(input, weights, bias)

Forward pass of an unpadded multi-channel 2-D convolution layer: for every output
channel `k`, sum the 2-D filter response of `input[:, :, c]` with `weights[:, :, c, k]`
over all input channels `c` and add `bias[k]` once.

Variant 3: the per-pair result is written by `Convolution_2d!` into one preallocated
buffer instead of a freshly allocated matrix; the channel slices are still copied.

# Arguments
- `input`: array indexed as `(row, column, input channel)`.
- `weights`: array indexed as `(kernel row, kernel column, input channel, output channel)`.
- `bias`: indexable collection with one entry per output channel.

# Returns
A `Float32` array of size
`(rows - kernel_rows + 1, columns - kernel_columns + 1, output_channels)`.
"""
function Conv_forward_v3(input, weights, bias)
    input_height, input_width, input_channels = size(input)
    kernel_height, kernel_width, _, output_channels = size(weights)

    output_height = input_height - kernel_height + 1
    output_width = input_width - kernel_width + 1
    output = zeros(Float32, output_height, output_width, output_channels)
    ret = zeros(Float32, output_height, output_width)

    for k in 1:output_channels
        # Seed the channel with its bias exactly once. Passing the bias to every
        # per-input-channel convolution would add it `input_channels` times.
        output[:, :, k] .= bias[k]
        for c in 1:input_channels
            # `Convolution_2d!` assigns every element of `ret`, so the buffer does not
            # have to be cleared between calls.
            Convolution_2d!(ret, input[:, :, c], weights[:, :, c, k])
            output[:, :, k] .+= ret
        end
    end

    return output
end

Conv_forward_v3 (generic function with 1 method)

In [71]:
"""
    Conv_forward_v4(input, weights, bias)

Forward pass of an unpadded multi-channel 2-D convolution layer: for every output
channel `k`, sum the 2-D filter response of `input[:, :, c]` with `weights[:, :, c, k]`
over all input channels `c` and add `bias[k]` once.

Variant 4: in addition to the preallocated result buffer, the current input channel and
kernel are copied into preallocated `Float32` scratch matrices instead of being sliced
into new arrays.

# Arguments
- `input`: array indexed as `(row, column, input channel)`.
- `weights`: array indexed as `(kernel row, kernel column, input channel, output channel)`.
- `bias`: indexable collection with one entry per output channel.

# Returns
A `Float32` array of size
`(rows - kernel_rows + 1, columns - kernel_columns + 1, output_channels)`.
"""
function Conv_forward_v4(input, weights, bias)
    input_height, input_width, input_channels = size(input)
    kernel_height, kernel_width, _, output_channels = size(weights)

    output_height = input_height - kernel_height + 1
    output_width = input_width - kernel_width + 1
    output = zeros(Float32, output_height, output_width, output_channels)
    ret = zeros(Float32, output_height, output_width)
    tmp_input = zeros(Float32, input_height, input_width)
    tmp_weights = zeros(Float32, kernel_height, kernel_width)
    for k in 1:output_channels
        # Seed the channel with its bias exactly once. Passing the bias to every
        # per-input-channel convolution would add it `input_channels` times.
        output[:, :, k] .= bias[k]
        for c in 1:input_channels
            tmp_input .= @views input[:, :, c]
            tmp_weights .= @views weights[:, :, c, k]
            # `Convolution_2d!` assigns every element of `ret`, so the buffer does not
            # have to be cleared between calls.
            Convolution_2d!(ret, tmp_input, tmp_weights)
            output[:, :, k] .+= ret
        end
    end

    return output
end

Conv_forward_v4 (generic function with 1 method)

In [72]:
"""
    Conv_forward_v5(input, weights, bias)

Forward pass of an unpadded multi-channel 2-D convolution layer: for every output
channel `k`, sum the 2-D filter response of `input[:, :, c]` with `weights[:, :, c, k]`
over all input channels `c` and add `bias[k]` once.

Variant 5: like variant 4 (preallocated result, input and kernel buffers), and the
accumulation into the output channel goes through `@views` so the right-hand-side slice
is not copied.

# Arguments
- `input`: array indexed as `(row, column, input channel)`.
- `weights`: array indexed as `(kernel row, kernel column, input channel, output channel)`.
- `bias`: indexable collection with one entry per output channel.

# Returns
A `Float32` array of size
`(rows - kernel_rows + 1, columns - kernel_columns + 1, output_channels)`.
"""
function Conv_forward_v5(input, weights, bias)
    input_height, input_width, input_channels = size(input)
    kernel_height, kernel_width, _, output_channels = size(weights)

    output_height = input_height - kernel_height + 1
    output_width = input_width - kernel_width + 1
    output = zeros(Float32, output_height, output_width, output_channels)
    ret = zeros(Float32, output_height, output_width)
    tmp_input = zeros(Float32, input_height, input_width)
    tmp_weights = zeros(Float32, kernel_height, kernel_width)
    for k in 1:output_channels
        # Seed the channel with its bias exactly once. Passing the bias to every
        # per-input-channel convolution would add it `input_channels` times.
        output[:, :, k] .= bias[k]
        for c in 1:input_channels
            tmp_input .= @views input[:, :, c]
            tmp_weights .= @views weights[:, :, c, k]
            # `Convolution_2d!` assigns every element of `ret`, so the buffer does not
            # have to be cleared between calls.
            Convolution_2d!(ret, tmp_input, tmp_weights)
            @views output[:, :, k] .+= ret
        end
    end
    return output
end

Conv_forward_v5 (generic function with 1 method)

In [73]:
# Random Float32 fixtures for the benchmarks below:
# a 28x28 input with 6 channels, 3x3 kernels mapping 6 -> 16 channels, and 16 biases.
input = rand(Float32, (28, 28, 6));
weights = rand(Float32, (3, 3, 6, 16));
bias = rand(Float32, (16,));

In [74]:
# `$` interpolates the non-constant globals into the benchmark expression so the timing
# covers only the function call, not untyped-global access.
@benchmark Conv_forward_v1($input, $weights, $bias)

BenchmarkTools.Trial: 1210 samples with 1 evaluation.
 Range (min … max):  2.979 ms … 14.569 ms  ┊ GC (min … max): 0.00% … 27.43%
 Time  (median):     3.410 ms              ┊ GC (median):    0.00%
 Time  (mean ± σ):   4.114 ms ±  1.480 ms  ┊ GC (mean ± σ):  1.69% ±  6.12%

  [39m█[39m▁[39m [39m [34m [39m[39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m█[39m▆[39m▅[34m▄[39m[39m▄[39m▃

In [75]:
# `$` interpolates the non-constant globals into the benchmark expression so the timing
# covers only the function call, not untyped-global access.
@benchmark Conv_forward_v2($input, $weights, $bias)

BenchmarkTools.Trial: 1456 samples with 1 evaluation.
 Range (min … max):  2.912 ms …  20.399 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     3.087 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   3.424 ms ± 987.367 μs  ┊ GC (mean ± σ):  1.34% ± 5.43%

  [39m█[39m▆[39m▅[34m▄[39m[39m▄[39m▄[39m▃[39m▂[32m▂[39m[39m▂[39m▁[39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m█[39m█[34m█[39m[39m█[39m

In [76]:
# `$` interpolates the non-constant globals into the benchmark expression so the timing
# covers only the function call, not untyped-global access.
@benchmark Conv_forward_v3($input, $weights, $bias)

BenchmarkTools.Trial: 1642 samples with 1 evaluation.
 Range (min … max):  2.878 ms …   6.762 ms  ┊ GC (min … max): 0.00% … 28.20%
 Time  (median):     2.941 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   3.039 ms ± 320.069 μs  ┊ GC (mean ± σ):  0.86% ±  4.42%

  [39m█[39m▇[34m▆[39m[39m▄[39m▄[32m▄[39m[39m▄[39m▃[39m▁[39m▁[39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m█[34m█[39m[39m█[39m█[3

In [77]:
# `$` interpolates the non-constant globals into the benchmark expression so the timing
# covers only the function call, not untyped-global access.
@benchmark Conv_forward_v4($input, $weights, $bias)

BenchmarkTools.Trial: 1683 samples with 1 evaluation.
 Range (min … max):  2.832 ms …   9.047 ms  ┊ GC (min … max): 0.00% … 67.29%
 Time  (median):     2.903 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   2.965 ms ± 247.895 μs  ┊ GC (mean ± σ):  0.51% ±  3.43%

  [39m█[39m▆[39m▃[39m▄[34m [39m[39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m█[39m█[39m█[34m█[39m[3

In [78]:
# `$` interpolates the non-constant globals into the benchmark expression so the timing
# covers only the function call, not untyped-global access.
@benchmark Conv_forward_v5($input, $weights, $bias)

BenchmarkTools.Trial: 1721 samples with 1 evaluation.
 Range (min … max):  2.798 ms …   6.095 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     2.848 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   2.900 ms ± 173.544 μs  ┊ GC (mean ± σ):  0.04% ± 0.78%

  [39m█[39m█[39m▆[39m▆[34m▆[39m[39m▆[39m▅[39m▄[32m▃[39m[39m▃[39m▂[39m▂[39m▂[39m▁[39m▃[39m▂[39m▃[39m▂[39m [39m▂[39m▂[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [39m█[39m█[39m█[39m█[34m█[39m[39m