In [16]:
using BenchmarkTools

In [37]:
function Convolution_2d(input, kernel; bias=0., padding=false)
    input_rows, input_columns = size(input)
    kernel_height, kernel_width = size(kernel)

    if padding
        padded_input = zeros(Float32, input_rows + 2*kernel_height - 2, input_columns + 2*kernel_width - 2)
        padded_input[kernel_height:end-kernel_height+1, kernel_width:end-kernel_width+1] .= input
        input_rows, input_columns = size(padded_input)
        input = padded_input
    end

    output_rows = input_rows - kernel_height + 1
    output_columns = input_columns - kernel_width + 1
    output = zeros(Float32, output_rows, output_columns)
    sumret = zeros(size(kernel))
    for c in 1:output_columns
        for r in 1:output_rows
            patch = @view input[r:r+kernel_height-1, c:c+kernel_width-1]
            sumret .= patch .* kernel
            output[r, c] = sum(sumret) + bias
            sumret .= 0.0
        end
    end
    return output
end

function Convolution_2d!(ret, input, kernel; bias=0., padding=false)
    input_rows, input_columns = size(input)
    kernel_height, kernel_width = size(kernel)

    if padding
        padded_input = zeros(Float32, input_rows + 2*kernel_height - 2, input_columns + 2*kernel_width - 2)
        padded_input[kernel_height:end-kernel_height+1, kernel_width:end-kernel_width+1] .= input
        input_rows, input_columns = size(padded_input)
        input = padded_input
    end

    output_rows = input_rows - kernel_height + 1
    output_columns = input_columns - kernel_width + 1
    sumret = zeros(size(kernel))
    for c in 1:output_columns
        for r in 1:output_rows
            patch = @view input[r:r+kernel_height-1, c:c+kernel_width-1]
            sumret .= patch .* kernel
            ret[r, c] = sum(sumret) + bias
            sumret .= 0.0
        end
    end
end

function Convolution_2d_v2!(ret, sumret, input, kernel; bias=0., padding=false)
    input_rows, input_columns = size(input)
    kernel_height, kernel_width = size(kernel)

    if padding
        padded_input = zeros(Float32, input_rows + 2*kernel_height - 2, input_columns + 2*kernel_width - 2)
        padded_input[kernel_height:end-kernel_height+1, kernel_width:end-kernel_width+1] .= input
        input_rows, input_columns = size(padded_input)
        input = padded_input
    end

    output_rows = input_rows - kernel_height + 1
    output_columns = input_columns - kernel_width + 1
    for c in 1:output_columns
        for r in 1:output_rows
            patch = @view input[r:r+kernel_height-1, c:c+kernel_width-1]
            @views sumret .= patch .* kernel
            ret[r, c] = sum(sumret) + bias
            sumret .= 0.0
        end
    end
end

Convolution_2d_v2! (generic function with 1 method)

In [18]:
function Conv_forward_v1(input, weights, bias)
    input_height, input_width, input_channels = size(input)
    kernel_height, kernel_width, _, output_channels = size(weights)

    output_height = input_height - kernel_height + 1
    output_width = input_width - kernel_width + 1
    output = zeros(Float32, output_height, output_width, output_channels)

    for k in 1:output_channels
        for c in 1:input_channels
            output[:, :, k] += Convolution_2d(input[:, :, c], weights[:, :, c, k]; bias=bias[k])
        end
    end
    
    return output
end

Conv_forward_v1 (generic function with 1 method)

In [19]:
function Conv_forward_v2(input, weights, bias)
    input_height, input_width, input_channels = size(input)
    kernel_height, kernel_width, _, output_channels = size(weights)

    output_height = input_height - kernel_height + 1
    output_width = input_width - kernel_width + 1
    output = zeros(Float32, output_height, output_width, output_channels)

    for k in 1:output_channels
        for c in 1:input_channels
            output[:, :, k] .+= Convolution_2d(input[:, :, c], weights[:, :, c, k]; bias=bias[k])
        end
    end
    
    return output
end

Conv_forward_v2 (generic function with 1 method)

In [20]:
function Conv_forward_v3(input, weights, bias)
    input_height, input_width, input_channels = size(input)
    kernel_height, kernel_width, _, output_channels = size(weights)

    output_height = input_height - kernel_height + 1
    output_width = input_width - kernel_width + 1
    output = zeros(Float32, output_height, output_width, output_channels)
    ret = zeros(Float32, output_height, output_width)

    for k in 1:output_channels
        for c in 1:input_channels
            Convolution_2d!(ret, input[:, :, c], weights[:, :, c, k]; bias=bias[k])
            output[:, :, k] .+= ret
            ret .= 0.0
        end
    end
    
    return output
end

Conv_forward_v3 (generic function with 1 method)

In [21]:
function Conv_forward_v4(input, weights, bias)
    input_height, input_width, input_channels = size(input)
    kernel_height, kernel_width, _, output_channels = size(weights)

    output_height = input_height - kernel_height + 1
    output_width = input_width - kernel_width + 1
    output = zeros(Float32, output_height, output_width, output_channels)
    ret = zeros(Float32, output_height, output_width)
    tmp_input = zeros(Float32, input_height, input_width)
    tmp_weights = zeros(Float32, kernel_height, kernel_width)
    for k in 1:output_channels
        for c in 1:input_channels
            tmp_input .= @views input[:, :, c]
            tmp_weights .= @views weights[:, :, c, k]
            Convolution_2d!(ret, tmp_input, tmp_weights; bias=bias[k])
            output[:, :, k] .+= ret
            ret .= 0.0
        end
    end
    
    return output
end

Conv_forward_v4 (generic function with 1 method)

In [22]:
function Conv_forward_v5(input, weights, bias)
  input_height, input_width, input_channels = size(input)
  kernel_height, kernel_width, _, output_channels = size(weights)

  output_height = input_height - kernel_height + 1
  output_width = input_width - kernel_width + 1
  output = zeros(Float32, output_height, output_width, output_channels)
  ret = zeros(Float32, output_height, output_width)
  tmp_input = zeros(Float32, input_height, input_width)
  tmp_weights = zeros(Float32, kernel_height, kernel_width)
  for k in 1:output_channels
      for c in 1:input_channels
          tmp_input .= @views input[:, :, c]
          tmp_weights .= @views weights[:, :, c, k]
          Convolution_2d!(ret, tmp_input, tmp_weights; bias=bias[k])
          @views output[:, :, k] .+= ret
          ret .= 0.0
      end
  end
  return output
end

Conv_forward_v5 (generic function with 1 method)

In [43]:
function Conv_forward_v6(input, weights, bias)
  input_height, input_width, input_channels = size(input)
  kernel_height, kernel_width, _, output_channels = size(weights)

  output_height = input_height - kernel_height + 1
  output_width = input_width - kernel_width + 1
  output = zeros(Float32, output_height, output_width, output_channels)
  ret = zeros(Float32, output_height, output_width)
  sumret = zeros(Float32, kernel_height, kernel_width)
  tmp_input = zeros(Float32, input_height, input_width)
  tmp_weights = zeros(Float32, kernel_height, kernel_width)
  
  for k in 1:output_channels
      for c in 1:input_channels
          tmp_input .= @views input[:, :, c]
          tmp_weights .= @views weights[:, :, c, k]
          Convolution_2d_v2!(ret, sumret, tmp_input, tmp_weights; bias=bias[k])
          @views output[:, :, k] .+= ret
          ret .= 0.0
      end
  end
  return output
end

Conv_forward_v6 (generic function with 1 method)

In [24]:
input = rand(Float32, 28, 28, 6);
weights = rand(Float32, 3, 3, 6, 16);
bias = rand(Float32, 16);

In [25]:
@benchmark Conv_forward_v1(input, weights, bias)

BenchmarkTools.Trial: 1492 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m2.988 ms[22m[39m … [35m 11.302 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m3.075 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m3.343 ms[22m[39m ± [32m719.110 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m1.69% ± 6.18%

  [39m█[39m▇[34m▄[39m[39m▃[39m▃[39m▃[32m▃[39m[39m▂[39m▁[39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m█[34m█[39m[39m█[39m█[39m

In [26]:
@benchmark Conv_forward_v2(input, weights, bias)

BenchmarkTools.Trial: 1387 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m2.920 ms[22m[39m … [35m31.765 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 9.69%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m3.026 ms              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m3.594 ms[22m[39m ± [32m 1.693 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m1.23% ± 5.26%

  [39m█[34m▅[39m[39m▃[39m▃[39m▂[39m▁[32m▁[39m[39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[34m█[39m[39m█[39m█[39m█[39m█[32m█[

In [27]:
@benchmark Conv_forward_v3(input, weights, bias)

BenchmarkTools.Trial: 1632 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m2.869 ms[22m[39m … [35m 12.422 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m2.918 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m3.059 ms[22m[39m ± [32m595.500 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.89% ± 4.49%

  [39m█[34m▆[39m[39m▄[39m▂[32m▂[39m[39m▂[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[34m█[39m[39m█[39m█[32m█[39m

In [28]:
@benchmark Conv_forward_v4(input, weights, bias)

BenchmarkTools.Trial: 1719 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m2.825 ms[22m[39m … [35m  5.204 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m2.865 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m2.904 ms[22m[39m ± [32m206.370 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.44% ± 3.19%

  [39m█[39m▇[34m▆[39m[32m▄[39m[39m▄[39m▂[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m█[34m█[39m[32m█[39m[39m█

In [35]:
@benchmark Conv_forward_v5(input, weights, bias)

BenchmarkTools.Trial: 1758 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m2.788 ms[22m[39m … [35m  5.515 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m2.814 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m2.840 ms[22m[39m ± [32m132.170 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.03% ± 0.71%

  [39m▇[39m█[39m█[39m▇[34m▆[39m[39m▆[39m▅[32m▅[39m[39m▅[39m▃[39m▃[39m▂[39m▂[39m [39m [39m▂[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [39m█[39m█[39m█[39m█[34m█[39m[39m

In [44]:
@benchmark Conv_forward_v6(input, weights, bias)

BenchmarkTools.Trial: 1976 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m2.404 ms[22m[39m … [35m  6.773 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m2.457 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m2.525 ms[22m[39m ± [32m303.978 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.06% ± 1.18%

  [39m█[39m▆[34m▆[39m[39m▅[32m▄[39m[39m▃[39m▂[39m▂[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m█[34m█[39m[39m█[32m█[39m

In [47]:
Conv_forward_v5(input, weights, bias)[:,:,7]

26×26 Matrix{Float32}:
 13.1899  13.8141  13.8888  13.0893  …  16.7696  17.1326  16.6529  14.4808
 15.0014  14.03    14.3042  14.0787     17.06    15.151   15.8997  14.4785
 15.5495  14.7673  13.7993  13.2678     16.6863  15.1281  14.9938  17.0155
 15.0423  16.0034  12.7033  13.2653     13.4365  13.9975  14.3124  14.5605
 13.6778  14.6595  13.304   15.104      12.6148  13.4057  13.4714  14.9895
 12.938   14.6996  13.9507  14.4752  …  12.3357  12.1412  13.7018  14.7221
 13.3103  13.32    14.2551  14.8633     12.3885  12.7965  14.1683  15.1615
 15.9053  13.2583  14.7104  12.8796     14.2179  12.9727  13.2498  13.4117
 14.9228  14.576   13.4367  14.7599     14.7262  13.6605  13.5149  13.6022
 12.8906  13.1396  14.0459  13.3927     14.7379  14.749   14.3043  12.5551
 12.8063  12.5237  13.6428  12.6585  …  13.1847  14.7253  14.0855  12.7447
 13.136   12.1757  12.5388  10.4709     13.7467  12.8432  13.075   13.6431
 12.5201  12.4925  12.1964  11.0359     14.5689  11.9377  11.6893  12.1065
 1

In [48]:
Conv_forward_v6(input, weights, bias)[:,:,7]

26×26 Matrix{Float32}:
 13.1899  13.8141  13.8888  13.0893  …  16.7696  17.1326  16.6529  14.4808
 15.0014  14.03    14.3042  14.0787     17.06    15.151   15.8997  14.4785
 15.5495  14.7673  13.7993  13.2678     16.6863  15.1281  14.9938  17.0155
 15.0423  16.0034  12.7033  13.2653     13.4365  13.9975  14.3124  14.5605
 13.6778  14.6595  13.304   15.104      12.6148  13.4057  13.4714  14.9895
 12.938   14.6996  13.9507  14.4752  …  12.3357  12.1412  13.7018  14.7221
 13.3103  13.32    14.2551  14.8633     12.3885  12.7965  14.1683  15.1615
 15.9053  13.2583  14.7104  12.8796     14.2179  12.9727  13.2498  13.4117
 14.9228  14.576   13.4367  14.7599     14.7262  13.6605  13.5149  13.6022
 12.8906  13.1396  14.0459  13.3927     14.7379  14.749   14.3043  12.5551
 12.8063  12.5237  13.6428  12.6585  …  13.1847  14.7253  14.0855  12.7447
 13.136   12.1757  12.5388  10.4709     13.7467  12.8432  13.075   13.6431
 12.5201  12.4925  12.1964  11.0359     14.5689  11.9377  11.6893  12.1065
 1