In [1]:
using BenchmarkTools

In [2]:
"""
    Convolution_2d(input, kernel; bias=0., padding=false)

Slide `kernel` over the 2-D array `input` (the kernel is not flipped) and return a
new `Float32` matrix whose entries are the window/kernel products summed up, plus
`bias`.

# Keywords
- `bias`: scalar added to every output element.
- `padding`: when `true`, `input` is first embedded in a zero `Float32` matrix with
  `kernel_height - 1` extra rows and `kernel_width - 1` extra columns on each side.

# Returns
A matrix of size `(rows - kernel_height + 1, columns - kernel_width + 1)`, where
`rows`/`columns` refer to the (possibly padded) input.
"""
function Convolution_2d(input, kernel; bias=0., padding=false)
    nrows, ncols = size(input)
    kh, kw = size(kernel)

    # `source` is what the window slides over: the raw input or its padded copy.
    source = input
    if padding
        source = zeros(Float32, nrows + 2 * (kh - 1), ncols + 2 * (kw - 1))
        source[kh:(kh + nrows - 1), kw:(kw + ncols - 1)] .= input
        nrows, ncols = size(source)
    end

    result = zeros(Float32, nrows - kh + 1, ncols - kw + 1)
    # Scratch buffer for the element-wise products (default element type of `zeros`).
    scratch = zeros(size(kernel))

    # Column index in the outer position, row index in the inner one.
    for col in axes(result, 2), row in axes(result, 1)
        window = view(source, row:(row + kh - 1), col:(col + kw - 1))
        scratch .= window .* kernel
        result[row, col] = sum(scratch) + bias
        fill!(scratch, 0.0)
    end
    return result
end

"""
    Convolution_2d!(ret, input, kernel; bias=0., padding=false)

In-place 2-D sliding-window product: for every window position `(r, c)` the entry
`ret[r, c]` is overwritten with the sum of the element-wise product of the window
and `kernel` (kernel not flipped), plus `bias`.

# Keywords
- `bias`: scalar added to every written element.
- `padding`: when `true`, `input` is first embedded in a zero `Float32` matrix with
  `kernel_height - 1` extra rows and `kernel_width - 1` extra columns on each side.

Returns `nothing`; the result is delivered through `ret`, which must be indexable
at every window position.
"""
function Convolution_2d!(ret, input, kernel; bias=0., padding=false)
    nrows, ncols = size(input)
    kh, kw = size(kernel)

    # `source` is what the window slides over: the raw input or its padded copy.
    source = input
    if padding
        source = zeros(Float32, nrows + 2 * (kh - 1), ncols + 2 * (kw - 1))
        source[kh:(kh + nrows - 1), kw:(kw + ncols - 1)] .= input
        nrows, ncols = size(source)
    end

    # Scratch buffer for the element-wise products (default element type of `zeros`).
    scratch = zeros(size(kernel))

    # Column index in the outer position, row index in the inner one.
    for col in 1:(ncols - kw + 1), row in 1:(nrows - kh + 1)
        window = view(source, row:(row + kh - 1), col:(col + kw - 1))
        scratch .= window .* kernel
        ret[row, col] = sum(scratch) + bias
        fill!(scratch, 0.0)
    end
    return nothing
end

"""
    Convolution_2d_v2!(ret, sumret, input, kernel; bias=0., padding=false)

Same computation as the other in-place 2-D kernels, but the scratch buffer for the
element-wise products is supplied by the caller as `sumret` instead of being
allocated here. For every window position `(r, c)`, `ret[r, c]` is overwritten with
the summed window/kernel products plus `bias`.

`sumret` must accept a broadcast assignment of kernel-shaped data; it is reset to
zero after every window, so it holds zeros on return whenever at least one window
was processed.

# Keywords
- `bias`: scalar added to every written element.
- `padding`: when `true`, `input` is first embedded in a zero `Float32` matrix with
  `kernel_height - 1` extra rows and `kernel_width - 1` extra columns on each side.

Returns `nothing`.
"""
function Convolution_2d_v2!(ret, sumret, input, kernel; bias=0., padding=false)
    nrows, ncols = size(input)
    kh, kw = size(kernel)

    # `source` is what the window slides over: the raw input or its padded copy.
    source = input
    if padding
        source = zeros(Float32, nrows + 2 * (kh - 1), ncols + 2 * (kw - 1))
        source[kh:(kh + nrows - 1), kw:(kw + ncols - 1)] .= input
        nrows, ncols = size(source)
    end

    # Column index in the outer position, row index in the inner one.
    for col in 1:(ncols - kw + 1), row in 1:(nrows - kh + 1)
        window = view(source, row:(row + kh - 1), col:(col + kw - 1))
        sumret .= window .* kernel
        ret[row, col] = sum(sumret) + bias
        fill!(sumret, 0.0)
    end
    return nothing
end

"""
    Convolution_2d_v3!(ret, input, kernel; bias=0., padding=false)

In-place 2-D sliding-window product, organised per kernel entry instead of per
output pixel: for each kernel position `(r, c)` the correspondingly shifted window
of `input`, scaled by `kernel[r, c]`, is added to `ret` in one broadcast. `bias` is
added once at the end.

`ret` is cleared before accumulation, so its previous contents do not leak into
the result (matching the overwrite behaviour of `Convolution_2d!`). Its shape must
be compatible with `(rows - kernel_height + 1, columns - kernel_width + 1)` of the
(possibly padded) input; otherwise the broadcast throws a `DimensionMismatch`.

# Keywords
- `bias`: scalar added to every element of `ret`.
- `padding`: when `true`, `input` is first embedded in a zero `Float32` matrix with
  `kernel_height - 1` extra rows and `kernel_width - 1` extra columns on each side,
  exactly as in `Convolution_2d!`.

# Returns
`ret`.
"""
function Convolution_2d_v3!(ret, input, kernel; bias=0., padding=false)
    input_rows, input_columns = size(input)
    kernel_height, kernel_width = size(kernel)

    if padding
        padded_input = zeros(Float32, input_rows + 2*kernel_height - 2, input_columns + 2*kernel_width - 2)
        padded_input[kernel_height:end-kernel_height+1, kernel_width:end-kernel_width+1] .= input
        input_rows, input_columns = size(padded_input)
        input = padded_input
    end

    output_rows = input_rows - kernel_height + 1
    output_columns = input_columns - kernel_width + 1

    # Start from a clean slate so stale values in `ret` cannot contaminate the sum.
    fill!(ret, zero(eltype(ret)))
    for c in 1:kernel_width
        for r in 1:kernel_height
            # Shifted window (a view, no copy) scaled by a single kernel weight.
            @views ret .+= input[r:r+output_rows-1, c:c+output_columns-1] .* kernel[r, c]
        end
    end
    ret .+= bias
    return ret
end

Convolution_2d_v3! (generic function with 1 method)

In [3]:
"""
    Conv_forward_v1(input, weights, bias)

Multi-channel forward pass built on `Convolution_2d`.

`input` is indexed as `(height, width, input_channel)`, `weights` as
`(kernel_height, kernel_width, input_channel, output_channel)` and `bias` by output
channel. For every output channel the per-input-channel results are summed into a
freshly allocated `Float32` array of size
`(height - kernel_height + 1, width - kernel_width + 1, output_channels)`.

NOTE(review): `bias[k]` is passed to every per-channel call, so it is added once per
input channel rather than once per output channel — confirm this is intended.

This variant copies each slice (`input[:, :, c]`, `weights[:, :, c, k]`) and rebuilds
the output slice with a non-broadcast `+`.
"""
function Conv_forward_v1(input, weights, bias)
    in_h, in_w, n_in = size(input)
    k_h, k_w, _, n_out = size(weights)

    out_dims = (in_h - k_h + 1, in_w - k_w + 1, n_out)
    output = zeros(Float32, out_dims)

    for k in 1:n_out, c in 1:n_in
        response = Convolution_2d(input[:, :, c], weights[:, :, c, k]; bias=bias[k])
        # Allocating slice + add, then store back into the output channel.
        output[:, :, k] = output[:, :, k] + response
    end

    return output
end

Conv_forward_v1 (generic function with 1 method)

In [4]:
"""
    Conv_forward_v2(input, weights, bias)

Multi-channel forward pass built on `Convolution_2d`; differs from the first
variant only in that the per-channel result is added to the output slice with a
broadcast assignment.

`input` is indexed as `(height, width, input_channel)`, `weights` as
`(kernel_height, kernel_width, input_channel, output_channel)` and `bias` by output
channel. Returns a new `Float32` array of size
`(height - kernel_height + 1, width - kernel_width + 1, output_channels)`.

NOTE(review): `bias[k]` is passed to every per-channel call, so it is added once per
input channel rather than once per output channel — confirm this is intended.
"""
function Conv_forward_v2(input, weights, bias)
    in_h, in_w, n_in = size(input)
    k_h, k_w, _, n_out = size(weights)

    out_dims = (in_h - k_h + 1, in_w - k_w + 1, n_out)
    output = zeros(Float32, out_dims)

    for k in 1:n_out, c in 1:n_in
        response = Convolution_2d(input[:, :, c], weights[:, :, c, k]; bias=bias[k])
        # Broadcast-assign into the output channel (right-hand slice is still a copy).
        output[:, :, k] .= output[:, :, k] .+ response
    end

    return output
end

Conv_forward_v2 (generic function with 1 method)

In [5]:
"""
    Conv_forward_v3(input, weights, bias)

Multi-channel forward pass that reuses one result buffer: each per-channel result
is written by `Convolution_2d!` into `ret`, added to the output channel, and `ret`
is then reset to zero.

`input` is indexed as `(height, width, input_channel)`, `weights` as
`(kernel_height, kernel_width, input_channel, output_channel)` and `bias` by output
channel. Returns a new `Float32` array of size
`(height - kernel_height + 1, width - kernel_width + 1, output_channels)`.

NOTE(review): `bias[k]` is passed to every per-channel call, so it is added once per
input channel rather than once per output channel — confirm this is intended.
"""
function Conv_forward_v3(input, weights, bias)
    in_h, in_w, n_in = size(input)
    k_h, k_w, _, n_out = size(weights)
    out_h = in_h - k_h + 1
    out_w = in_w - k_w + 1

    output = zeros(Float32, out_h, out_w, n_out)
    ret = zeros(Float32, out_h, out_w)  # reused for every (k, c) pair

    for k in 1:n_out, c in 1:n_in
        # Slices are still copied here; only the result buffer is reused.
        Convolution_2d!(ret, input[:, :, c], weights[:, :, c, k]; bias=bias[k])
        output[:, :, k] .= output[:, :, k] .+ ret
        fill!(ret, 0.0)
    end

    return output
end

Conv_forward_v3 (generic function with 1 method)

In [6]:
"""
    Conv_forward_v4(input, weights, bias)

Multi-channel forward pass that reuses buffers for the result (`ret`), the current
input channel (`tmp_input`) and the current kernel (`tmp_weights`); the channel and
kernel are copied into those buffers from views before calling `Convolution_2d!`.

`input` is indexed as `(height, width, input_channel)`, `weights` as
`(kernel_height, kernel_width, input_channel, output_channel)` and `bias` by output
channel. Returns a new `Float32` array of size
`(height - kernel_height + 1, width - kernel_width + 1, output_channels)`.

NOTE(review): `bias[k]` is passed to every per-channel call, so it is added once per
input channel rather than once per output channel — confirm this is intended.
"""
function Conv_forward_v4(input, weights, bias)
    in_h, in_w, n_in = size(input)
    k_h, k_w, _, n_out = size(weights)
    out_h = in_h - k_h + 1
    out_w = in_w - k_w + 1

    output = zeros(Float32, out_h, out_w, n_out)
    ret = zeros(Float32, out_h, out_w)
    tmp_input = zeros(Float32, in_h, in_w)
    tmp_weights = zeros(Float32, k_h, k_w)

    for k in 1:n_out, c in 1:n_in
        # Fill the reusable buffers from views of the current channel / kernel.
        tmp_input .= view(input, :, :, c)
        tmp_weights .= view(weights, :, :, c, k)
        Convolution_2d!(ret, tmp_input, tmp_weights; bias=bias[k])
        # The right-hand output slice is a copy in this variant.
        output[:, :, k] .= output[:, :, k] .+ ret
        fill!(ret, 0.0)
    end

    return output
end

Conv_forward_v4 (generic function with 1 method)

In [7]:
"""
    Conv_forward_v5(input, weights, bias)

Multi-channel forward pass with reusable `ret`, `tmp_input` and `tmp_weights`
buffers, where the per-channel result is accumulated into a view of the output
channel instead of a copied slice. The per-channel work is done by
`Convolution_2d!`.

`input` is indexed as `(height, width, input_channel)`, `weights` as
`(kernel_height, kernel_width, input_channel, output_channel)` and `bias` by output
channel. Returns a new `Float32` array of size
`(height - kernel_height + 1, width - kernel_width + 1, output_channels)`.

NOTE(review): `bias[k]` is passed to every per-channel call, so it is added once per
input channel rather than once per output channel — confirm this is intended.
"""
function Conv_forward_v5(input, weights, bias)
    in_h, in_w, n_in = size(input)
    k_h, k_w, _, n_out = size(weights)
    out_h = in_h - k_h + 1
    out_w = in_w - k_w + 1

    output = zeros(Float32, out_h, out_w, n_out)
    ret = zeros(Float32, out_h, out_w)
    tmp_input = zeros(Float32, in_h, in_w)
    tmp_weights = zeros(Float32, k_h, k_w)

    for k in 1:n_out
        out_k = view(output, :, :, k)  # accumulate directly into this channel
        for c in 1:n_in
            tmp_input .= view(input, :, :, c)
            tmp_weights .= view(weights, :, :, c, k)
            Convolution_2d!(ret, tmp_input, tmp_weights; bias=bias[k])
            out_k .+= ret
            fill!(ret, 0.0)
        end
    end
    return output
end

Conv_forward_v5 (generic function with 1 method)

In [8]:
"""
    Conv_forward_v6(input, weights, bias)

Multi-channel forward pass that additionally pre-allocates the kernel-sized scratch
buffer `sumret` and hands it to `Convolution_2d_v2!`, alongside the reusable `ret`,
`tmp_input` and `tmp_weights` buffers. Results are accumulated into a view of the
output channel.

`input` is indexed as `(height, width, input_channel)`, `weights` as
`(kernel_height, kernel_width, input_channel, output_channel)` and `bias` by output
channel. Returns a new `Float32` array of size
`(height - kernel_height + 1, width - kernel_width + 1, output_channels)`.

NOTE(review): `bias[k]` is passed to every per-channel call, so it is added once per
input channel rather than once per output channel — confirm this is intended.
"""
function Conv_forward_v6(input, weights, bias)
    in_h, in_w, n_in = size(input)
    k_h, k_w, _, n_out = size(weights)
    out_h = in_h - k_h + 1
    out_w = in_w - k_w + 1

    output = zeros(Float32, out_h, out_w, n_out)
    ret = zeros(Float32, out_h, out_w)
    sumret = zeros(Float32, k_h, k_w)  # scratch for the per-window products
    tmp_input = zeros(Float32, in_h, in_w)
    tmp_weights = zeros(Float32, k_h, k_w)

    for k in 1:n_out
        out_k = view(output, :, :, k)  # accumulate directly into this channel
        for c in 1:n_in
            tmp_input .= view(input, :, :, c)
            tmp_weights .= view(weights, :, :, c, k)
            Convolution_2d_v2!(ret, sumret, tmp_input, tmp_weights; bias=bias[k])
            out_k .+= ret
            fill!(ret, 0.0)
        end
    end
    return output
end

Conv_forward_v6 (generic function with 1 method)

In [9]:
"""
    Conv_forward_v7(input, weights, bias)

Multi-channel forward pass that delegates the per-channel work to
`Convolution_2d_v3!`, using reusable `ret`, `tmp_input` and `tmp_weights` buffers.
`ret` starts as zeros and is reset to zero after every per-channel call; results are
accumulated into a view of the output channel.

`input` is indexed as `(height, width, input_channel)`, `weights` as
`(kernel_height, kernel_width, input_channel, output_channel)` and `bias` by output
channel. Returns a new `Float32` array of size
`(height - kernel_height + 1, width - kernel_width + 1, output_channels)`.

NOTE(review): `bias[k]` is passed to every per-channel call, so it is added once per
input channel rather than once per output channel — confirm this is intended.
"""
function Conv_forward_v7(input, weights, bias)
    in_h, in_w, n_in = size(input)
    k_h, k_w, _, n_out = size(weights)
    out_h = in_h - k_h + 1
    out_w = in_w - k_w + 1

    output = zeros(Float32, out_h, out_w, n_out)
    ret = zeros(Float32, out_h, out_w)
    tmp_input = zeros(Float32, in_h, in_w)
    tmp_weights = zeros(Float32, k_h, k_w)

    for k in 1:n_out
        out_k = view(output, :, :, k)  # accumulate directly into this channel
        for c in 1:n_in
            tmp_input .= view(input, :, :, c)
            tmp_weights .= view(weights, :, :, c, k)
            Convolution_2d_v3!(ret, tmp_input, tmp_weights; bias=bias[k])
            out_k .+= ret
            fill!(ret, 0.0)
        end
    end
    return output
end

Conv_forward_v7 (generic function with 1 method)

In [10]:
# Random Float32 benchmark data: a 28×28 input with 6 channels, 3×3 kernels for
# 6 input / 16 output channels, and one bias value per output channel.
input = rand(Float32, (28, 28, 6));
weights = rand(Float32, (3, 3, 6, 16));
bias = rand(Float32, (16,));

In [11]:
# `$` interpolates the non-constant globals so the benchmark measures the call
# itself rather than untyped global-variable access.
@benchmark Conv_forward_v1($input, $weights, $bias)

BenchmarkTools.Trial: 1406 samples with 1 evaluation.
 Range (min … max):  2.991 ms …  11.325 ms  ┊ GC (min … max): 0.00% … 73.26%
 Time  (median):     3.390 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   3.545 ms ± 937.936 μs  ┊ GC (mean ± σ):  4.48% ± 10.54%

  [39m▂[39m [39m▁[39m█[34m▇[39m[32m▄[39m[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m█[39m█[39m█[34m█[39m[3

In [12]:
# `$` interpolates the non-constant globals so the benchmark measures the call
# itself rather than untyped global-variable access.
@benchmark Conv_forward_v2($input, $weights, $bias)

BenchmarkTools.Trial: 1508 samples with 1 evaluation.
 Range (min … max):  2.909 ms …   9.160 ms  ┊ GC (min … max): 0.00% … 66.91%
 Time  (median):     3.213 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   3.312 ms ± 759.818 μs  ┊ GC (mean ± σ):  3.18% ±  8.74%

  [39m▆[39m▁[39m█[34m█[39m[32m▅[39m[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m█[39m█[34m█[39m[32m█[3

In [13]:
# `$` interpolates the non-constant globals so the benchmark measures the call
# itself rather than untyped global-variable access.
@benchmark Conv_forward_v3($input, $weights, $bias)

BenchmarkTools.Trial: 1686 samples with 1 evaluation.
 Range (min … max):  2.872 ms …   8.691 ms  ┊ GC (min … max): 0.00% … 65.52%
 Time  (median):     2.899 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   2.964 ms ± 316.215 μs  ┊ GC (mean ± σ):  1.05% ±  4.86%

  [39m█[34m▆[39m[39m▄[32m▂[39m[39m [39m [39m▄[39m [39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[34m█[39m[39m█[32m█[39m[39

In [14]:
# `$` interpolates the non-constant globals so the benchmark measures the call
# itself rather than untyped global-variable access.
@benchmark Conv_forward_v4($input, $weights, $bias)

BenchmarkTools.Trial: 1718 samples with 1 evaluation.
 Range (min … max):  2.833 ms …   8.391 ms  ┊ GC (min … max): 0.00% … 65.83%
 Time  (median):     2.866 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   2.907 ms ± 213.905 μs  ┊ GC (mean ± σ):  0.50% ±  3.39%

  [39m▅[39m█[39m▁[34m [39m[39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m█[39m█[34m▅[39m[39m▇[3

In [15]:
# `$` interpolates the non-constant globals so the benchmark measures the call
# itself rather than untyped global-variable access.
@benchmark Conv_forward_v5($input, $weights, $bias)

BenchmarkTools.Trial: 1732 samples with 1 evaluation.
 Range (min … max):  2.792 ms …   5.652 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     2.826 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   2.885 ms ± 284.097 μs  ┊ GC (mean ± σ):  0.03% ± 0.69%

  [39m█[34m▆[39m[39m▆[32m▃[39m[39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[34m█[39m[39m█[32m█[39m[39m█

In [16]:
# `$` interpolates the non-constant globals so the benchmark measures the call
# itself rather than untyped global-variable access.
@benchmark Conv_forward_v6($input, $weights, $bias)

BenchmarkTools.Trial: 2025 samples with 1 evaluation.
 Range (min … max):  2.402 ms …   5.250 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     2.434 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   2.467 ms ± 121.326 μs  ┊ GC (mean ± σ):  0.05% ± 1.06%

  [39m [39m█[39m▁[39m▅[39m [34m [39m[39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▅[39m█[39m█[39m█[39m█[34m▆[39

In [17]:
# `$` interpolates the non-constant globals so the benchmark measures the call
# itself rather than untyped global-variable access.
@benchmark Conv_forward_v7($input, $weights, $bias)

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):  325.900 μs …  4.470 ms  ┊ GC (min … max): 0.00% … 91.70%
 Time  (median):     335.000 μs              ┊ GC (median):    0.00%
 Time  (mean ± σ):   341.349 μs ± 60.977 μs  ┊ GC (mean ± σ):  0.48% ±  2.52%

  [39m [39m▅[39m█[39m [39m [39m▄[39m▇[34m [39m[39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▂[39m█[39m█[39m█[39

In [18]:
# Display output channel 7 of the v5 result for a visual comparison with the v7 cell.
Conv_forward_v5(input, weights, bias)[:,:,7]

26×26 Matrix{Float32}:
 16.8845  18.2594  18.7286  18.822   …  16.9721  17.3893  18.4491  18.3908
 18.5714  16.517   18.1688  18.5788     17.277   18.1204  16.7065  16.8356
 18.7299  16.5257  17.0319  16.8556     17.6947  17.9055  16.5244  18.2157
 20.9102  19.3535  17.7175  15.0227     16.8224  16.8294  17.7     18.2396
 20.6838  20.5951  18.2939  15.821      16.9613  16.0222  17.1757  18.1366
 18.3121  19.088   17.2587  16.4646  …  16.5458  16.5134  15.3376  15.3293
 17.4604  19.5499  18.1163  16.7063     16.4121  15.2534  15.1732  17.5077
 16.6046  17.7405  18.2582  18.7267     16.9517  15.2968  17.4078  17.7513
 15.7168  14.7998  17.5295  18.6343     17.6783  16.0969  17.5986  17.7251
 17.1229  16.8894  16.3873  17.3629     17.0878  16.8594  17.6072  17.9415
 19.4259  18.1652  18.3607  18.1661  …  17.6095  19.1117  18.0503  18.7448
 18.962   19.97    18.8933  17.9475     17.985   19.0002  19.2493  18.8378
 18.5388  18.0807  17.5033  17.8047     19.0425  18.8824  18.1337  17.3338
 1

In [22]:
# Display output channel 7 of the v7 result for a visual comparison with the v5 cell.
Conv_forward_v7(input, weights, bias)[:,:,7]

26×26 Matrix{Float32}:
 16.8845  18.2594  18.7286  18.822   …  16.972   17.3893  18.4491  18.3908
 18.5714  16.517   18.1688  18.5788     17.277   18.1204  16.7065  16.8356
 18.7299  16.5257  17.0319  16.8556     17.6947  17.9055  16.5244  18.2157
 20.9102  19.3535  17.7175  15.0227     16.8224  16.8294  17.7     18.2396
 20.6838  20.5951  18.2939  15.821      16.9613  16.0222  17.1757  18.1366
 18.3121  19.088   17.2587  16.4646  …  16.5458  16.5134  15.3376  15.3293
 17.4604  19.5499  18.1163  16.7063     16.4121  15.2534  15.1732  17.5077
 16.6046  17.7405  18.2582  18.7267     16.9517  15.2968  17.4078  17.7513
 15.7168  14.7998  17.5295  18.6343     17.6783  16.0969  17.5986  17.7251
 17.1229  16.8894  16.3873  17.3629     17.0878  16.8594  17.6072  17.9415
 19.4259  18.1652  18.3607  18.1661  …  17.6095  19.1117  18.0503  18.7448
 18.962   19.97    18.8933  17.9475     17.985   19.0002  19.2493  18.8378
 18.5388  18.0807  17.5033  17.8047     19.0425  18.8824  18.1337  17.3338
 1