# Speeding up Convolution

- toc: true
- badges: true
- comments: false
- categories: [jax, convolution]
- hide: true

## Introduction

## Import Libraries

I need numpy and tensorflow for the baseline implementations, plus numba for the JIT-compiled version further down.

In [2]:
import numpy as np
import numba
import tensorflow as tf

## Implementation from First Principles

In [3]:
def filter_image(image, filters, strides):
    """Slide a bank of filters over a single image (no padding, no bias).

    Each output value is the sum of the element-wise product between one
    filter and the image window under it. The kernel is not flipped, so this
    is a cross-correlation.

    Args:
        image: Array of shape (height, width, in_channels).
        filters: Array of shape
            (kernel_height, kernel_width, in_channels, out_channels).
        strides: Pair (row_stride, col_stride).

    Returns:
        Float array of shape (out_height, out_width, out_channels) where
        out_height = 1 + (height - kernel_height) // row_stride and
        out_width is computed the same way from the widths.

    Raises:
        ValueError: If the image and the filters disagree on the number of
            input channels.
    """
    xm, xn, nc = image.shape
    km, kn, ni, no = filters.shape

    # Without this check a 1-channel image (or 1-channel filter) would be
    # silently broadcast against the other operand and give wrong sums.
    if nc != ni:
        raise ValueError(
            f"image has {nc} channels but filters expect {ni} input channels"
        )

    sm, sn = strides
    ym, yn = 1 + ((xm - km) // sm), 1 + ((xn - kn) // sn)
    y = np.zeros((ym, yn, no))

    for iy, ix in enumerate(range(0, xm - km + 1, sm)):
        for jy, jx in enumerate(range(0, xn - kn + 1, sn)):
            # Window of the image currently covered by the kernel.
            chunk = image[ix:ix + km, jx:jx + kn, :]
            # Apply each output filter to this window (bias is added by
            # the caller).
            for channel in range(no):
                y[iy, jy, channel] = np.sum(filters[:, :, :, channel] * chunk)

    return y

def filter_images_v1(batch, filters, biases, strides):
    """Filter a batch by running ``filter_image`` on one image at a time.

    Args:
        batch: Iterable of images; each is passed unchanged to
            ``filter_image``.
        filters: Filter bank forwarded to ``filter_image``.
        biases: Added to the stacked results with NumPy broadcasting.
        strides: Stride pair forwarded to ``filter_image``.

    Returns:
        The per-image results stacked into one array, plus ``biases``.
    """
    stacked = np.array(
        [filter_image(single, filters, strides) for single in batch]
    )
    return stacked + biases

In [4]:
def filter_images_v2(images, filters, biases, strides):
    """Filter a whole batch at once, vectorising over the batch axis.

    For every kernel position the same window is cut out of all images
    simultaneously, multiplied element-wise with each filter and summed over
    the window and input-channel axes. No padding is applied and the kernel
    is not flipped.

    Args:
        images: Array of shape (batch, height, width, in_channels).
        filters: Array of shape
            (kernel_height, kernel_width, in_channels, out_channels).
        biases: Added to the result with NumPy broadcasting (one value per
            output channel broadcasts over the last axis).
        strides: Pair (row_stride, col_stride).

    Returns:
        Float array of shape (batch, out_height, out_width, out_channels)
        where out_height = 1 + (height - kernel_height) // row_stride and
        out_width is computed the same way from the widths.

    Raises:
        ValueError: If the images and the filters disagree on the number of
            input channels.
    """
    batch_size, xm, xn, num_channels = images.shape
    km, kn, num_input_channels, num_filters = filters.shape

    # Without this check a size-1 channel axis on either operand would be
    # silently broadcast and give wrong sums.
    if num_channels != num_input_channels:
        raise ValueError(
            f"images have {num_channels} channels but filters expect "
            f"{num_input_channels} input channels"
        )

    sm, sn = strides
    ym, yn = 1 + ((xm - km) // sm), 1 + ((xn - kn) // sn)
    y = np.zeros((batch_size, ym, yn, num_filters))

    # convolution step
    for iy, ix in enumerate(range(0, xm - km + 1, sm)):
        for jy, jx in enumerate(range(0, xn - kn + 1, sn)):
            # Same window taken from every image in the batch.
            chunk = images[:, ix:ix + km, jx:jx + kn, :]
            for channel in range(num_filters):
                # The filter broadcasts over the leading batch axis; summing
                # over axes 1-3 leaves one value per image.
                y[:, iy, jy, channel] = np.sum(
                    filters[:, :, :, channel] * chunk, axis=(1, 2, 3)
                )

    # add bias
    y += biases
    return y

In [5]:
# Reference layer: 4 filters of size 4x4, stride 1 and 'valid' (i.e. no)
# padding, so its output can be compared directly with the NumPy
# implementations, which do not pad either.
# NOTE(review): the non-default bias initializer is presumably there so the
# biases are non-zero and the bias addition is actually exercised - confirm.
layer_keras = tf.keras.layers.Conv2D(
    filters=4, 
    kernel_size=(4, 4), 
    strides=(1,1), 
    bias_initializer='he_uniform', 
    padding='valid'
)

In [6]:
# Random test batch laid out as (batch, height, width, channels) = (32, 28, 28, 3).
inputs = np.random.randn(32,28,28,3)

In [8]:
# Reference output from Keras.
# NOTE(review): this first call presumably also creates the layer's weights,
# which are read back with get_weights() afterwards - confirm.
outputs_keras = layer_keras(inputs)

In [9]:
# Reuse the Keras layer's kernel, biases and strides so the NumPy versions
# run with exactly the same parameters as the reference.
filters, biases = layer_keras.get_weights()
strides = layer_keras.strides

In [10]:
# Version 1: one image at a time.
outputs_v1 = filter_images_v1(inputs, filters, biases, strides)

In [11]:
# Version 2: vectorised over the batch axis.
outputs_v2 = filter_images_v2(inputs, filters, biases, strides)

To check that the Keras output and the numpy outputs are approximately equal, I make sure every element agrees within a small tolerance (an absolute tolerance of `1e-6` on top of NumPy's default relative tolerance).

In [12]:
# Version 1 must match the Keras reference element-wise within tolerance.
assert np.allclose(outputs_keras, outputs_v1, atol=1e-6)

In [13]:
# Version 2 must match the Keras reference element-wise within tolerance.
assert np.allclose(outputs_keras, outputs_v2, atol=1e-6)

Although the outputs are about the same, the Keras version runs much faster, as the following benchmarks show.    

In [14]:
%%timeit -n 10
# Benchmark version 1 (one image at a time).
filter_images_v1(inputs, filters, biases, strides)

381 ms ± 6.39 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
%%timeit -n 10
# Benchmark version 2 (vectorised over the batch axis).
filter_images_v2(inputs, filters, biases, strides)

23.3 ms ± 596 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
%%timeit -n 10
# Benchmark the Keras reference layer on the same inputs.
layer_keras(inputs)

1.19 ms ± 354 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Using numba

In [25]:
def filter_images_v3(images, filters, biases, strides):
    """Filter a batch using only explicit scalar loops.

    Every output element is accumulated one multiply-add at a time over the
    kernel rows, kernel columns and input channels, using nothing but plain
    indexing inside the loop nest. No padding is applied and the kernel is
    not flipped.

    Args:
        images: Array indexed as (item, row, col, input_channel).
        filters: Array indexed as
            (kernel_row, kernel_col, input_channel, output_channel).
        biases: Added to the result with NumPy broadcasting.
        strides: Pair (row_stride, col_stride).

    Returns:
        Float array of shape (batch, out_rows, out_cols, num_filters).
    """
    n_images, in_rows, in_cols, _ = images.shape
    k_rows, k_cols, n_in, n_out = filters.shape
    row_step, col_step = strides

    out_rows = 1 + ((in_rows - k_rows) // row_step)
    out_cols = 1 + ((in_cols - k_cols) // col_step)
    result = np.zeros((n_images, out_rows, out_cols, n_out))

    # Visit every kernel position: (r_out, c_out) index the output, and
    # (top, left) mark the window's upper-left corner in the input.
    for r_out, top in enumerate(range(0, in_rows - k_rows + 1, row_step)):
        for c_out, left in enumerate(range(0, in_cols - k_cols + 1, col_step)):
            window = images[:, top:top + k_rows, left:left + k_cols, :]
            for item in range(n_images):
                for out_chan in range(n_out):
                    acc = 0.0
                    for h in range(k_rows):
                        for w in range(k_cols):
                            for in_chan in range(n_in):
                                acc += (
                                    window[item, h, w, in_chan]
                                    * filters[h, w, in_chan, out_chan]
                                )
                    result[item, r_out, c_out, out_chan] = acc

    # Biases are added once, after all windows have been accumulated.
    result += biases
    return result

In [26]:
# Version 3 run as ordinary Python (not JIT-compiled).
outputs_v3 = filter_images_v3(inputs, filters, biases, strides)

In [27]:
# Version 3 must match the Keras reference element-wise within tolerance.
assert np.allclose(outputs_keras, outputs_v3, atol=1e-6)

In [28]:
%%timeit -n 1
# Benchmark version 3 as ordinary Python; a single loop per run is requested.
filter_images_v3(inputs, filters, biases, strides)

2.02 s ± 60.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
# Wrap the explicit-loop version with numba's JIT compiler.
# NOTE(review): no signature is given, so compilation is presumably deferred
# until the first call - confirm.
jitted_filter_images_v3 = numba.jit(filter_images_v3)

In [30]:
# First call of the jitted function.
# NOTE(review): if compilation is lazy, this call absorbs the compile time so
# it does not end up in the benchmark that follows - confirm.
jitted_outputs_v3 = jitted_filter_images_v3(inputs, filters, biases, strides)

In [31]:
# The jitted result must match the Keras reference element-wise within tolerance.
assert np.allclose(outputs_keras, jitted_outputs_v3, atol=1e-6)

In [32]:
%%timeit -n 10
# Benchmark the numba-jitted version 3.
jitted_filter_images_v3(inputs, filters, biases, strides)

5.65 ms ± 331 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
