In [4]:
import numpy as np
import nn

# Implementing max pooling
For the second part of the exercise you should implement a pooling operation. We will reuse the same example as in 2a, only applying pooling rather than convolution.

First, implement pooling similarly to convolution: as a function that takes an input tensor of images and computes the maximum in small regions of size (pool_h, pool_w) with stride (stride_y, stride_x). In addition to computing the maximum you should also store the maximum locations. The easiest way to do this is to create a tensor that has the same shape as the pool output plus one trailing dimension of size two that holds the y and x position of the maximum (note that the maximum is taken channel-wise). These switches are essential for implementing the backward pass later on!

In [12]:
def pool(imgs, poolout, switches, pool_h, pool_w, stride_y, stride_x):
    """
    Channel-wise max pooling that also records where each maximum was found.

    Results are written in place into `poolout` and `switches`; nothing is
    returned. One pooling window is evaluated for every spatial entry of
    `poolout`, so the caller decides the output size by how it allocates
    `poolout`. Windows that reach past the bottom/right image border are
    truncated to the part that lies inside the image.

    Parameters:
    -----------
    imgs: input tensor of size (batch_size, chan_in, height, width)
    poolout: the output tensor of size (batch_size, chan_in, out_h, out_w),
             e.g. out_h = height//stride_y and out_w = width//stride_x
    switches: integer tensor of size (batch_size, chan_in, out_h, out_w, 2);
              the last dimension holds the (y, x) position of the maximum
              element, expressed in coordinates of the input image
    pool_h: the height of the pooling regions
    pool_w: the width of the pooling regions
    stride_y: the step size in y direction (e.g. if you want non-overlapping pooling set stride_y = pool_h)
    stride_x: the step size in x direction

    Raises:
    -------
    ValueError: if a pooling size or stride is smaller than 1, or if
                `poolout` asks for a window that starts outside the image
    """
    if pool_h < 1 or pool_w < 1 or stride_y < 1 or stride_x < 1:
        raise ValueError("pool sizes and strides must be at least 1")

    batch_size, chan_in, height, width = imgs.shape
    out_h, out_w = poolout.shape[2], poolout.shape[3]

    # every window must start inside the image, otherwise it would be empty
    if (out_h and (out_h - 1) * stride_y >= height) or \
       (out_w and (out_w - 1) * stride_x >= width):
        raise ValueError("poolout is too large for the given image size and strides")

    for oy in range(out_h):
        y0 = oy * stride_y
        for ox in range(out_w):
            x0 = ox * stride_x
            # slicing clips at the image border, so the window may be smaller
            # than (pool_h, pool_w) for the last row/column of outputs
            window = imgs[:, :, y0:y0 + pool_h, x0:x0 + pool_w]
            win_h, win_w = window.shape[2], window.shape[3]
            # flatten the window so one argmax per (batch, channel) suffices;
            # the explicit size (instead of -1) also works for empty batches
            flat = window.reshape(batch_size, chan_in, win_h * win_w)
            # argmax returns the first maximum in row-major order on ties
            idx = flat.argmax(axis=2)
            poolout[:, :, oy, ox] = flat.max(axis=2)
            # convert the flat window index back to absolute image coordinates
            switches[:, :, oy, ox, 0] = y0 + idx // win_w
            switches[:, :, oy, ox, 1] = x0 + idx % win_w

Once you have implemented the above, we can try it on a simple minimal example: a (4,4) image with one channel.

In [13]:
# a single 4x4 image counting from 1 to 16, shaped (batch, channel, height, width)
img = np.arange(1, 17, dtype=np.float64).reshape(1, 1, 4, 4)
# the first axis is the batch size; duplicate the image along it
# so that the test covers more than one sample
imgs = np.repeat(img, repeats=2, axis=0)
print(imgs.shape)

(2, 1, 4, 4)


In [21]:
# to test we will pool in 2x2 regions with stride 2
img_h = img.shape[2]
img_w = img.shape[3]
stride_y, stride_x = 2, 2
pool_h, pool_w = 2, 2
# one output entry per stride step, so the spatial size shrinks by the stride
poolout_h = img_h // stride_y
poolout_w = img_w // stride_x
# pooling is channel wise, so batch size and channel count carry over unchanged
poolout = np.zeros((imgs.shape[0], imgs.shape[1], poolout_h, poolout_w))
# also create storage for the switches: a (y, x) pair per output entry.
# The builtin int is used because the np.int alias was removed in NumPy 1.24;
# it selects the same dtype.
switches = np.zeros(poolout.shape + (2,), dtype=int)
print(poolout.shape)
print(switches.shape)

(2, 1, 2, 2)
(2, 1, 2, 2, 2)


In [31]:
# run the pooling; pool is expected to write its results into poolout and switches
pool(
    imgs,
    poolout,
    switches,
    pool_h=pool_h,
    pool_w=pool_w,
    stride_y=stride_y,
    stride_x=stride_x,
)

In [25]:
# print the pooled output so it can be compared to the desired output below
print(poolout)

[[[[  6.   8.]
   [ 14.  16.]]]


 [[[  6.   8.]
   [ 14.  16.]]]]


In [27]:
# reference result: the maximum of every 2x2 block of the test image,
# repeated for both batch entries -> shape (2, 1, 2, 2)
real_output = np.tile(
    np.array([[6.0, 8.0], [14.0, 16.0]], dtype=np.float64),
    (2, 1, 1, 1),
)

In [29]:
# tolerance for the comparison against the reference output
eps = 1e-4
# norm of the element-wise difference between reference and pooled result
diff = np.linalg.norm(real_output - poolout)
print("Diff {}".format(diff))
# the two results must agree up to eps
assert diff < eps

Diff 0.0


In [30]:
# we can also take a look at the switches; the last axis holds the
# y and x position of each maximum
print(switches)

[[[[[1 1]
    [1 3]]

   [[3 1]
    [3 3]]]]



 [[[[1 1]
    [1 3]]

   [[3 1]
    [3 3]]]]]
