In [1]:
import pickle
import numpy as np

def _load_cases(path):
    """Unpickle one file of reference test cases.

    Args:
        path (str): path of the pickle file, relative to the notebook.

    Returns:
        Whatever object was pickled into the file (the test cells below
        iterate over it and index each item by string key).

    """
    # NOTE(security): pickle.load can execute arbitrary code while decoding.
    # Only point this at case files that come from a trusted source.
    with open(path, 'rb') as case_file:
        return pickle.load(case_file)


conv_cases = _load_cases('../data/conv_cases.pkl')
max_pool_cases = _load_cases('../data/max_pool_cases.pkl')
fc_cases = _load_cases('../data/fc_cases.pkl')

In [2]:
"""All the layer functions go here.

"""

from __future__ import print_function, absolute_import
import numpy as np


class FullyConnected(object):
    """Fully connected layer 'y = Wx + b' for a single input vector.

    Arguments:
        d_in (int): the input size.
        d_out (int): the output size.
        weights_init (obj): optional initializer exposing
            ``initialize(shape)``; when omitted the weights are drawn from a
            standard normal distribution and cast to float32.
        bias_init (obj): optional initializer exposing ``initialize(shape)``;
            when omitted the biases are drawn from a standard normal
            distribution and cast to float32.
        name (str): the name of the layer.

    Attributes:
        W (np.array): the weights of the layer, shape (d_out, d_in).
        b (np.array): the biases of the layer, shape (d_out,).
        shape (tuple): (d_out, d_in), i.e. shape[0] is the output size and
            shape[1] is the input size.
        name (str): the name of the layer.

    """

    def __init__(
        self, d_in, d_out, weights_init=None, bias_init=None, name="FullyConnected"
    ):
        shape = (d_out, d_in)
        self.W = weights_init.initialize(shape) \
            if weights_init else np.random.randn(*shape).astype(np.float32)
        # The trailing comma matters: the initializer receives a 1-tuple
        # shape, consistent with the tuple passed for the weights, instead
        # of a bare int.
        self.b = bias_init.initialize((shape[0],)) \
            if bias_init else np.random.randn(shape[0]).astype(np.float32)
        self.shape = shape
        self.name = name

    def __repr__(self):
        return "{}({}, {})".format(self.name, self.shape[0], self.shape[1])

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        """Compute the layer output.

        Args:
            x (np.array): the input of the layer.

        Returns:
            The output of the layer, ``W.dot(x) + b``.

        """
        return np.dot(self.W, x) + self.b

    def backward(self, x, dv_y):
        """Compute the gradients of weights and biases and the gradient with
        respect to the input.

        Args:
            x (np.array): the input of the layer.
            dv_y (np.array): The derivative of the loss with respect to the
                output.

        Returns:
            dv_x (np.array): The derivative of the loss with respect to the
                input.
            dv_W (np.array): The derivative of the loss with respect to the
                weights.
            dv_b (np.array): The derivative of the loss with respect to the
                biases.

        """
        # y = Wx + b, hence dL/db = dL/dy. Return a copy so that callers who
        # update the bias gradient in place cannot corrupt their own dv_y.
        dv_b = np.array(dv_y, copy=True)
        # dL/dW[r, c] = dv_y[r] * x[c]
        dv_W = np.outer(dv_y, x)
        # dL/dx = W^T . dL/dy
        dv_x = np.dot(self.W.T, dv_y)

        # don't change the order of return values
        return dv_x, dv_W, dv_b

class Conv2D(object):
    """2D convolutional layer operating on one channels-first sample.

    Arguments:
        in_channel (int): number of input channels.
        out_channel (int): number of output channels (filters).
        kernel_size (tuple): (filter_height, filter_width).
        stride (int or tuple): the strides of the convolution operation. An
            int is used for both height and width.
        padding (int or tuple): number of zero paddings. An int is used for
            both height and width.
        weights_init (obj): optional initializer exposing
            ``initialize(shape)``; defaults to standard-normal float32.
        bias_init (obj): optional initializer exposing ``initialize(shape)``;
            defaults to standard-normal float32.
        name (str): the name of the layer.

    Attributes:
        W (np.array): the weights of the layer. A 4D array of shape (
            out_channels, in_channels, filter_height, filter_width).
        b (np.array): the biases of the layer. An array of shape (
            out_channels, 1).
        kernel_size (tuple): (filter_height, filter_width).
        stride (tuple): (height_stride, width_stride).
        padding (tuple): (height_padding, width_padding).
        name (str): the name of the layer.

    """

    def __init__(
            self, in_channel, out_channel, kernel_size, stride, padding,
            weights_init=None, bias_init=None, name="Conv2D"):
        filter_size = (out_channel, in_channel, *kernel_size)

        self.W = weights_init.initialize(filter_size) \
            if weights_init else np.random.randn(*filter_size).astype(np.float32)
        self.b = bias_init.initialize((filter_size[0], 1)) \
            if bias_init else np.random.randn(out_channel, 1).astype(np.float32)

        self.kernel_size = kernel_size
        # Accept Python and NumPy integers alike as "same value on both axes".
        self.stride = (stride, stride) \
            if isinstance(stride, (int, np.integer)) else stride
        self.padding = (padding, padding) \
            if isinstance(padding, (int, np.integer)) else padding
        self.name = name

    def __repr__(self):
        return "{}({}, {}, {})".format(
            self.name, self.kernel_size, self.stride, self.padding
        )

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        """Compute the layer output.

        Args:
            x (np.array): the input of the layer. A 3D array of shape (
                in_channels, in_height, in_width).

        Returns:
            The output of the layer. A float32 3D array of shape (
                out_channels, out_height, out_width).

        Raises:
            AssertionError: if the kernel does not fit into the padded input
                along the height or the width.

        """
        p, s = self.padding, self.stride
        k_h, k_w = self.W.shape[2], self.W.shape[3]
        x_padded = np.pad(
            x, ((0, 0), (p[0], p[0]), (p[1], p[1])), mode='constant'
        )

        # check dimensions
        # Floor division is required here: with true division a kernel larger
        # than the padded input slips through whenever the stride is > 1
        # (e.g. -1 / 2 + 1 == 0.5 > 0) and only fails later while broadcasting.
        out_h = (x.shape[1] - k_h + 2 * p[0]) // s[0] + 1
        out_w = (x.shape[2] - k_w + 2 * p[1]) // s[1] + 1
        assert out_h > 0, 'Height doesn\'t work'
        assert out_w > 0, 'Width doesn\'t work'

        y = np.empty((self.W.shape[0], out_h, out_w), dtype=np.float32)
        # Flatten the bias so that both (out_channels,) and (out_channels, 1)
        # layouts add one scalar per output channel.
        bias = np.reshape(self.b, -1)

        for i in range(out_h):
            rows = slice(i * s[0], i * s[0] + k_h)
            for j in range(out_w):
                cols = slice(j * s[1], j * s[1] + k_w)
                # The patch broadcasts against every filter at once; summing
                # the last three axes leaves one value per output channel.
                y[:, i, j] = np.sum(
                    self.W * x_padded[:, rows, cols], axis=(1, 2, 3)
                ) + bias
        return y

    def backward(self, x, dv_y):
        """Compute the gradients of weights and biases and the gradient with
        respect to the input.

        Every output element y[k, i, j] is the sum of
        W[k] * x_padded[:, i*s0 : i*s0+kh, j*s1 : j*s1+kw], so its upstream
        gradient is scattered back onto exactly that window of the padded
        input and onto W[k]. This holds for any stride and padding.

        Args:
            x (np.array): the input of the layer. A 3D array of shape (
                in_channels, in_height, in_width).
            dv_y (np.array): The derivative of the loss with respect to the
                output. A 3D array of shape (out_channels, out_height,
                out_width).

        Returns:
            dv_x (np.array): The derivative of the loss with respect to the
                input. float64, same shape as x.
            dv_W (np.array): The derivative of the loss with respect to the
                weights. float64, same shape as self.W
            dv_b (np.array): The derivative of the loss with respect to the
                biases. float64, same shape as self.b

        """
        p, s = self.padding, self.stride
        k_h, k_w = self.W.shape[2], self.W.shape[3]
        x_padded = np.pad(
            x, ((0, 0), (p[0], p[0]), (p[1], p[1])), mode='constant'
        )

        # Accumulate everything in float64, as the returned dtypes promise.
        grad = np.asarray(dv_y, dtype=np.float64)
        weights = np.asarray(self.W, dtype=np.float64)

        dv_W = np.zeros(self.W.shape, dtype=np.float64)
        # The gradient is gathered in the padded frame and cropped at the
        # end; gradient that lands on padding cells is simply discarded.
        dv_x_padded = np.zeros(x_padded.shape, dtype=np.float64)

        # Each bias is added to every spatial position of its channel.
        dv_b = grad.sum(axis=(1, 2)).reshape(np.shape(self.b))

        for i in range(grad.shape[1]):
            rows = slice(i * s[0], i * s[0] + k_h)
            for j in range(grad.shape[2]):
                cols = slice(j * s[1], j * s[1] + k_w)
                g = grad[:, i, j]  # one upstream value per output channel

                # dL/dW[k] += g[k] * (input window seen by this output)
                dv_W += g[:, None, None, None] * x_padded[None, :, rows, cols]
                # dL/dx_window += sum_k g[k] * W[k]
                dv_x_padded[:, rows, cols] += np.tensordot(
                    g, weights, axes=([0], [0])
                )

        dv_x = dv_x_padded[
            :, p[0]:p[0] + x.shape[1], p[1]:p[1] + x.shape[2]
        ].copy()

        # don't change the order of return values
        return dv_x, dv_W, dv_b

class MaxPool2D:
    """2D max-pooling layer operating on one channels-first sample.

    Arguments:
        kernel_size (tuple): (window_height, window_width).
        stride (int or tuple): the strides of the pooling window. An int is
            used for both height and width.
        padding (int or tuple): number of zero paddings. An int is used for
            both height and width.
        name (str): the name of the layer.

    Attributes:
        kernel_size (tuple): (window_height, window_width).
        stride (tuple): (height_stride, width_stride).
        padding (tuple): (height_padding, width_padding).
        name (str): the name of the layer.

    """

    def __init__(self, kernel_size, stride, padding, name="MaxPool2D"):
        self.kernel_size = kernel_size
        # Accept Python and NumPy integers alike as "same value on both axes".
        self.stride = (stride, stride) \
            if isinstance(stride, (int, np.integer)) else stride
        self.padding = (padding, padding) \
            if isinstance(padding, (int, np.integer)) else padding
        self.name = name

    def __repr__(self):
        return "{}({}, {}, {})".format(
            self.name, self.kernel_size, self.stride, self.padding
        )

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        """Compute the layer output.

        Arguments:
            x {[np.array]} -- the input of the layer. A 3D array of shape (
                              in_channels, in_height, in_width).
        Returns:
            The output of the layer. A float32 3D array of shape (
                in_channels, out_height, out_width).
        Raises:
            AssertionError: if the window does not fit into the padded input
                along the height or the width.
        """
        p, s = self.padding, self.stride
        k_h, k_w = self.kernel_size[0], self.kernel_size[1]
        # NOTE(review): the border is padded with zeros, so a padding cell
        # wins the max whenever every real value in a window is negative.
        # Some frameworks pad with -inf instead; confirm which is intended.
        x_padded = np.pad(
            x, ((0, 0), (p[0], p[0]), (p[1], p[1])), mode='constant'
        )

        # check dimensions
        # Floor division: with true division a window larger than the padded
        # input passes the check whenever the stride is > 1.
        out_h = (x.shape[1] - k_h + 2 * p[0]) // s[0] + 1
        out_w = (x.shape[2] - k_w + 2 * p[1]) // s[1] + 1
        assert out_h > 0, 'Height doesn\'t work'
        assert out_w > 0, 'Width doesn\'t work'

        y = np.empty((x.shape[0], out_h, out_w), dtype=np.float32)

        for i in range(out_h):
            rows = slice(i * s[0], i * s[0] + k_h)
            for j in range(out_w):
                cols = slice(j * s[1], j * s[1] + k_w)
                # Per-channel maximum over the spatial window.
                y[:, i, j] = np.max(x_padded[:, rows, cols], axis=(1, 2))

        return y

    def backward(self, x, dv_y):
        """Compute the gradient with respect to the input.

        Each output element passes its whole gradient to the position that
        held the window maximum; on ties the first maximum in row-major
        order receives it. Overlapping windows accumulate.

        Args:
            x (np.array): the input of the layer. A 3D array of shape (
                in_channels, in_height, in_width).
            dv_y (np.array): The derivative of the loss with respect to the
                output. A 3D array of shape (in_channels, out_height,
                out_width).

        Returns:
            dv_x (np.array): The derivative of the loss with respect to the
                input. float64, same shape as x.
        """
        p, s = self.padding, self.stride
        k_h, k_w = self.kernel_size[0], self.kernel_size[1]
        x_padded = np.pad(
            x, ((0, 0), (p[0], p[0]), (p[1], p[1])), mode='constant'
        )

        # Window coordinates live in the padded frame, so the gradient is
        # collected there and cropped afterwards. Gradient routed to a
        # padding cell (possible because forward() pads with zeros) is
        # discarded by the crop.
        dv_x_padded = np.zeros(x_padded.shape, dtype=np.float64)
        channels = np.arange(x.shape[0])

        for i in range(dv_y.shape[1]):
            top = i * s[0]
            for j in range(dv_y.shape[2]):
                left = j * s[1]
                window = x_padded[:, top:top + k_h, left:left + k_w]
                # argmax needs no sentinel value and returns the first
                # maximum of each flattened (row-major) window.
                winners = np.argmax(window.reshape(x.shape[0], -1), axis=1)
                d_row, d_col = np.divmod(winners, window.shape[2])
                # One distinct index per channel, so fancy-index += is safe.
                dv_x_padded[channels, top + d_row, left + d_col] += dv_y[:, i, j]

        return dv_x_padded[
            :, p[0]:p[0] + x.shape[1], p[1]:p[1] + x.shape[2]
        ].copy()

In [11]:
# Compare Conv2D.backward against the reference gradients of every stored
# case. The loop variables x, grad_output, grad_x and dv_x are deliberately
# kept under these names because later cells print them.
for case in conv_cases:
    weight, bias = case['weight'], case['bias']
    out_c, in_c, h, w = weight.shape
    x, out = case['x'], case['out']
    stride, pad = case['stride'], case['pad']
    grad_output = case['grad_output']
    grad_x, grad_w, grad_b = case['grad_x'], case['grad_w'], case['grad_b']

    # Build the layer with matching geometry, then overwrite the randomly
    # initialised parameters with the reference ones.
    conv = Conv2D(
        in_channel=in_c, out_channel=out_c, kernel_size=(h, w),
        stride=stride, padding=pad,
    )
    conv.W, conv.b = weight, bias

    # The forward result is computed but not compared against `out` here.
    test_out = conv(x)
    dv_x, dv_W, dv_b = conv.backward(x, grad_output)

    for expected, actual in ((grad_x, dv_x), (grad_w, dv_W), (grad_b, dv_b)):
        print(np.allclose(expected, actual, rtol=0.0001))
    print('\n\n')

True
True
True



True
True
True



True
True
True





In [12]:
# Dump the full computed input gradient for eyeballing against the reference.
# NOTE(review): `dv_x` is not defined in this cell; it is whatever the most
# recently executed test-loop cell left behind on its last iteration, so the
# output depends on cell execution order.
print(dv_x)

[[[-1.15467713e-01 -2.55054446e-01 -1.88274135e-01  1.77041068e-01
    1.09686033e+00  5.64804397e-01  4.01296492e-01  4.64555789e-01
    2.75689773e-01  4.22841851e-02  2.94088820e-01 -4.49973862e-01
    1.82049539e-01  5.32507584e-01  3.60694276e-01  3.10940458e-02
   -1.66649012e-01  5.02818815e-01  8.73779701e-02  6.77837385e-02
   -3.16725380e-01 -3.23223080e-01 -2.99189419e-01]
  [-3.34615381e-01  6.21533096e-02  6.43994516e-01 -7.49668491e-01
   -3.41789896e-01 -4.12406493e-01 -4.94804570e-01 -3.22321763e-02
   -6.93489696e-02  2.37267772e-01 -9.65248972e-01  1.27701937e+00
   -1.01153437e+00  5.64872457e-01 -8.01300549e-01  3.10140738e-01
    3.84574319e-01  7.54025076e-01 -1.13202819e+00 -1.07711572e+00
   -3.72776708e-01  8.16722354e-01  3.87704644e-01]
  [-1.55423583e-01 -8.75054419e-01  2.37540173e-01  8.01017160e-01
    6.52695438e-01 -6.99188388e-01 -8.84186700e-01 -7.93934746e-01
    8.48170402e-01 -2.91464847e-01 -9.94580250e-01  5.11115480e-01
    2.05940167e-01  1.476

In [13]:
# Dump the full reference input gradient for eyeballing against `dv_x`.
# NOTE(review): `grad_x` is not defined in this cell; it is whatever the most
# recently executed test-loop cell left behind on its last iteration, so the
# output depends on cell execution order.
print(grad_x)

[[[-1.15467712e-01 -2.55054444e-01 -1.88274115e-01  1.77041069e-01
    1.09686041e+00  5.64804435e-01  4.01296496e-01  4.64555770e-01
    2.75689781e-01  4.22841758e-02  2.94088811e-01 -4.49973911e-01
    1.82049513e-01  5.32507598e-01  3.60694289e-01  3.10940426e-02
   -1.66649014e-01  5.02818823e-01  8.73779356e-02  6.77837431e-02
   -3.16725373e-01 -3.23223084e-01 -2.99189419e-01]
  [-3.34615380e-01  6.21533394e-02  6.43994510e-01 -7.49668598e-01
   -3.41789961e-01 -4.12406504e-01 -4.94804591e-01 -3.22321877e-02
   -6.93489835e-02  2.37267733e-01 -9.65248942e-01  1.27701938e+00
   -1.01153433e+00  5.64872503e-01 -8.01300526e-01  3.10140729e-01
    3.84574294e-01  7.54025042e-01 -1.13202834e+00 -1.07711577e+00
   -3.72776657e-01  8.16722393e-01  3.87704641e-01]
  [-1.55423582e-01 -8.75054419e-01  2.37540230e-01  8.01017106e-01
    6.52695417e-01 -6.99188471e-01 -8.84186745e-01 -7.93934703e-01
    8.48170400e-01 -2.91464806e-01 -9.94580269e-01  5.11115432e-01
    2.05940202e-01  1.476

In [4]:
# Compare FullyConnected.backward against the reference gradients of every
# stored case. x, grad_output, grad_x and dv_x keep these names because
# other cells may print them afterwards.
for case in fc_cases:
    weight, bias = case['weight'], case['bias']
    out_c, in_c = weight.shape
    x = case['x'].astype(np.float32)
    out = case['out']
    grad_output = case['grad_output']
    grad_x, grad_w, grad_b = case['grad_x'], case['grad_w'], case['grad_b']

    # Overwrite the random initial parameters with the reference ones.
    fc = FullyConnected(d_in=in_c, d_out=out_c)
    fc.W, fc.b = weight, bias

    # The forward result is computed but not compared against `out` here.
    test_out = fc(x)
    dv_x, dv_W, dv_b = fc.backward(x, grad_output)

    # Only the input gradient uses a loosened relative tolerance.
    print(np.allclose(grad_x, dv_x, rtol=0.001))
    for expected, actual in ((grad_w, dv_W), (grad_b, dv_b)):
        print(np.allclose(expected, actual))
    print('\n\n')

True
True
True



True
True
True



True
True
True





In [14]:
# Compare MaxPool2D.backward against the reference input gradient of every
# stored case. x, grad_output, grad_x and dv_x keep these names because the
# cells below print slices of them.
for case in max_pool_cases:
    # The stored kernel is a single number; the layer wants (height, width).
    kernel = (case['kernel'],) * 2
    stride, pad = case['stride'], case['pad']
    x, out = case['x'], case['out']
    grad_output, grad_x = case['grad_output'], case['grad_x']

    max_pool = MaxPool2D(kernel_size=kernel, stride=stride, padding=pad)
    # The forward result is computed but not compared against `out` here.
    test_out = max_pool(x)
    dv_x = max_pool.backward(x, grad_output)

    matches = np.allclose(grad_x, dv_x, rtol=0.0001)
    print(matches, '\n\n')

True 


True 


True 




In [6]:
# Show the first six entries of rows 0-5 of dv_x[0], one row per line.
for row in range(6):
    print(dv_x[0][row][:6])

[ 0.14866087  0.          0.         -1.75294259  0.          0.        ]
[0. 0. 0. 0. 0. 0.]
[ 0.          0.          0.          0.         -0.55981082  0.        ]
[ 1.78742021  0.         -0.08431914  0.          0.         -2.06593116]
[-0.59099364  0.          0.          0.          0.          0.        ]
[ 0.         -1.67617209 -2.48737401  0.          0.          0.        ]


In [8]:
# Show the first six entries of rows 0-5 of grad_output[0], one row per line.
for row in range(6):
    print(grad_output[0][row][:6])

[ 0.14866087  0.40237263 -0.82060426 -1.334711    1.0564412   0.13685726]
[ 0.5318551   0.97044003 -1.5117408  -1.0416868  -0.937418    0.7219324 ]
[ 1.4502536   0.3266159  -0.1045112   0.09457997 -1.1984186   0.4902077 ]
[-0.1946885   0.17932318 -1.3813751  -1.1188244   0.33398527  0.90224385]
[-0.59099364 -1.9334221  -0.85108024 -0.7435231   1.1538576   0.01682013]
[ 0.24902421 -0.1710974  -1.6362938   1.0555542   0.32565504  0.9463995 ]


In [7]:
# Show the first six entries of rows 0-5 of grad_x[0], one row per line.
for row in range(6):
    print(grad_x[0][row][:6])

[ 0.14866087  0.          0.         -1.7529426   0.          0.        ]
[0. 0. 0. 0. 0. 0.]
[ 0.         0.         0.         0.        -0.5598108  0.       ]
[ 1.7874203   0.         -0.08431911  0.          0.         -2.065931  ]
[-0.59099364  0.          0.          0.          0.          0.        ]
[ 0.        -1.6761721 -2.487374   0.         0.         0.       ]


In [9]:
# Show the first six entries of rows 0-5 of x[0], one row per line.
for row in range(6):
    print(x[0][row][:6])

[ 1.769817   -1.1600094   0.39855713  2.5746279   0.2757979   0.61031055]
[-1.2590202  -0.02713936  0.04868459 -0.19278368  0.51870173 -1.8024259 ]
[-0.8047651  -0.8062368   0.16242757 -0.93263453  0.61033213 -0.00504397]
[ 1.6493801   0.24222896  0.25751153 -1.067929   -0.25932866  0.7788012 ]
[ 1.1260225  -0.74210197 -0.26923501 -1.8268266  -0.47851646 -0.6956416 ]
[-0.2544994   0.5801176   0.21740314 -1.3613878   0.16488637 -0.04285244]
