In [89]:
import numpy as np
import random

In [90]:
class Conv3x3:
  '''
  A convolution layer made of 3x3 filters, applied with valid padding
  (no padding, stride 1).
  '''

  def __init__(self, num_filters):
    '''
    - num_filters is the number of 3x3 filters held by the layer
    '''
    self.num_filters = num_filters

    # One 3x3 kernel per filter -> shape (num_filters, 3, 3). The standard
    # normal samples are scaled down by 9 to shrink the initial variance.
    self.filters = np.random.randn(num_filters, 3, 3) / 9

  def iterate_regions(self, image):
    '''
    Yields every 3x3 window of a 2d image in row-major order, together
    with the window's top-left coordinates.
    - image is a 2d numpy array
    Each item is a tuple (im_region, i, j).
    '''
    rows, cols = image.shape

    yield from (
      (image[top:(top + 3), left:(left + 3)], top, left)
      for top in range(rows - 2)
      for left in range(cols - 2)
    )

  def forward(self, input):
    '''
    Runs the layer on a single 2d image.
    Returns a 3d numpy array with dimensions (h - 2, w - 2, num_filters).
    - input is a 2d numpy array with dimensions (h, w)
    '''
    rows, cols = input.shape
    feature_maps = np.zeros((rows - 2, cols - 2, self.num_filters))

    for patch, top, left in self.iterate_regions(input):
      # Broadcasting multiplies the patch with every filter at once; the
      # sum over the two spatial axes leaves one response per filter.
      feature_maps[top, left] = (patch * self.filters).sum(axis=(1, 2))

    return feature_maps

In [91]:
from tensorflow.keras import datasets, layers, models
import keras
import tensorflow as tf

In [92]:
# Load Fashion-MNIST through Keras. load_data() is unpacked into a
# (data, labels) pair for training and another for testing; the four names
# bound here are module-level and are read by later cells.
((train_data, train_labels),
 (test_data, test_labels)) = tf.keras.datasets.fashion_mnist.load_data()

In [93]:
# Smoke-test the conv layer: 8 randomly initialised 3x3 filters applied to the
# first training image. The image is passed as loaded, without any rescaling.
conv = Conv3x3(8)
output = conv.forward(train_data[0])
# Show the random filter values and the shape of the resulting feature maps.
print(conv.filters)
print(output.shape)

[[[-0.16579106  0.04157581  0.18261762]
  [ 0.02589571  0.04870474 -0.04001759]
  [ 0.03043945 -0.00339932  0.06048421]]

 [[-0.05520445 -0.00963506 -0.07474203]
  [-0.06806303  0.01596686  0.03544576]
  [-0.0733058   0.08596688  0.05192666]]

 [[ 0.18306948  0.24292954  0.00098075]
  [ 0.21271711  0.13813746  0.04442928]
  [-0.05206455 -0.05296049  0.02648289]]

 [[ 0.02287851 -0.12688691 -0.1565667 ]
  [ 0.16806256  0.09273228  0.02998046]
  [-0.04214815  0.16788204 -0.1187797 ]]

 [[ 0.12209162 -0.13940288 -0.08098008]
  [-0.12857576  0.07461258 -0.00906119]
  [-0.09104775 -0.06795708 -0.14713846]]

 [[-0.01371948 -0.08388236  0.02789888]
  [ 0.20024302  0.08422004 -0.12890142]
  [ 0.01407955 -0.06800342  0.03944906]]

 [[-0.21172107 -0.01246743  0.17083178]
  [ 0.01787524 -0.02465381 -0.05946606]
  [-0.2502538  -0.13698581  0.17217904]]

 [[-0.1046654   0.04590083 -0.18127095]
  [-0.09283195  0.02560757  0.14189243]
  [ 0.36817788 -0.07548666 -0.05800406]]]
(26, 26, 8)


## implement pooling

In [94]:
class MaxPool2:
    '''
    A max-pooling layer using a pool size of 2 (non-overlapping 2x2 windows).
    '''

    def iterate_regions(self, image):
        '''
        Generates non-overlapping 2x2 image regions to pool over.
        - image is a 3d numpy array with dimensions (h, w, num_filters)
        Yields tuples (im_region, i, j) where (i, j) is the position of the
        region in the pooled output. When h or w is odd, the trailing
        row/column that cannot fill a complete 2x2 window is skipped.
        '''
        h, w, _ = image.shape
        new_h = h // 2
        new_w = w // 2

        # i walks the rows (height) and j the columns (width) of the pooled
        # output, so both always stay inside the (new_h, new_w) output grid.
        for i in range(new_h):
            for j in range(new_w):
                im_region = image[(i * 2):(i * 2 + 2), (j * 2):(j * 2 + 2)]
                yield im_region, i, j

    def forward(self, input):
        '''
        Performs a forward pass of the maxpool layer using the given input.
        Returns a 3d numpy array with dimensions (h // 2, w // 2, num_filters).
        - input is a 3d numpy array with dimensions (h, w, num_filters)
        '''
        h, w, num_filters = input.shape
        output = np.zeros((h // 2, w // 2, num_filters))

        for im_region, i, j in self.iterate_regions(input):
            # Maximum over the two spatial axes, kept separately per filter.
            output[i, j] = np.amax(im_region, axis=(0, 1))

        return output

In [95]:
# Smoke-test the pooling layer on the conv output computed in an earlier cell.
pool = MaxPool2()
# NOTE: `output` is rebound to the pooled result, so re-running this cell on
# its own pools the already-pooled array again.
output = pool.forward(output)
print(output.shape)

(13, 13, 8)


## cross-entropy loss and softmax

What softmax really does is help us quantify how sure we are of our prediction, which is useful when training and evaluating our CNN. More specifically, using softmax lets us use cross-entropy loss, which takes into account how sure we are of each prediction. Here’s how we calculate cross-entropy loss:

$$L = -\ln(p_c)$$

where c is the correct class (in our case, the correct digit), p_c is the predicted probability for class c, and ln is the natural log. As always, a lower loss is better. For example, in the best case, we’d have

$$p_c = 1, \quad L = -\ln(1) = 0$$

In a more realistic case, we might have

$$p_c = 0.8, \quad L = -\ln(0.8) \approx 0.223$$

In [96]:
class Softmax:
  # A standard fully-connected layer with softmax activation.

  def __init__(self, input_len, nodes):
    '''
    - input_len is the number of elements in the (flattened) input,
      e.g. the size of the pooling layer's output
    - nodes is the number of output nodes, i.e. the number of labels/classes
    '''
    # We divide by input_len to reduce the variance of our initial values
    self.weights = np.random.randn(input_len, nodes) / input_len
    self.biases = np.zeros(nodes)

  def forward(self, input):
    '''
    Performs a forward pass of the softmax layer using the given input.
    Returns a 1d numpy array containing the respective probability values.
    - input can be any array with any dimensions, as long as it holds
      input_len elements in total.
    '''
    input = input.flatten()

    totals = np.dot(input, self.weights) + self.biases

    # Softmax is unchanged by adding a constant to every logit, so shift by
    # the largest one before exponentiating. This keeps np.exp from
    # overflowing to inf (which would turn the result into NaN).
    exp = np.exp(totals - np.max(totals))
    return exp / np.sum(exp)

In [97]:
# Smoke-test the softmax layer: 13*13*8 inputs (the size of the pooled output
# printed above) mapped onto 10 output nodes.
softmax = Softmax(13*13*8,10)
# `out` holds the 10 probabilities for the sample pushed through conv and pool
# in the earlier cells; it stays bound at module level afterwards.
out = softmax.forward(output)
print(out)

[0.02143594 0.00228761 0.25976688 0.26191097 0.18616113 0.04974702
 0.01968595 0.02979915 0.0050981  0.16410727]


## Model Initialization

In [98]:
import mnist
# We only use the first 1k testing examples (out of 10k total)
# in the interest of time. Feel free to change this if you want.
# NOTE(review): `test_labels` was already bound by the earlier
# tf.keras.datasets.fashion_mnist.load_data() cell; this line rebinds it to
# labels coming from the `mnist` package instead.
test_images = mnist.test_images()[:1000]
test_labels = mnist.test_labels()[:1000]

# Fresh, randomly initialised layers. These rebind the `conv`, `pool` and
# `softmax` names created by the earlier demo cells.
conv = Conv3x3(8)                  # 28x28x1 -> 26x26x8
pool = MaxPool2()                  # 26x26x8 -> 13x13x8
softmax = Softmax(13 * 13 * 8, 10) # 13x13x8 -> 10

def forward(image, label):
  '''
  Runs one image through conv -> pool -> softmax and scores the prediction.
  - image is a 2d numpy array
  - label is a digit
  Returns a tuple (out, loss, acc): the softmax probabilities, the
  cross-entropy loss for `label`, and 1 or 0 depending on whether the most
  probable class equals `label`.
  '''
  # Rescale pixel values from [0, 255] to [-0.5, 0.5] before the conv layer.
  scaled = (image / 255) - 0.5
  out = softmax.forward(pool.forward(conv.forward(scaled)))

  # Cross-entropy loss is the negative natural log of the probability
  # assigned to the correct class.
  loss = -np.log(out[label])
  acc = int(np.argmax(out) == label)

  return out, loss, acc

print('MNIST CNN initialized!')

# Running totals for the current window of 100 samples.
loss, num_correct = 0, 0
for i, (im, label) in enumerate(zip(test_images, test_labels)):
  # Forward pass only; the weights are never updated in this loop.
  _, l, acc = forward(im, label)
  loss, num_correct = loss + l, num_correct + acc

  # Keep accumulating until a full window of 100 samples has been seen.
  if (i + 1) % 100:
    continue

  # With exactly 100 samples per window, the count of correct predictions
  # is already a percentage.
  print('[Step %d] Past 100 steps: Average Loss %.3f | Accuracy: %d%%' % (i + 1, loss / 100, num_correct))
  loss, num_correct = 0, 0

MNIST CNN initialized!
[Step 100] Past 100 steps: Average Loss 2.303 | Accuracy: 14%
[Step 200] Past 100 steps: Average Loss 2.303 | Accuracy: 15%
[Step 300] Past 100 steps: Average Loss 2.304 | Accuracy: 10%
[Step 400] Past 100 steps: Average Loss 2.303 | Accuracy: 10%
[Step 500] Past 100 steps: Average Loss 2.303 | Accuracy: 8%
[Step 600] Past 100 steps: Average Loss 2.302 | Accuracy: 11%
[Step 700] Past 100 steps: Average Loss 2.303 | Accuracy: 6%
[Step 800] Past 100 steps: Average Loss 2.302 | Accuracy: 16%
[Step 900] Past 100 steps: Average Loss 2.303 | Accuracy: 10%
[Step 1000] Past 100 steps: Average Loss 2.303 | Accuracy: 12%


## backprop

### input to the softmax layer's backward phase

In [100]:
#calculate initial gradient
# Gradient of the cross-entropy loss L = -ln(out[label]) with respect to the
# softmax outputs: zero everywhere except at the correct class, where it is
# -1 / out[label].
# NOTE(review): neither `label` nor `out` is defined in this cell. `label` is
# the loop variable left over from the evaluation loop and `out` is whatever an
# earlier top-level cell assigned -- confirm that both refer to the same
# sample, and that this cell still works after Restart & Run All.
nodes = 10
gradient = np.zeros(nodes)
gradient[label] = -1 / out[label]
print(label)

9


### prepare caching during the forward phase

In [103]:
class Softmax:
  # A standard fully-connected layer with softmax activation that caches what
  # its backward pass needs during the forward pass.

  def __init__(self, input_len, nodes):
    '''
    - input_len is the number of elements in the (flattened) input
    - nodes is the number of output nodes, i.e. the number of classes
    '''
    # We divide by input_len to reduce the variance of our initial values
    self.weights = np.random.randn(input_len, nodes) / input_len
    self.biases = np.zeros(nodes)

  def forward(self, input):
    '''
    Performs a forward pass of the softmax layer using the given input.
    Returns a 1d numpy array containing the respective probability values.
    - input can be any array with any dimensions, as long as it holds
      input_len elements in total.
    Caches the input shape, the flattened input and the pre-activation
    totals for use by backprop().
    '''
    self.last_input_shape = input.shape
    input = input.flatten()
    self.last_input = input

    total = np.dot(input, self.weights) + self.biases
    self.last_total = total

    # Softmax is invariant to a constant shift of the logits; subtracting the
    # maximum keeps np.exp from overflowing to inf.
    exp = np.exp(total - np.max(total))
    return exp / np.sum(exp)

  def backprop(self, d_L_d_out, learn_rate):
    '''
    Performs a backward pass of the softmax layer and updates its weights
    and biases in place.
    Returns the loss gradient for this layer's inputs, shaped like the input
    of the last forward() call. Returns None if every entry of d_L_d_out is
    zero.
    - d_L_d_out is the loss gradient for this layer's outputs.
    - learn_rate is a float
    Must be called after forward(), which fills the cached values.
    '''
    # We know only 1 element of d_L_d_out will be nonzero
    for i, gradient in enumerate(d_L_d_out):
      if gradient == 0:
        continue

      # e^totals, read from the cache written by forward() (`last_total`).
      # The shift by the maximum rescales t_exp and S by the same factor, so
      # every ratio below is unchanged while overflow is avoided.
      t_exp = np.exp(self.last_total - np.max(self.last_total))

      # Sum of all e^totals
      S = np.sum(t_exp)

      # Gradients of out[i] against totals
      d_out_d_t = -t_exp[i] * t_exp / (S ** 2)
      d_out_d_t[i] = t_exp[i] * (S - t_exp[i]) / (S ** 2)

      # Gradients of totals against weights/biases/input
      d_t_d_w = self.last_input
      d_t_d_b = 1
      d_t_d_inputs = self.weights

      # Gradients of loss against totals
      d_L_d_t = gradient * d_out_d_t

      # Gradients of loss against weights/biases/input. The input gradient is
      # computed before the update so it uses the pre-update weights.
      d_L_d_w = d_t_d_w[np.newaxis].T @ d_L_d_t[np.newaxis]
      d_L_d_b = d_L_d_t * d_t_d_b
      d_L_d_inputs = d_t_d_inputs @ d_L_d_t

      # Update weights / biases
      self.weights -= learn_rate * d_L_d_w
      self.biases -= learn_rate * d_L_d_b
      return d_L_d_inputs.reshape(self.last_input_shape)