# MNIST

This notebook explores transfer learning techniques on MNIST.

## Change working directory to project root

In [None]:
import os
# Directories that are expected to sit directly under the project root.
ROOT_DIRECTORIES = {'dogwood', 'tests'}
# If the current directory is missing any of them, move up two levels.
if not ROOT_DIRECTORIES.issubset(os.listdir('.')):
    os.chdir('../..')

## Exploration

In [None]:
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import h5py
from sklearn.linear_model import LogisticRegression
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.initializers import Constant, GlorotUniform

In [None]:
MNIST_IMAGE_SHAPE = (28, 28)
MAX_PIXEL_VALUE = 255
MODEL_SAVE_DIR = '/tmp/dogwood/mnist'
# Seed the random number generators so that weight initialization and data
# shuffling are repeatable under Restart Kernel -> Run All.
# NOTE(review): some hardware kernels may still be nondeterministic - confirm
# if bit-exact reproducibility is required.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

In [None]:
# Load the MNIST splits and rescale the pixel values to floats in [0, 1].
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = tf.cast(X_train, dtype=tf.float32) / MAX_PIXEL_VALUE
X_test = tf.cast(X_test, dtype=tf.float32) / MAX_PIXEL_VALUE
# Sanity-check the split sizes, one shape per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

In [None]:
# Baseline classifier: flatten -> 128-unit ReLU hidden layer -> 10-way softmax.
model = Sequential(
    layers=[
        Flatten(name='flatten', input_shape=MNIST_IMAGE_SHAPE),
        Dense(units=128, activation='relu', name='dense_1'),
        Dense(units=10, activation='softmax', name='dense_2'),
    ]
)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['sparse_categorical_accuracy'],
)

In [None]:
# Train the baseline for 10 epochs with mini-batches of 32 images.
model.fit(x=X_train, y=y_train, batch_size=32, epochs=10)

In [None]:
# Loss and metric values of the baseline on the test split.
model.evaluate(x=X_test, y=y_test)

## Small model

We will train a smaller model so that it is easier to work with the weight matrices.

In [None]:
# Tiny classifier with a single hidden unit, so the weight matrices are easy to
# inspect by hand.
model = Sequential(
    layers=[
        Flatten(name='flatten', input_shape=MNIST_IMAGE_SHAPE),
        Dense(units=1, activation='relu', name='dense_1'),
        Dense(units=10, activation='softmax', name='dense_2'),
    ]
)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['sparse_categorical_accuracy'],
)

In [None]:
# Train the single-hidden-unit model for 10 epochs with mini-batches of 32.
model.fit(x=X_train, y=y_train, batch_size=32, epochs=10)

In [None]:
# Loss and metric values of the small model on the test split.
model.evaluate(x=X_test, y=y_test)

## New models using (sometimes partially) pretrained weights

In [None]:
# Make sure the output directory exists, then write the small model's weights
# to an HDF5 file that the cells below inspect and edit.
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
model_path = os.path.join(MODEL_SAVE_DIR, 'model.h5')
model.save_weights(filepath=model_path)

In [None]:
# Top-level groups stored in the weights file.
with h5py.File(model_path, mode='r') as weights_file:
    print(weights_file.keys())

In [None]:
# Group stored for the flatten layer.
with h5py.File(model_path, mode='r') as weights_file:
    print(weights_file['flatten'])

In [None]:
# Outer group stored for the dense_1 layer.
with h5py.File(model_path, mode='r') as weights_file:
    print(weights_file['dense_1'])

In [None]:
# Members of the outer dense_1 group.
with h5py.File(model_path, mode='r') as weights_file:
    print(weights_file['dense_1'].keys())

In [None]:
# Nested dense_1 group, addressed with an HDF5 path.
with h5py.File(model_path, mode='r') as weights_file:
    print(weights_file['dense_1/dense_1'])

In [None]:
# Members of the nested dense_1 group.
with h5py.File(model_path, mode='r') as weights_file:
    print(weights_file['dense_1/dense_1'].keys())

In [None]:
# Bias dataset of dense_1 (prints its shape and dtype summary).
with h5py.File(model_path, mode='r') as weights_file:
    print(weights_file['dense_1/dense_1/bias:0'])

In [None]:
# Kernel dataset of dense_1 (prints its shape and dtype summary).
with h5py.File(model_path, mode='r') as weights_file:
    print(weights_file['dense_1/dense_1/kernel:0'])

In [None]:
# Kernel dataset of dense_2 (prints its shape and dtype summary).
with h5py.File(model_path, mode='r') as weights_file:
    print(weights_file['dense_2/dense_2/kernel:0'])

In [None]:
# Copy the dense_1 bias and kernel into memory; the `[:]` read happens while
# the file is still open.
with h5py.File(model_path, mode='r') as weights_file:
    biases_1 = weights_file['dense_1/dense_1/bias:0'][:]
    weights_1 = weights_file['dense_1/dense_1/kernel:0'][:]

We can visualize these weights in a 2D plot, although they make more sense in 28 x 28.

In [None]:
%%script false --no-raise-error
# Disabled for now (not very interesting): heat map of the transposed dense_1
# kernel with a colour bar.
fig, ax = plt.subplots(figsize=(8, 8))
heatmap = ax.pcolor(np.transpose(weights_1), cmap='Blues')
cbar = fig.colorbar(heatmap, ax=ax)
plt.show()

In [None]:
%%script false --no-raise-error
# Disabled for now (not very interesting): heat map of the dense_1 biases,
# shown as a single column, with a colour bar.
fig, ax = plt.subplots(figsize=(8, 8))
heatmap = ax.pcolor(biases_1[:, np.newaxis], cmap='Blues')
cbar = fig.colorbar(heatmap, ax=ax)
plt.show()

In [None]:
%%script false --no-raise-error
# Disabled for now (not very interesting): dense_1 kernel (left) and biases
# (right) as side-by-side heat maps.
figs, axes = plt.subplots(ncols=2, figsize=(12, 8))
heatmap_weights = axes[0].pcolor(np.transpose(weights_1), cmap='Blues')
heatmap_biases = axes[1].pcolor(biases_1[:, np.newaxis], cmap='Blues')
plt.show()

In [None]:
# Same architecture and layer names as the small model, so the saved weights
# can be loaded into it by name.
model_2 = Sequential(
    layers=[
        Flatten(name='flatten', input_shape=MNIST_IMAGE_SHAPE),
        Dense(units=1, activation='relu', name='dense_1'),
        Dense(units=10, activation='softmax', name='dense_2'),
    ]
)
model_2.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['sparse_categorical_accuracy'],
)

In [None]:
# Restore the saved weights, matching layers by their names.
model_2.load_weights(filepath=model_path, by_name=True)

In [None]:
# Loss and metric values of model_2 on the test split.
model_2.evaluate(x=X_test, y=y_test)

In [None]:
# Softmax output of model_2 for the first test image.
model_2.predict(x=X_test)[0]

The loaded weights produce identical loss and accuracy when the architecture is identical, as expected. Now we will add a neuron.

In [None]:
# Same layer names as model_2, but dense_1 now has two hidden units.
model_3 = Sequential(
    layers=[
        Flatten(name='flatten', input_shape=MNIST_IMAGE_SHAPE),
        Dense(units=2, activation='relu', name='dense_1'),
        Dense(units=10, activation='softmax', name='dense_2'),
    ]
)
model_3.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['sparse_categorical_accuracy'],
)

We will now artificially construct a weights file such that there is no performance decrease with the new architecture. See notes for how this is done. In brief, the weights and bias leading into the new neuron can be arbitrary, but the weights leading out must be 0. The bias unit in the new neuron's layer is unaffected.

In [None]:
model_3_path = os.path.join(MODEL_SAVE_DIR, 'model_3.h5')
model_2.save_weights(model_3_path)
# Widen dense_1 by one neuron directly in the weights file, padding every new
# weight with zeros. The padding shapes and dtypes are derived from the stored
# arrays instead of being hard-coded, so this cell keeps working if the layer
# sizes defined above change.
with h5py.File(model_3_path, 'r+') as outfile:
    # Dense 1: one extra bias entry and one extra kernel column (the weights
    # leading into the new neuron).
    group = outfile['dense_1']['dense_1']
    print(group['bias:0'])
    arr = group['bias:0'][:]
    arr = np.concatenate((arr, np.zeros((1,), dtype=arr.dtype)), axis=0)
    print(arr.shape)
    # Replace the dataset: delete the old one and store the widened array
    # under the same name.
    del group['bias:0']
    group.create_dataset('bias:0', data=arr)
    print(group['bias:0'])
    print(group['kernel:0'])
    arr = group['kernel:0'][:]
    arr = np.concatenate(
        (arr, np.zeros((arr.shape[0], 1), dtype=arr.dtype)), axis=1)
    print(arr.shape)
    del group['kernel:0']
    group.create_dataset('kernel:0', data=arr)
    print(group['kernel:0'])
    # Dense 2: one extra kernel row (the weights leading out of the new
    # neuron). These are zero so the new neuron does not change the output.
    group = outfile['dense_2']['dense_2']
    print(group['kernel:0'])
    arr = group['kernel:0'][:]
    arr = np.concatenate(
        (arr, np.zeros((1, arr.shape[1]), dtype=arr.dtype)), axis=0)
    print(arr.shape)
    del group['kernel:0']
    group.create_dataset('kernel:0', data=arr)
    print(group['kernel:0'])

In [None]:
# Load the hand-edited, widened weights into model_3, matching layers by name.
model_3.load_weights(filepath=model_3_path, by_name=True)

In [None]:
# Loss and metric values of the widened model on the test split.
model_3.evaluate(x=X_test, y=y_test)

In [None]:
# Softmax output of the widened model for the first test image.
model_3.predict(x=X_test)[0]

We have successfully added a neuron and preserved loss and accuracy. Now let's train the new model.

In [None]:
# Continue training the widened model for 10 epochs with mini-batches of 32.
model_3.fit(x=X_train, y=y_train, batch_size=32, epochs=10)

The model above uses zeros as the initialization for all of the new weights. However, based on the gradient calculations from our research notes, only the weights from new nodes to old nodes (that is, the weights leading out of the new neuron) should be initialized to zero; all other new weights should be randomly initialized.

In [None]:
# Glorot-uniform initializer used below to draw the new incoming weights.
glorot = GlorotUniform(seed=None)

In [None]:
model_3_path = os.path.join(MODEL_SAVE_DIR, 'model_3.h5')
model_2.save_weights(model_3_path)
# Widen dense_1 by one neuron directly in the weights file. The weights leading
# into the new neuron are drawn from the Glorot initializer; the weights
# leading out of it are zero. Shapes and dtypes are derived from the stored
# arrays instead of being hard-coded, so this cell keeps working if the layer
# sizes defined above change.
with h5py.File(model_3_path, 'r+') as outfile:
    # Dense 1.
    group = outfile['dense_1']['dense_1']
    print(group['bias:0'])
    # By default, dense layers use zeros as their bias initializers.
    arr = group['bias:0'][:]
    arr = np.concatenate((arr, np.zeros((1,), dtype=arr.dtype)), axis=0)
    print(arr.shape)
    # Replace the dataset: delete the old one and store the widened array
    # under the same name.
    del group['bias:0']
    group.create_dataset('bias:0', data=arr)
    print(group['bias:0'])
    print(group['kernel:0'])
    arr = group['kernel:0'][:]
    # Initialize as if we want the full widened kernel; take only one column.
    # Convert explicitly to a NumPy array of the stored dtype before
    # concatenating.
    initialization = np.asarray(
        glorot((arr.shape[0], arr.shape[1] + 1))[:, :1], dtype=arr.dtype)
    arr = np.concatenate((arr, initialization), axis=1)
    print(arr.shape)
    del group['kernel:0']
    group.create_dataset('kernel:0', data=arr)
    print(group['kernel:0'])
    # Dense 2: one extra kernel row (the weights leading out of the new
    # neuron). These are zero so the new neuron does not change the output.
    group = outfile['dense_2']['dense_2']
    print(group['kernel:0'])
    arr = group['kernel:0'][:]
    arr = np.concatenate(
        (arr, np.zeros((1, arr.shape[1]), dtype=arr.dtype)), axis=0)
    print(arr.shape)
    del group['kernel:0']
    group.create_dataset('kernel:0', data=arr)
    print(group['kernel:0'])

In [None]:
# model_3 has already been trained above, so its optimizer still carries state
# from that run. Recompile to start this experiment with a fresh optimizer,
# then load the newly constructed weights by layer name.
model_3.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['sparse_categorical_accuracy'])
model_3.load_weights(model_3_path, by_name=True)

In [None]:
# Loss and metric values on the test split after loading the new weights file.
model_3.evaluate(x=X_test, y=y_test)

In [None]:
# Softmax output for the first test image after loading the new weights file.
model_3.predict(x=X_test)[0]

As expected, the weight initialization scheme for the first dense layer weights did not affect the output because the new weights in the second dense layer are still 0. Now we will train the model.

In [None]:
# Train from the partially pretrained weights for 10 epochs, mini-batches of 32.
model_3.fit(x=X_train, y=y_train, batch_size=32, epochs=10)

What if we trained it from scratch?

In [None]:
# Fresh two-hidden-unit model with default initializers, for the from-scratch
# comparison.
model_3 = Sequential(
    layers=[
        Flatten(name='flatten', input_shape=MNIST_IMAGE_SHAPE),
        Dense(units=2, activation='relu', name='dense_1'),
        Dense(units=10, activation='softmax', name='dense_2'),
    ]
)
model_3.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['sparse_categorical_accuracy'],
)

In [None]:
# Train the from-scratch model for 10 epochs with mini-batches of 32.
model_3.fit(x=X_train, y=y_train, batch_size=32, epochs=10)

In this case, when intermediate weights are properly initialized, training from a partially pretrained neural network and training from scratch result in models of approximately equal performance. There is no clear advantage to either approach.

## Weight symmetry tests

Gradient descent forces us to randomly initialize neural network weights; otherwise, we run into the problem of weight symmetry. Weight symmetry causes all nodes to compute the same function, so the model's predictive power is dramatically decreased. Based on the math in the notes, we can get around this by randomly initializing the weights going into the new neurons while still setting their outgoing weights to zero, which preserves performance.

In [None]:
# Both dense kernels start at a constant zero.
model_4 = Sequential(
    layers=[
        Flatten(name='flatten', input_shape=MNIST_IMAGE_SHAPE),
        Dense(units=128, activation='relu', name='dense_1',
              kernel_initializer=Constant(value=0)),
        Dense(units=10, activation='softmax', name='dense_2',
              kernel_initializer=Constant(value=0)),
    ]
)
model_4.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['sparse_categorical_accuracy'],
)

In [None]:
# Train model_4 for 10 epochs with mini-batches of 32.
model_4.fit(x=X_train, y=y_train, batch_size=32, epochs=10)

In [None]:
# dense_1 keeps its default initializer; only the output kernel starts at zero.
model_5 = Sequential(
    layers=[
        Flatten(name='flatten', input_shape=MNIST_IMAGE_SHAPE),
        Dense(units=128, activation='relu', name='dense_1'),
        Dense(units=10, activation='softmax', name='dense_2',
              kernel_initializer=Constant(value=0)),
    ]
)
model_5.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['sparse_categorical_accuracy'],
)

In [None]:
# Train model_5 for 10 epochs with mini-batches of 32.
model_5.fit(x=X_train, y=y_train, batch_size=32, epochs=10)

In [None]:
# Two hidden layers; all three dense kernels start at a constant zero.
model_6 = Sequential(
    layers=[
        Flatten(name='flatten', input_shape=MNIST_IMAGE_SHAPE),
        Dense(units=128, activation='relu', name='dense_1',
              kernel_initializer=Constant(value=0)),
        Dense(units=128, activation='relu', name='dense_1a',
              kernel_initializer=Constant(value=0)),
        Dense(units=10, activation='softmax', name='dense_2',
              kernel_initializer=Constant(value=0)),
    ]
)
model_6.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['sparse_categorical_accuracy'],
)

In [None]:
# Train model_6 for 10 epochs with mini-batches of 32.
model_6.fit(x=X_train, y=y_train, batch_size=32, epochs=10)

In [None]:
# dense_1 keeps its default initializer; dense_1a and dense_2 kernels start at
# a constant zero.
model_7 = Sequential(
    layers=[
        Flatten(name='flatten', input_shape=MNIST_IMAGE_SHAPE),
        Dense(units=128, activation='relu', name='dense_1'),
        Dense(units=128, activation='relu', name='dense_1a',
              kernel_initializer=Constant(value=0)),
        Dense(units=10, activation='softmax', name='dense_2',
              kernel_initializer=Constant(value=0)),
    ]
)
model_7.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['sparse_categorical_accuracy'],
)

In [None]:
# Train model_7 for 10 epochs with mini-batches of 32.
model_7.fit(x=X_train, y=y_train, batch_size=32, epochs=10)

In [None]:
# dense_1a keeps its default initializer; dense_1 and dense_2 kernels start at
# a constant zero.
model_8 = Sequential(
    layers=[
        Flatten(name='flatten', input_shape=MNIST_IMAGE_SHAPE),
        Dense(units=128, activation='relu', name='dense_1',
              kernel_initializer=Constant(value=0)),
        Dense(units=128, activation='relu', name='dense_1a'),
        Dense(units=10, activation='softmax', name='dense_2',
              kernel_initializer=Constant(value=0)),
    ]
)
model_8.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['sparse_categorical_accuracy'],
)

In [None]:
# Train model_8 for 10 epochs with mini-batches of 32.
model_8.fit(x=X_train, y=y_train, batch_size=32, epochs=10)

In [None]:
# Only the dense_1 kernel starts at a constant zero; dense_1a and dense_2 keep
# their default initializers.
model_9 = Sequential(
    layers=[
        Flatten(name='flatten', input_shape=MNIST_IMAGE_SHAPE),
        Dense(units=128, activation='relu', name='dense_1',
              kernel_initializer=Constant(value=0)),
        Dense(units=128, activation='relu', name='dense_1a'),
        Dense(units=10, activation='softmax', name='dense_2'),
    ]
)
model_9.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['sparse_categorical_accuracy'],
)

In [None]:
# Train model_9 for 10 epochs with mini-batches of 32.
model_9.fit(x=X_train, y=y_train, batch_size=32, epochs=10)