# Deep Learning

In this notebook, we move from a single layer to networks with at least 2 layers. With this, the number of neurons increases dramatically. Building on the previous tutorial, we will program our own class to handle any number of layers. We will first train a very simple 2 layer network to solve the XOR problem and verify that we can handle non-linearly separable data sets. We will then train a larger network with 2 hidden layers to solve the MNIST data set with an accuracy of more than 90%.

Run the notebook in Google colab:
https://colab.research.google.com/github/heprom/cvml/blob/main/tutorials/deep_learning.ipynb

## Quick Implementation of a two layer NN

Two layers with two inputs: $w^{(1)}$ has size (3, 3) including the bias and $w^{(2)}$ has size (3, 1) to give one output.

In [None]:
import numpy as np
from matplotlib import pyplot as plt, cm

# Hand-written two layer network trained with plain gradient descent.
# Every `...` below is an exercise placeholder to be completed by the reader.
X = np.array(...)  # include term for the bias trick
y = np.array(...)

np.random.seed(2)  # fixed seed so the random initial weights are reproducible
w1 = np.random.randn(...)  # first layer
w2 = np.random.randn(...)  # second layer
print('* model params: {}, {}'.format(w1.tolist(), w2.tolist()))
eta = 1e-2  # learning rate
n_epochs = 10000

for t in range(n_epochs):
    # forward pass
    h = ...  # activation of the first layer
    y_pred = ...  # activation of the second layer
    loss = np.square(y_pred - y).sum()  # summed squared error over all samples
    print(t, loss)

    # backprop
    grad_y_pred = ...
    grad_w2 = ...
    grad_h = ...
    grad_w1 = ...

    # update rule
    w1 -= ...
    w2 -= ...
# two placeholders are needed here: with a single `{}`, str.format silently
# ignores the extra argument and the second layer weights are never printed
print('* new model params: {}, {}'.format(w1.tolist(), w2.tolist()))


print the results of the predictions with our trained model

In [None]:
# Run each sample through the trained two layer network and threshold the output.
for (xi, yi) in zip(X, y):
    h = 1 / (1 + np.exp(-xi.dot(w1)))  # sigmoid activation of the first layer
    out = ...  # exercise placeholder: activation of the second layer
    y_pred = 1 if out > 0.5 else 0  # an output above 0.5 is reported as class 1
    print('data={}, ground-truth={}, out={:.3f}, y={}'.format(xi, yi, out[0], y_pred))


## General Implementation

We modify our previous implementation of a Neural Network to include any number of layers. Additionally, each layer can have any number of neurons.

In [None]:
class NeuralNetwork:
    """Artificial Neural Network class.

    A general Feed-Forward Neural Network. Here, the activation function is a
    sigmoid, the loss is computed using the squared error between the target
    and the prediction. Learning the parameters is achieved using
    back-propagation and gradient descent.

    Note: every ``...`` in this class is an exercise placeholder. The
    constructor and ``back_propagation`` cannot run until they are filled in.

    Attributes:
        W: list of weight arrays; ``predict`` uses ``W[0]`` up to
            ``W[len(layers) - 2]``.
        layers: the layer sizes given at construction.
        eta: learning rate.
        loss_history: list to which ``fit`` appends the loss after each epoch.
    """

    def __init__(self, layers, eta=0.1, rand_seed=42):
        """Initialisation routine.

        Args:
            layers: list of layer sizes (e.g. a 2-2-1 network).
            eta: learning rate, stored as ``self.eta``.
            rand_seed: value passed to ``np.random.seed`` before the weights
                are drawn. Note that this reseeds NumPy's global generator.
        """
        np.random.seed(rand_seed)
        self.W = []
        self.layers = layers  # keep a record of this
        # loop on the layers except for the last one
        for i in np.arange(...):
            w = np.random.randn(...)
            self.W.append(w)
        # the last layer does not need a bias
        w = np.random.randn(...)
        self.W.append(w)
        self.eta = eta  # learning rate
        self.loss_history = []

    def __repr__(self):
        """Simple string representation of the network."""
        return "NeuralNetwork: {}".format('-'.join(str(l) for l in self.layers))

    def sigmoid(self, x):
        """Our activation function."""
        return 1 / (1 + np.exp(-x))

    def sigmoid_grad(self, x):
        """Gradient of the sigmoid function evaluated at x."""
        # evaluate the sigmoid once and reuse it: s'(x) = s(x) * (1 - s(x))
        s = self.sigmoid(x)
        return s * (1 - s)

    def predict(self, X, bias_trick=True):
        """Compute the output of the network for the input X.

        Notice that the output value is bounded between 0 and 1 since the
        last operation applied is a sigmoid.

        Args:
            X: a single sample or an array of samples (promoted to 2D).
            bias_trick: when True, a column of ones is prepended to X; pass
                False if X already contains that column.

        Returns:
            The activation of the last layer, one row per sample.
        """
        p = np.atleast_2d(X)
        if bias_trick:
            # bias trick
            p = np.c_[np.ones((p.shape[0])), p]
        for layer in range(len(self.layers) - 1):
            p = self.sigmoid(np.dot(p, self.W[layer]))
        return p

    def loss(self, X, y, bias_trick=False):
        """Compute the squared error loss for a given set of inputs.

        ``bias_trick`` defaults to False here (unlike ``predict``) because
        ``fit`` calls this method with an X that already has the bias column.

        Returns:
            The sum over all samples and outputs of (prediction - target)**2.
        """
        y = np.atleast_2d(y)
        y_pred = self.predict(X, bias_trick=bias_trick)
        loss = np.sum((y_pred - y) ** 2)
        return loss

    def back_propagation(self, X, y):
        """Conduct backpropagation to update the weights.

        ``fit`` calls this with an X to which the bias column has already
        been added, either the whole training set or a single sample.
        """
        # we need to keep a list of the activation of each layer
        A = [np.atleast_2d(X)]

        # forward pass
        for layer in np.arange(...):
            a = ...
            A.append(...)

        # backprop phase
        # output layer error: (prediction - target) times a * (1 - a), which is
        # the sigmoid derivative assuming A[-1] holds the sigmoid output
        D = [(A[-1] - y) * A[-1] * (1 - A[-1])]
        for layer in np.arange(...):
            delta = ...
            delta = ...
            D.append(delta)
        D = D[::-1]  # reverse the order

        # update weights
        for layer in np.arange(0, len(self.layers) - 1):
            grad_W = ...
            self.W[layer] -= ...

    def fit(self, X, y, n_epochs=10, method='batch', display_update=100):
        """Perform gradient descent on a given number of epochs to update the weights.

        Args:
            X: 2D array of training samples, without the bias column.
            y: training targets, iterated in lockstep with X in per-sample mode.
            n_epochs: number of passes over the training set.
            method: 'batch' runs one update per epoch on the whole set; any
                other value (e.g. 'SGD') runs one update per data point.
            display_update: the loss is printed at the first epoch and then
                every ``display_update`` epochs.
        """
        # bias trick: add a column of 1 to X
        X = np.c_[np.ones((X.shape[0])), X]
        for i_epoch in range(n_epochs):
            if method == 'batch':
                # perform backprop on the whole training set (batch)
                self.back_propagation(X, y)
            else:
                # here we update the weight for every data point (SGD)
                for (xi, yi) in zip(X, y):
                    self.back_propagation(xi, yi)
            # an epoch has passed, compute the loss
            loss = self.loss(X, y)
            self.loss_history.append(loss)
            if i_epoch == 0 or (1 + i_epoch) % display_update == 0:
                print("epoch={}, loss={:.3f}".format(1 + i_epoch, loss))


## XOR problem

In [None]:
# define our 2-2-1 neural network (it is trained in a later cell)
# NOTE: NeuralNetwork.__init__ seeds NumPy itself (rand_seed=42 by default),
# so the explicit seed below only affects code that runs before the constructor.
np.random.seed(42)
nn = NeuralNetwork(layers=[...], eta=0.5)  # `...` is an exercise placeholder for the layer sizes

In [None]:
# test our __repr__ method: the class defines no __str__, so print() uses __repr__
print(nn)

In [None]:
# XOR truth table: the four binary input pairs and their expected outputs
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])
print('W shape: {}'.format([Wi.shape for Wi in nn.W]))
print('initial weights:', nn.W)
# `...` are exercise placeholders for the training inputs and targets;
# in fit, any method other than 'batch' updates the weights sample by sample
nn.fit(..., ..., n_epochs=1000, method='SGD')

In [None]:
# Compare the network output with the ground truth for every sample.
for (xi, yi) in zip(X, y):
    out = ...  # exercise placeholder: network output for xi
    y_pred = 1 if out > 0.5 else 0  # threshold the output at 0.5
    # out is indexed as out[0][0] below, so it is expected to be 2D
    print('data={}, ground-truth={}, out={:.3f}, y={}'.format(xi, yi, out[0][0], y_pred))


In [None]:
# Plot the loss against the epoch; `...` is an exercise placeholder for the data to plot.
plt.figure(figsize=(4.5, 4))
plt.plot(..., linewidth=3)
plt.xlabel('Epoch', fontsize=14)
plt.ylabel('Loss', fontsize=14)
plt.subplots_adjust(left=0.15, top=0.95)
plt.savefig('XOR_deep_learning_loss.pdf')  # relative path: saved in the current working directory
plt.show()

## Solving the MNIST data set

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

# load the digits data set bundled with scikit-learn
digits = datasets.load_digits()

In [None]:
# list the fields available in the loaded data set
digits.keys()

In [None]:
# print the description text shipped with the data set
print(digits.DESCR)

In [None]:
# Inspect both views of the data set: 'images' and 'data'.
images, data = digits['images'], digits['data']
print(images.shape, images.dtype, images.min(), images.max(), sep='\n')
print(data.shape, data.dtype, sep='\n')

index = 34
print(data[index, :].reshape((8, 8)))
# number of pixels for which the 'data' row matches the flattened 'images' entry
(data[index, :] == images[index].ravel()).sum()

plot the image with index `index`

In [None]:
# `...` is an exercise placeholder for the image to display (the one at `index`)
plt.imshow(..., cmap=cm.gray_r)
plt.title('ground-truth: %d' % digits.target[index])  # label stored for that image
plt.show()

have a look at the first few images

In [None]:
# Show the first n_images digits side by side, each titled with its label.
n_images = 20
fig, axes = plt.subplots(1, n_images)
# zip stops after the last axis, i.e. after n_images images
for ax, image, label in zip(axes, digits['images'], digits['target']):
    ax.set_axis_off()
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    ax.set_title('%i' % label)

In [None]:
# split the data into a training and a testing set
# (pixel values are divided by 16.0 before the split)
scaled_pixels = digits['data'] / 16.0
X_train, X_test, y_train, y_test = train_test_split(
    scaled_pixels, digits['target'], test_size=0.25, random_state=13)
print(X_train.shape)
print(f'first 10 train labels: {y_train[:10]}')
print(f'first 10 test labels: {y_test[:10]}')

In [None]:
# verify training images: each flat row is reshaped to 8x8 and titled with its label
fig, axes = plt.subplots(1, n_images)
# zip stops after the last axis, i.e. after n_images samples
for ax, pixels, label in zip(axes, X_train, y_train):
    ax.set_axis_off()
    ax.imshow(pixels.reshape((8, 8)), cmap=plt.cm.gray_r, interpolation='nearest')
    ax.set_title('%i' % label)

In [None]:
# plot training images: 180 consecutive samples on a 9 x 20 grid
start_index = 10
plt.figure(figsize=(15, 9))
for cell, index in enumerate(range(start_index, start_index + 180), start=1):
    ax = plt.subplot(9, 20, cell)
    ax.imshow(X_train[index, :].reshape((8, 8)), cmap=cm.gray_r)
    ax.axis('off')
plt.axis('tight')
plt.show()


With our network implementation, the output of the network is bounded between 0 and 1 (output of the sigmoid function). We could have implemented a multinomial SVM classifier, but instead we will use one-hot encoded labels to keep it very general.

Each digit is transformed into a 10 component vector of zeros except for a 1 at the position of the digit. For example, 6 becomes [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]. This allows us to create a 10 output network and to compare each output with the transformed version of each label.

We use `scikit-learn` utility `LabelBinarizer` for this.

In [None]:
from sklearn.preprocessing import LabelBinarizer

# convert the labels from integers to vectors
# Fit the binarizer on the training labels only and reuse it for the test
# labels: both sets then share the same class-to-column mapping, even if a
# class happens to be absent from the test split.
label_binarizer = LabelBinarizer()
y_train = label_binarizer.fit_transform(y_train)
y_test = label_binarizer.transform(y_test)
print(y_train.shape)
# show the first 10 encoded labels next to the digit they encode
for i in range(10):
    print('{}: {}'.format(y_train[i].argmax(), y_train[i]))

create our neural network with 4 layers of 64, 32, 16 and finally 10 neurons.

In [None]:
# NOTE: NeuralNetwork.__init__ also seeds NumPy (rand_seed=42 by default)
np.random.seed(42)
nn = NeuralNetwork(...)  # exercise placeholder: constructor arguments (layer sizes, ...)
print(nn)
print("learning rate {}".format(nn.eta))

# exercise placeholders: training inputs and targets;
# the loss is printed at the first epoch and then every 10 epochs
nn.fit(..., ..., n_epochs=100, method='SGD', display_update=10)

Plot the loss saved during training

In [None]:
# Plot the loss against the epoch; `...` is an exercise placeholder for the data to plot.
plt.figure(figsize=(4.5, 4))
plt.plot(..., linewidth=3)
plt.xlabel('Epoch', fontsize=14)
plt.ylabel('Loss', fontsize=14)
plt.subplots_adjust(left=0.15, top=0.95)
plt.show()

In [None]:
# have a look at a particular data point from the test set
index = 11
y_pred = nn.predict(X_test[index, :])  # predict promotes the single sample to a 2D array
y_class = ...  # exercise placeholder: predicted digit
y = ...  # exercise placeholder: ground-truth digit
print('test image {}: predict a {} - ground truth is a {}'.format(index, y_class, y))

In [None]:
import matplotlib.patches as patches

# Show 60 test images on a 6 x 10 grid, framed in green when the network
# predicts the right digit and in red otherwise.
start_index = 10
plt.figure(figsize=(12, 9))
for cell, index in enumerate(range(start_index, start_index + 60), start=1):
    y_pred = nn.predict(X_test[index, :])
    y_class = np.argmax(y_pred)
    y = np.argmax(y_test[index])
    color = 'green' if y_class == y else 'red'
    ax = plt.subplot(6, 10, cell)
    ax.imshow(X_test[index, :].reshape((8, 8)), cmap=cm.gray_r)
    # frame drawn around the whole 8 x 8 image
    frame = patches.Rectangle(
        (-0.5, -0.5), 8.0, 8.0,
        linewidth=8, edgecolor=color, facecolor='none')
    ax.add_patch(frame)
    ax.axis('off')
plt.subplots_adjust(wspace=0.1)
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Per-class report on the test set: the class is the index of the largest entry
# in each row, for the network outputs and for the encoded labels alike.
predictions = nn.predict(X_test)
y_test_pred = predictions.argmax(axis=1)
y_test_true = y_test.argmax(axis=1)
print(classification_report(y_test_true, y_test_pred))


In [None]:
from sklearn.metrics import confusion_matrix
predictions = ...  # exercise placeholder: network outputs to evaluate
y_train_pred = predictions.argmax(axis=1)  # predicted class = index of the largest output
# true classes (decoded from y_train) are passed first, predicted classes second
conf = confusion_matrix(y_train.argmax(axis=1), y_train_pred)

In [None]:
# Display the confusion matrix as an image; `...` is an exercise placeholder.
plt.figure(figsize=(5, 5))
plt.imshow(...)
plt.xlabel('predicted class')
plt.ylabel('actual class')
plt.title('confusion matrix')
plt.savefig('mnist_deep_learning_confusion_matrix.pdf')  # relative path: saved in the current working directory
plt.show()

## Using TensorFlow+Keras

Keras is a high level programming interface to build deep Neural Networks. It needs a backend to run (to actually build the computational graph, compute the gradients and so on).

Note that starting from version 2.3.0, Keras is now part of TensorFlow (which is now the default and only backend).

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Activation
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import backend as K

### Using Sigmoid layers

In [None]:
# define the 64-32-16-10 architecture using Keras
# each `...` is an exercise placeholder for the number of units of that layer
model = Sequential()
model.add(Dense(..., input_shape=(64,), activation='sigmoid'))  # first hidden layer, fed with 64 inputs
model.add(Dense(..., activation='sigmoid'))  # second hidden layer
model.add(Dense(..., activation='softmax'))  # output layer

In [None]:
# Stochastic gradient descent with a 0.01 learning rate, cross-entropy loss,
# and accuracy tracked during training.
sgd = SGD(learning_rate=0.01)
model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])

Train the model on 300 epochs

In [None]:
print('training network...')
# H keeps the value returned by fit; its `history` attribute is plotted in the next cell
H = model.fit(
    ...,  # exercise placeholder: training inputs
    ...,  # exercise placeholder: training targets
    validation_data=(X_test, y_test),  # the test split is used as validation data
    epochs=...,  # exercise placeholder: number of epochs
    verbose=1,
    shuffle=True,
    batch_size=10)

Make a plot of the training/validation loss and of the training/validation accuracy.

In [None]:
# Plot the four recorded training curves (first 300 epochs) on a single figure.
curves = (
    ("loss", "training loss"),
    ("val_loss", "validation loss"),
    ("accuracy", "training accuracy"),
    ("val_accuracy", "validation accuracy"),
)
plt.figure()
# dashed reference line at y = 1
plt.axhline(y=1., color='gray', linestyle='dashed')
for key, label in curves:
    plt.plot(H.history[key][:300], label=label)
plt.title("Training Loss and Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Loss/Accuracy")
plt.xlim(0, 300)
plt.legend()
plt.savefig('Keras_mnist.pdf')
plt.show()

### Using ReLU layers

In practice, you should always try ReLU over Sigmoid as it has proven to provide a better learning ability to the network (it helps with vanishing gradients).

In [None]:
# Same 64-32-16-10 layout as before, with ReLU activations in the hidden layers.
model_relu = Sequential([
    Dense(32, input_shape=(64, ), activation='relu'),
    Dense(16, activation='relu'),
    Dense(10, activation='softmax'),
])

In [None]:
# print the summary of the ReLU model
model_relu.summary()

In [None]:
# Stochastic gradient descent with a 0.01 learning rate, cross-entropy loss,
# and accuracy tracked during training.
sgd = SGD(learning_rate=0.01)
model_relu.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# display the shape of the training inputs
X_train.shape

In [None]:
# Train the ReLU model; 10% of the training data is held out for validation.
fit_options = dict(
    epochs=300,
    batch_size=16,
    shuffle=True,
    validation_split=0.1,
    verbose=1,
)
H = model_relu.fit(X_train, y_train, **fit_options)

In [None]:
# Two side-by-side panels: loss on the left, accuracy on the right, each with
# the training and the validation curve.
history = H.history
loss, val_loss = history['loss'], history['val_loss']
acc, val_acc = history['accuracy'], history['val_accuracy']
epochs = range(1, len(loss) + 1)  # epochs are numbered from 1

fig = plt.figure(figsize=(12, 5))

ax1 = plt.subplot(1, 2, 1)
ax1.plot(epochs, loss, label='Training loss')
ax1.plot(epochs, val_loss, label='Validation loss')
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Loss')
ax1.legend()

ax2 = plt.subplot(1, 2, 2)
ax2.plot(epochs, acc, label='Training acc')
ax2.plot(epochs, val_acc, label='Validation acc')
ax2.set_xlabel('Epochs')
ax2.set_ylabel('Accuracy')
ax2.legend()

plt.subplots_adjust(wspace=0.3)
plt.show()