# Løsningsforslag - Prosjekt 2: Dyp Læring

Både organiseringen av koden og omfanget av undersøkelsene av resultatene i denne LFen ligger på et litt høyere nivå enn det som er forventet av besvarelsene.

# Nettverket

Først implementerer vi selve nettverket. Klassen __Model__ inneholder parametrene som skal tilpasses samt metodene som skal til for å bruke nettverket på nye data etter at det er trent opp. Dette er altså alt som må lagres på harddisken for å kunne bruke nettverket senere. __Model__ svarer omtrent til kapittel 2.1 i oppgaveteksten.

In [1]:
import numpy as np


class Model():
    '''Parameters and forward pass of the network.

    The network has K layers of width d. Layer k maps
    Y -> Y + h * σ(W[k] @ Y + b[k]), and the output is 𝜂(Yᵀ w + μ) applied to
    the last layer. The variables W, b, w and μ are optimized by training.

    The class also contains the activation functions and methods for forward
    propagation, i.e. everything needed to use the model after it has been
    trained.
    '''
    def __init__(self, K, d, h):
        '''Set metaparameters and draw random initial parameters.

        K: Number of layers.
        d: Width of each layer.
        h: Step size multiplying the activation in each layer.

        W, b, w and μ are drawn from a standard normal distribution and
        divided by sqrt(d).
        '''
        self.K = K
        self.d = d
        self.h = h
        scaling_factor = np.sqrt(d)
        self.W = np.random.randn(K, d, d) / scaling_factor
        self.b = np.random.randn(K, d, 1) / scaling_factor
        self.w = np.random.randn(d, 1) / scaling_factor
        self.μ = np.random.randn(1) / scaling_factor

    def forward(self, Y):
        '''Run the network forward, storing every layer for back propagation.

        Y: Float array of shape (K+1, d, I). Y[0,:,:] must hold the input, one
           column of length d per data point. Y[1:] is overwritten in place
           with the intermediate layers.

        Returns (Z, Y), where Z has shape (I, 1) and Y is the array that was
        passed in.
        '''
        for k in range(self.K):
            # b[k] has shape (d, 1) and is broadcast over the I columns.
            Y[k+1,:,:] = Y[k,:,:] + self.h * self.σ(self.W[k,:,:] @ Y[k,:,:] + self.b[k,:])
        Z = self.𝜂(np.transpose(Y[self.K,:,:]) @ self.w + self.μ)
        return Z, Y

    def fast_forward(self, Y0):
        '''Run the network without storing data for back propagation.

        Y0: Array of shape (d, I) holding I inputs of size d as columns.
            It is not modified.

        Returns Z with shape (I, 1).
        '''
        # Always work in floating point: writing the layer update into buffers
        # with an integer dtype would silently truncate it.
        Y_new = np.array(Y0, dtype=float)
        Y_old = np.empty_like(Y_new)
        for k in range(self.K):
            # Ping-pong between two buffers instead of allocating per layer.
            Y_old, Y_new = Y_new, Y_old
            Y_new[:,:] = Y_old + self.h * self.σ(self.W[k,:,:] @ Y_old + self.b[k,:])
        Z = self.𝜂(np.transpose(Y_new) @ self.w + self.μ)
        return Z

    def fast_landscape(self, Y):
        '''Apply only the final projection 𝜂(Yᵀ w + μ) to Y of shape (d, N).

        Used to visualize how the trained net separates points in the plane.
        '''
        Z = self.𝜂(np.transpose(Y) @ self.w + self.μ)
        return Z

    @staticmethod
    def σ(x):
        '''Activation function, tanh(x).

        To use another function, inherit this class and define a new one.
        Remember to also update dσ!'''
        return np.tanh(x)

    @staticmethod
    def dσ(x):
        '''Derivative of σ(x).'''
        return 1.0 - np.tanh(x) ** 2

    @staticmethod
    def 𝜂(x):
        '''Used instead of σ in the last step. Maps x into the interval (0, 1).'''
        return (1.0 + np.tanh(x / 2.0)) / 2.0

    @staticmethod
    def d𝜂(x):
        '''Derivative of 𝜂(x).'''
        return 0.25 * (1.0 - np.tanh(x / 2.0) ** 2)


Det neste vi gjør er å lage en egen klasse for dataene i gradienten. Dette gjør det mulig å unngå å allokere minne til dette i hvert skritt av treningen senere. Alle metodene er hjelpemetoder som sørger for at Adam descent-algoritmen kan skrives relativt konsist.

In [2]:
class Gradient():
    '''Container that lumps the four gradient arrays together in one variable.

    The arrays W, b, w and μ are allocated once, in __init__(), so the
    in-place operations below do not allocate new memory.'''
    def __init__(self, K, d):
        '''Allocate zero-filled arrays.

        K: Number of layers
        d: width of each layer.'''
        self.W = np.zeros((K, d, d))
        self.b = np.zeros((K, d, 1))
        self.w = np.zeros((d, 1))
        self.μ = np.zeros((1,))

    def _arrays(self):
        '''Return the four arrays in a fixed order: W, b, w, μ.'''
        return (self.W, self.b, self.w, self.μ)

    # The following methods are only needed for Adam descent.
    def squared(self):
        '''Square every element, in place.'''
        for arr in self._arrays():
            np.square(arr, out=arr)

    def sqrt(self):
        '''Take the square root of every element, in place.'''
        for arr in self._arrays():
            np.sqrt(arr, out=arr)

    # The methods below implement operator overloading.
    def __iadd__(self, other):
        '''In-place add: other is either another Gradient or a scalar.'''
        if isinstance(other, Gradient):
            for mine, theirs in zip(self._arrays(), other._arrays()):
                mine += theirs
            return self
        for mine in self._arrays():
            mine += other
        return self

    def __imul__(self, other):
        '''In-place multiplication by a scalar.'''
        for mine in self._arrays():
            mine *= other
        return self

    def __itruediv__(self, other):
        '''In-place elementwise division by another Gradient.'''
        for mine, theirs in zip(self._arrays(), other._arrays()):
            mine /= theirs
        return self

    def __rmul__(self, other):
        '''Return scalar * gradient as a new Gradient; self is unchanged.'''
        K, d, _ = self.b.shape
        product = Gradient(K, d)
        for dst, src in zip(product._arrays(), self._arrays()):
            dst[:] = other * src
        return product

    def __truediv__(self, other):
        '''Return gradient / scalar as a new Gradient; self is unchanged.'''
        K, d, _ = self.b.shape
        quotient = Gradient(K, d)
        for dst, src in zip(quotient._arrays(), self._arrays()):
            dst[:] = src / other
        return quotient


Klassen __BackPropagator__ implementerer formlene i avsnitt 2.4 i oppgaveteksten. Klassen brukes også til å unngå å allokere minne til visse variabler på nytt i hver iterasjon av treningen.

In [3]:
class BackPropagator():
    '''Back propagation and gradient computation using pre-allocated memory.

    The result is written into self.gradient, which is reused between calls.'''
    def __init__(self, K, d, batch_size):
        '''Allocate memory.

        K: Number of layers.
        d: Width of each layer.
        batch_size: The number of data points used to calculate the gradient. May be
        I or some smaller number. See "stochastic gradient descent" and "mini-batch".
        '''
        self.batch_size = batch_size
        self.P = np.empty((K+1, d, batch_size))
        self.gradient = Gradient(K, d)

    def compute_gradient(self, ZmC, Y, model):
        '''Fill self.gradient and return it.

        ZmC: The network output Z minus the labels.
        Y: The layers stored by model.forward().
        model: The model whose parameters the gradient is taken with respect to.
        '''
        # Backward sweep first: the forward sweep reads self.P.
        self._backwards_propagation(ZmC, Y, model)
        self._forward_computations(ZmC, Y, model)
        return self.gradient

    def _backwards_propagation(self, ZmC, Y, model):
        '''Fill self.P, starting from the last layer and moving backwards.'''
        # Right boundary value for the backwards propagation.
        projected = np.transpose(Y[-1,:,:]) @ model.w + model.μ
        self.P[-1,:,:] = np.outer(model.w, ZmC * model.d𝜂(projected))
        # Backwards propagation: P[k] is computed from P[k+1].
        for k in reversed(range(model.K)):
            W_k = model.W[k,:,:]
            P_next = self.P[k+1,:,:]
            slope = model.dσ(W_k @ Y[k,:,:] + model.b[k,:])
            self.P[k,:,:] = P_next + (model.h * np.transpose(W_k)) @ (slope * P_next)

    def _forward_computations(self, ZmC, Y, model):
        '''Fill self.gradient, using the values of self.P computed beforehand.'''
        # Gradient of the projection parameters w and μ.
        last_layer = Y[-1,:,:]
        slope_out = model.d𝜂(np.transpose(last_layer) @ model.w + model.μ)
        self.gradient.μ[:] = np.sum(ZmC * slope_out)
        self.gradient.w[:,:] = last_layer @ (ZmC * slope_out)
        # Multiplying by a column of ones sums over the data points in the batch.
        ones = np.ones((self.batch_size, 1))
        # Sweep through all layers for the gradient of W and b.
        for k in range(model.K):
            slope = model.dσ(model.W[k,:,:] @ Y[k,:,:] + model.b[k,:])
            scaled = model.h * (self.P[k+1,:,:] * slope)
            self.gradient.W[k,:,:] = scaled @ np.transpose(Y[k,:,:])
            self.gradient.b[k,:,:] = scaled @ ones


# Trening
I dette kapittelet implementerer vi selve treningen av det nevrale nettverket. Dette tilsvarer avsnitt 3.1 i oppgaveteksten. Deretter introduserer vi de mer sofistikerte treningsalgoritmene beskrevet i avsnitt 2.3 i oppgaveteksten.

Siden vi ønsker å kunne kjøre trening med alle kombinasjoner av plain vanilla gradient descent, Adam og stochastic gradient descent, skriver vi først én klasse som beskriver den overordnede gangen i læringen. Deretter implementerer vi subklasser som legger til de ulike konkrete metodene.

*P.S.*

Legg merke til at alle parametrene til `__init__()`-metodene, med unntak av `model`, ikke er en del av selve modellen. Når nettverket først er trent, er det ikke behov for disse. Både treningsdata og optimeringsalgoritmer er altså ting man kan holde hemmelig for brukerne av nettverket.

In [4]:
class AbstractOptimizer():
    '''Skeleton of the training loop shared by all optimizers.

    This class is not usable on its own. _iterate() calls select_batch() and
    _update_model(gradient), and reads self.Y, self.C and self.bp; none of
    these are defined here. A subclass must provide the two methods, set
    self.batch_size and call _allocate_variables() before run() is used.
    '''
    def __init__(self, model, training_data, training_labels, maxiter, **kwargs):
        '''Takes in the model (the neural net itself), the training set, and some metaparameters.

        model: The network to train; it is updated in place.
        training_data: Array of shape (d, I), one column per data point.
        training_labels: Labels of the training data.
        maxiter: Number of iterations performed by run().
        kwargs: Ignored here; accepted so that subclasses can pass their
        remaining keyword arguments up the inheritance chain.
        '''
        self.model = model
        self.training_data = training_data
        self.training_labels = training_labels
        self.maxiter = maxiter
        self.d, self.training_set_size = training_data.shape

    def run(self):
        '''Perform maxiter iterations of the training.

        NB! Can take some time.'''
        # Compare before iterating, so that exactly maxiter iterations are run
        # (and none at all when maxiter is zero).
        it = 0
        while it < self.maxiter:
            self._iterate()
            it += 1

    def _iterate(self):
        '''Perform one iteration: forward sweep, gradient, parameter update.'''
        self.select_batch()
        Z, Y = self.model.forward(self.Y)    # First run a forward sweep to compute the classifier Z.
        ZmC = Z - self.C
        self._testing(ZmC)
        gradient = self.bp.compute_gradient(ZmC, Y, self.model)
        self._update_model(gradient)

    def _testing(self, ZmC_training):
        '''Hook called once per iteration, before the update. Does nothing here.'''
        pass

    def _allocate_variables(self):
        '''Allocate the arrays used by _iterate(). Requires self.batch_size.'''
        # NOTE: the labels are stored as booleans, so any non-zero label becomes True.
        self.C = np.empty((self.batch_size,1), dtype='bool')
        self.Y = np.empty((self.model.K+1,self.d,self.batch_size))
        self.bp = BackPropagator(self.model.K, self.d, self.batch_size)



Det er selvfølgelig en fare for over-tilpasning, så kryssvalidering er viktig. Følgende kode gjør det mulig å holde øye med hvordan det går i løpet av treningen:

In [None]:
class LoggingOptimizer(AbstractOptimizer):
    '''Class that stores some metrics during the optimization.

    One value is appended per iteration to each of the lists residuals,
    testing_residuals and error_rates.'''
    def __init__(self, model, training_data, training_labels, maxiter, *, testing_data, testing_labels, **kwargs):
        '''Store the testing set and create the empty metric lists.'''
        super().__init__(model, training_data, training_labels, maxiter, **kwargs)
        self.testing_data = testing_data
        self.testing_labels = testing_labels
        # The data points are the columns of testing_data.
        self.testing_set_size = testing_data.shape[1]
        self.residuals = []
        self.testing_residuals = []
        self.error_rates = []

    def _testing(self, ZmC_training):
        '''Record the training residual, the testing residual and the error rate.'''
        # Residual of the objective function on the current training batch.
        training_residual = 0.5 / self.batch_size * np.linalg.norm(ZmC_training)**2
        self.residuals.append(training_residual)

        # The same residual, evaluated on the testing set.
        testing_deviation = self.model.fast_forward(self.testing_data) - self.testing_labels
        testing_residual = 0.5 / self.testing_set_size * np.linalg.norm(testing_deviation)**2
        self.testing_residuals.append(testing_residual)

        # A deviation that rounds to something non-zero counts as a failure.
        error_rate = np.mean(np.abs(np.round(testing_deviation)))
        self.error_rates.append(error_rate)


import matplotlib.pyplot as plt


def plot_convergence(op):
    '''Plot the metrics stored in op on a logarithmic y-axis.

    op: An optimizer with the lists residuals, testing_residuals and
    error_rates, holding one value per iteration.
    '''
    plt.semilogy(np.array(op.residuals), label='Training residual')
    plt.semilogy(np.array(op.testing_residuals), label='Testing residual')
    # The error rate is the one logged from the testing data, so label it as such.
    plt.semilogy(np.array(op.error_rates), label='Testing error rate')
    plt.legend()
    plt.xlabel('Iterations')
    plt.show()


Først implementerer vi den enkleste optimeringen:

In [5]:
class NonBatchOptimizer(AbstractOptimizer):
    '''Optimizer that uses the entire training set in every iteration.'''
    def __init__(self, model, training_data, training_labels, maxiter, **kwargs):
        '''Allocate memory for the full training set and load it once.'''
        super().__init__(model, training_data, training_labels, maxiter, **kwargs)
        # The "batch" is the whole training set.
        self.batch_size = self.training_set_size
        self._allocate_variables()
        # The data never changes between iterations, so it is copied here
        # instead of in select_batch().
        self.C[...] = self.training_labels
        self.Y[0, ...] = self.training_data

    def select_batch(self):
        '''Do nothing: the training set was already loaded in __init__().'''
        return None


class PlainGradientOptimizer(AbstractOptimizer):
    '''Optimizer that updates the model with plain gradient descent.'''
    def __init__(self, model, training_data, training_labels, maxiter, *, 𝜏, **kwargs):
        '''Store the step length 𝜏; all other arguments are passed on.'''
        super().__init__(model, training_data, training_labels, maxiter, **kwargs)
        self.𝜏 = 𝜏

    def _update_model(self, gradient):
        '''Take one step of length 𝜏 in the direction of the negative gradient.'''
        model, step = self.model, self.𝜏
        model.W -= step * gradient.W
        model.b -= step * gradient.b
        model.w -= step * gradient.w
        model.μ -= step * gradient.μ


Når koden over fungerer, kan vi forsøke å implementere de mer sofistikerte algoritmene:

In [6]:
class BatchOptimizer(AbstractOptimizer):
    '''Optimizer that uses a mini-batch of the training set in each iteration.'''
    def __init__(self, model, training_data, training_labels, maxiter, *, batch_size, **kwargs):
        '''Store batch_size and allocate memory for one batch.'''
        super().__init__(model, training_data, training_labels, maxiter, **kwargs)
        self.batch_size = batch_size
        self._allocate_variables()

    def select_batch(self):
        '''Load batch_size consecutive data points, starting at a random index.'''
        # The "+ 1" makes the last possible window, ending at the final data
        # point, eligible as well.
        first = np.random.randint(self.training_set_size - self.batch_size + 1)
        last = first + self.batch_size
        self.Y[0,:,:] = self.training_data[:, first:last]
        self.C[:] = self.training_labels[first:last]


In [7]:
class AdamOptimizer(AbstractOptimizer):
    '''Optimizer that updates the model with the Adam descent method.'''
    def __init__(self, model, training_data, training_labels, maxiter, *,
                 α=0.01, β1=0.9, β2=0.999, ε=1e-8, **kwargs):
        '''Store the Adam parameters and allocate the two moment estimates.

        α: Step length.
        β1: Decay rate of the first moment estimate m.
        β2: Decay rate of the second moment estimate v.
        ε: Small number added to the denominator of the step.

        The defaults are the values that used to be hard-coded, so existing
        callers get the same behaviour as before.
        '''
        super().__init__(model, training_data, training_labels, maxiter, **kwargs)
        self.α = α
        self.β1 = β1
        self.β2 = β2
        self.ε = ε
        self.m = Gradient(self.model.K, self.d)
        self.v = Gradient(self.model.K, self.d)
        # Iteration counter, used for the bias correction of m and v.
        self.j = 1

    def _update_model(self, gradient):
        '''Actual Adam descent method.

        Much of the work is done by operator overloading in the class Gradient.

        NB! gradient is squared in place, so its contents are destroyed.'''
        # First moment: running average of the gradient, with bias correction.
        self.m *= self.β1
        self.m += (1 - self.β1) * gradient
        m = self.m / (1 - self.β1**self.j)

        # Second moment: running average of the squared gradient.
        gradient.squared()
        self.v *= self.β2
        self.v += (1 - self.β2) * gradient
        v = self.v / (1 - self.β2**self.j)
        v.sqrt()
        v += self.ε

        # The step is α * m / (sqrt(v) + ε), computed in place in m.
        m /= v
        m *= self.α

        self.model.W -= m.W
        self.model.b -= m.b
        self.model.w -= m.w
        self.model.μ -= m.μ

        self.j += 1


Og ved hjelp av magien som kalles "multiple inheritance", har vi plutselig tilgang på alle kombinasjonene av algoritmer:

In [8]:
# The simplest case.
class PlainNonBatch(PlainGradientOptimizer, NonBatchOptimizer, LoggingOptimizer):
    '''Plain gradient descent on the full training set, with logging.'''
    def __init__(self, model, training_data, training_labels, testing_data, testing_labels, maxiter, 𝜏):
        # The base classes take everything after maxiter as keyword arguments.
        super().__init__(
            model, training_data, training_labels, maxiter,
            testing_data=testing_data,
            testing_labels=testing_labels,
            𝜏=𝜏,
        )

# Add the Adam descent algorithm.
class AdamNonBatch(AdamOptimizer, NonBatchOptimizer, LoggingOptimizer):
    '''Adam descent on the full training set, with logging.'''
    def __init__(self, model, training_data, training_labels, testing_data, testing_labels, maxiter):
        # The base classes take everything after maxiter as keyword arguments.
        super().__init__(
            model, training_data, training_labels, maxiter,
            testing_data=testing_data,
            testing_labels=testing_labels,
        )

# Add mini-batches instead.
class PlainBatch(PlainGradientOptimizer, BatchOptimizer, LoggingOptimizer):
    '''Plain gradient descent on mini-batches, with logging.'''
    def __init__(self, model, training_data, training_labels, testing_data, testing_labels, maxiter, 𝜏, batch_size):
        # The base classes take everything after maxiter as keyword arguments.
        super().__init__(
            model, training_data, training_labels, maxiter,
            testing_data=testing_data,
            testing_labels=testing_labels,
            batch_size=batch_size,
            𝜏=𝜏,
        )

# Add both. IT JUST WORKS™.
class AdamBatch(AdamOptimizer, BatchOptimizer, LoggingOptimizer):
    '''Adam descent on mini-batches, with logging.'''
    def __init__(self, model, training_data, training_labels, testing_data, testing_labels, maxiter, batch_size):
        # The base classes take everything after maxiter as keyword arguments.
        super().__init__(
            model, training_data, training_labels, maxiter,
            testing_data=testing_data,
            testing_labels=testing_labels,
            batch_size=batch_size,
        )
