# Single layer Neural Network

In this notebook, we will code a single neuron and use it as a linear classifier with two inputs. The tuning of the neuron parameters is done by backpropagation using gradient descent.

In [None]:
from sklearn.datasets import make_blobs
import numpy as np

# matplotlib to display the data
import matplotlib
matplotlib.rc('font', size=16)
matplotlib.rc('xtick', labelsize=16) 
matplotlib.rc('ytick', labelsize=16) 
from matplotlib import pyplot as plt, cm
from matplotlib.colors import ListedColormap
%matplotlib inline

## Dataset

Let's create some labeled data in the form of (X, y) with an associated class which can be 0 or 1. For this we can use the function `make_blobs` in the `sklearn.datasets` module. Here we use 2 centers with coordinates (-0.5, -1.0) and (1.0, 1.0).

In [None]:
# Draw the labelled sample set. The elided arguments are exercise placeholders;
# the text above asks for the centers (-0.5, -1.0) and (1.0, 1.0).
X, y = make_blobs(n_features=..., random_state=42, centers=[...])
# store the labels as a column vector with one row per sample
y = y.reshape((y.shape[0], 1))
print(X.shape)
print(y.shape)

Plot our training data using `plt.scatter` to have a first visualization. Here we color the points with their labels stored in `y`.

In [None]:
# Scatter plot of the samples coloured by label; the two elided positional
# arguments are exercise placeholders for the point coordinates.
plt.scatter(..., ..., c=y.squeeze(), edgecolors='gray')
plt.title('training data with labels')
plt.axis('equal')
plt.show()

## Activation functions

Here we play with popular activation functions like heaviside (step function), tanh, ReLU or sigmoid.

In [None]:
def heaviside(x):
    """Heaviside (step function) activation.

    Exercise stub: the returned expression is intentionally left as ``...``.
    """
    return ...

def sigmoid(x):
    """Sigmoid activation.

    Exercise stub: the returned expression is intentionally left as ``...``.
    """
    return ...

def ReLU(x):
    """ReLU activation.

    Exercise stub: the returned expression is intentionally left as ``...``.
    """
    return ...

def leaky_ReLU(x, alpha=0.1):
    """Leaky ReLU activation.

    Exercise stub: the returned expression is intentionally left as ``...``.
    ``alpha`` defaults to 0.1; presumably it is the slope applied to negative
    inputs -- confirm when filling in the body.
    """
    return ...

def tanh(x):
    """Hyperbolic tangent activation.

    Exercise stub: the returned expression is intentionally left as ``...``.
    """
    return ...

Make a plot with all our activation functions

In [None]:
from math import pi

# Plot every activation function on [-pi, pi). The elided second argument of
# each plt.plot call is an exercise placeholder for the function values.
plt.figure()
x = np.arange(-pi, pi, 0.01)
# dashed guide lines at y = 0, -1, 1 and at x = 0
plt.axhline(y=0., color='gray', linestyle='dashed')
plt.axhline(y=-1, color='gray', linestyle='dashed')
plt.axhline(y=1., color='gray', linestyle='dashed')
plt.axvline(x=0., color='gray', linestyle='dashed')

plt.xlim(-pi, pi)
plt.ylim(-1.2, 1.2)
plt.title('activation functions', fontsize=16)
# legend label spelled like the function it refers to ("heaviside")
plt.plot(x, ..., label='heaviside', linewidth=3)
plt.plot(x, ..., label='sigmoid', linewidth=3)
plt.plot(x, ..., label='tanh', linewidth=3)
plt.plot(x, ..., label='ReLU', linewidth=3)
plt.plot(x, ..., label='leaky ReLU', linewidth=3)
plt.legend(loc='lower right')
plt.show()

In [None]:
# gradients of the activation functions
def sigmoid_grad(x):
    """Gradient of the sigmoid activation (exercise stub).

    ``s`` holds ``sigmoid(x)``; the returned expression is left as ``...``,
    presumably to be written in terms of ``s``.
    """
    s = sigmoid(x)
    return ...

def relu_grad(x):
    """Gradient of the ReLU activation (exercise stub).

    The returned expression is intentionally left as ``...``.
    """
    return ...

def tanh_grad(x):
    """Gradient of the tanh activation (exercise stub).

    The returned expression is intentionally left as ``...``; the placeholder
    must be exactly three dots so that the cell still parses.
    """
    return ...

Plot the gradients of the activation functions

In [None]:
# Draw the derivative of each activation function on [-pi, pi) and save the
# figure as a PDF before showing it.
plt.figure()
x = np.arange(-pi, pi, 0.01)
curves = (
    (sigmoid_grad, 'sigmoid gradient'),
    (relu_grad, 'ReLU gradient'),
    (tanh_grad, 'tanh gradient'),
)
for grad_fn, curve_label in curves:
    plt.plot(x, grad_fn(x), label=curve_label, linewidth=3)
plt.title('activation function derivatives', fontsize=16)
plt.xlim(-pi, pi)
legend = plt.legend()
# thicker frame around the legend box
legend.get_frame().set_linewidth(2)
plt.savefig('activation_functions_derivatives.pdf')
plt.show()

## ANN implementation

A simple neuron with two inputs $(x_1, x_2)$ which applies an affine transform with weights $(w_1, w_2)$ and bias $w_0$.

The neuron computes the quantity called activation: $a=\sum_i w_i x_i + w_0 = w_0 + w_1 x_1 + w_2 x_2$

This quantity is sent to the activation function, chosen here to be a sigmoid function: $f(a)=\dfrac{1}{1+e^{-a}}$

$f(a)$ is the output of the neuron bounded between 0 and 1.

### Quick implementation

First let's implement our network in a concise fashion.

In [None]:
import numpy as np
from numpy.random import randn

# 100 two-feature samples drawn around the centers (-0.5, -1) and (1, 1)
X, y = make_blobs(n_samples= 100, n_features=2, random_state=42, centers=[[-0.5, -1], [1, 1]])
# adjust the sizes of our arrays (both expressions are exercise placeholders)
X = ...
y = ...

np.random.seed(2)  # seed the generator before the weights are drawn
W = randn(...)  # random initial weights; the shape is an exercise placeholder
print('* model params: {}'.format(W.tolist()))
eta = 1e-2  # learning rate
n_epochs = 50  # number of iterations of the training loop below

for t in range(n_epochs):
    # forward pass: prediction and loss (exercise placeholders)
    y_pred = ...
    loss = ...
    print(t, loss)

    # backprop: gradient w.r.t. the prediction, then w.r.t. the weights
    # (exercise placeholders)
    grad_y_pred = ...
    grad_W = ...

    # update rule: in-place update of W (step expression is a placeholder)
    W -= ...
print('* new model params: {}'.format(W.tolist()))


### Modular implementation

Now let's create a class to represent our neural network to have more flexibility and modularity. This will prove to be useful later when we add more layers.

In [None]:
class SingleLayerNeuralNetwork:
    """A simple artificial neuron with a single layer and two inputs.

    This type of network is called a Single Layer Neural Network and belongs to
    the Feed-Forward Neural Networks. Here, the activation function is a sigmoid,
    the loss is computed using the squared error between the target and
    the prediction. Learning the parameters is achieved using back-propagation
    and gradient descent.

    Expressions written as ``...`` are exercise placeholders to be completed.

    Attributes:
        W: model weights; ``plot_model`` reads them as ``W[0, 0]``, ``W[1, 0]``
            and ``W[2, 0]`` and labels them w_0, w_1, w_2.
        eta: learning rate.
        loss_history: loss recorded by ``fit`` before training and after every
            weight update (once per epoch in batch mode, once per sample
            otherwise).
    """

    def __init__(self, eta=0.01, rand_seed=42):
        """Initialisation routine.

        Args:
            eta: learning rate.
            rand_seed: value passed to ``np.random.seed`` before the weights
                are created.
        """
        np.random.seed(rand_seed)
        self.W = ...  # weights
        self.eta = eta  # learning rate
        self.loss_history = []
        # training set remembered by fit() so that plot_model() can draw it
        # without relying on notebook-level globals
        self._X_train = None
        self._y_train = None

    def sigmoid(self, x):
        """Our activation function."""
        return ...

    def sigmoid_grad(self, x):
        """Gradient of the sigmoid function."""
        return ...

    def predict(self, X, bias_trick=True):
        """Apply the model to one sample or to a batch of samples.

        Args:
            X: a single sample or a 2-D array with one sample per row; it is
                promoted to 2-D with ``np.atleast_2d``.
            bias_trick: when True, a leading column of ones is prepended to X.
                Pass False when X already contains that column.
        """
        X = np.atleast_2d(X)
        if bias_trick:
            # bias trick: add a column of 1 to X
            X = np.c_[np.ones((X.shape[0])), X]
        return ...

    def loss(self, X, y, bias_trick=False):
        """Compute the squared error loss for a given set of inputs.

        Note that ``bias_trick`` defaults to False here (True in ``predict``)
        because ``fit`` calls this method with the bias column already added.
        """
        y_pred = self.predict(X, bias_trick=bias_trick)
        # column vector, one row per sample
        y_pred = y_pred.reshape((y_pred.shape[0], 1))
        loss = ...
        return loss

    def back_propagation(self, X, y):
        """Conduct backpropagation to update the weights.

        ``fit`` passes X with the bias column already prepended, either as the
        whole training set (batch) or as a single row (SGD).
        """
        X = np.atleast_2d(X)
        y_pred = self.sigmoid(...).reshape((X.shape[0], 1))
        grad_y_pred = ...
        grad_W = ...

        # update weights
        self.W -= ...

    def fit(self, X, y, n_epochs=10, method='batch', save_fig=False):
        """Perform gradient descent on a given number of epochs to update the weights.

        Args:
            X: training inputs without bias column, one sample per row.
            y: training targets.
            n_epochs: number of passes over the training set.
            method: ``'batch'`` updates the weights once per epoch using the
                whole set; any other value updates them once per sample (SGD).
            save_fig: when True, a figure is saved after every epoch through
                ``plot_model``.
        """
        # remember the raw training set for plot_model()
        self._X_train, self._y_train = X, y
        # bias trick: add a column of 1 to X
        X = np.c_[np.ones((X.shape[0])), X]
        self.loss_history.append(self.loss(X, y))  # initial loss
        for i_epoch in range(n_epochs):
            if method == 'batch':
                # perform backprop on the whole training set (batch)
                self.back_propagation(X, y)
                # weights were updated, compute the loss
                loss = self.loss(X, y)
                self.loss_history.append(loss)
                print(i_epoch, self.loss_history[-1])
            else:
                # here we update the weight for every data point (SGD)
                for (xi, yi) in zip(X, y):
                    self.back_propagation(xi, yi)
                    # weights were updated, compute the loss
                    loss = self.loss(X, y)
                    self.loss_history.append(loss)
            if save_fig:
                self.plot_model(i_epoch, save=True, display=False)

    def decision_boundary(self, x):
        """Return the decision boundary in 2D.

        ``plot_model`` plots the returned values against ``x``.
        """
        return ...

    def plot_model(self, i_epoch=-1, save=False, display=True, X=None, y=None):
        """Build a figure to visualise how the model performs.

        The figure has three panels: the prediction map with the decision
        boundary and the data points, a bar chart of the three weights, and
        the loss history with a marker on its most recent value (the one that
        corresponds to the weights being drawn).

        Args:
            i_epoch: number used in the file name of the saved figure; a
                negative value selects the index of the latest loss entry.
            save: when True, save the figure as ``an_<i_epoch>.png``.
            display: when True, show the figure.
            X, y: points and labels to overlay on the prediction map. When
                omitted, the training set given to the last ``fit`` call is
                used; if there is none, no points are drawn.
        """
        if X is None or y is None:
            X, y = self._X_train, self._y_train
        xx0, xx1 = np.arange(-3, 3.1, 0.1), np.arange(-3, 4.1, 0.1)
        XX0, XX1 = np.meshgrid(xx0, xx1)
        # apply the model to the whole grid in one vectorised call
        grid = np.c_[XX0.ravel(), XX1.ravel()]
        y_an = np.asarray(self.predict(grid), dtype=float).reshape(XX0.shape)

        plt.figure(figsize=(12, 4))
        ax1 = plt.subplot(1, 3, 1)
        ax1.set_title("current prediction")
        ax1.contourf(XX0, XX1, y_an, alpha=.5)
        if X is not None and y is not None:
            pts = np.asarray(X)
            ax1.scatter(pts[:, 0], pts[:, 1], c=np.asarray(y).squeeze(), edgecolors='gray')
        ax1.set_xlim(-3, 3)
        ax1.set_ylim(-3, 4)
        x_bounds = np.array(ax1.get_xlim())
        ax1.plot(x_bounds, self.decision_boundary(x_bounds), 'k-', linewidth=2)

        ax2 = plt.subplot(1, 3, 2)
        ticks = np.arange(3)  # the label locations
        ax2.bar(ticks, [self.W[0, 0], self.W[1, 0], self.W[2, 0]])
        ax2.set_title('model parameters')
        ax2.set_xticks(ticks)
        ax2.set_xticklabels([r'$w_0$', r'$w_1$', r'$w_2$'])
        ax2.set_ylim(-1, 2)
        ax2.set_yticks([0, 2])
        ax2.axhline(xmin=0, xmax=2)

        ax3 = plt.subplot(1, 3, 3)
        ax3.plot(self.loss_history, c='lightgray', lw=2)
        last = len(self.loss_history) - 1
        if i_epoch < 0:
            i_epoch = max(last, 0)
        if last >= 0:
            # the figure shows the current weights, so mark the latest loss
            ax3.plot(last, self.loss_history[last], 'o')
        ax3.set_title('loss evolution')
        ax3.set_yticks([])
        plt.subplots_adjust(left=0.05, right=0.98)
        if save:
            plt.savefig('an_%02d.png' % i_epoch)
        if display:
            plt.show()
        plt.close()


### Train our model on the data set

Create two blobs with $n=1000$ data points.

Instantiate the model with $\eta$=0.1 and a random seed of 2.

Train the model using the batch gradient descent on 20 epochs.

In [None]:
# two blobs around (-0.5, -1) and (1, 1); the sample count is an exercise placeholder
X, y = make_blobs(n_samples=..., n_features=2, random_state=42, centers=[[-0.5, -1], [1, 1]])
# labels as a column vector, one row per sample
y = y.reshape((y.shape[0], 1))

# constructor arguments are an exercise placeholder
an1 = SingleLayerNeuralNetwork(...)
print('* init model params: {}'.format(an1.W.tolist()))
# loss before training; X has no bias column here, hence bias_trick=True
print(an1.loss(X, y, bias_trick=True))
# batch gradient descent; the number of epochs is an exercise placeholder
an1.fit(X, y, n_epochs=..., method='batch', save_fig=False)
print('* new model params: {}'.format(an1.W.tolist()))

Now that we have trained our model, plot the results.

In [None]:
# three-panel figure for the batch-trained model: prediction map, weights, loss history
an1.plot_model()

Now try to train another network using SGD. Use only 1 epoch since with SGD, we are updating the weights with every training point (so $n$ times per epoch).

In [None]:
# second model, trained with per-sample updates
an2 = SingleLayerNeuralNetwork(eta=0.1, rand_seed=2)
print('* init model params: {}'.format(an2.W.tolist()))
# any method other than 'batch' makes fit() update the weights once per sample
an2.fit(X, y, n_epochs=1, method='SGD', save_fig=False)
print('* new model params: {}'.format(an2.W.tolist()))

Plot the difference in terms of loss evolution using batch or stochastic gradient descent.

In [None]:
# batch history: the initial loss followed by one value per epoch
plt.plot(an1.loss_history[:], label='batch GD')
# SGD history holds one value per sample update, so only every 100th value is drawn
plt.plot(an2.loss_history[0::100], label='stochastic GD')
plt.ylim(0, 2000)
plt.legend()
plt.show()

In [None]:
# three-panel figure for the SGD-trained model: prediction map, weights, loss history
an2.plot_model()

## Logistic regression

Our single layer network using the logistic function for activation is very similar to the logistic regression we saw in a previous tutorial. We can easily compare our result with the logistic regression using `sklearn` toolbox.

In [None]:
from sklearn.linear_model import LogisticRegression

# regenerate the 1000-sample dataset; the labels are used as returned by
# make_blobs (not reshaped to a column as in the earlier cells)
X, y = make_blobs(n_samples=1000, n_features=2, random_state=42, centers=[[-0.5, -1], [1, 1]])
log_reg = LogisticRegression(solver='lbfgs')
log_reg.fit(X, y)
# fitted coefficients and intercept, to compare with the parameters of our neuron
print(log_reg.coef_)
print(log_reg.intercept_)

In [None]:
# Evaluate the fitted logistic regression on a regular grid and draw the
# class-1 probability map, the samples and the decision boundary.
x0, x1 = np.meshgrid(np.linspace(-3, 3.1, 62), np.linspace(-3, 4.1, 72))
X_new = np.c_[x0.ravel(), x1.ravel()]

# probability of class 1 for every grid node, reshaped back onto the grid
y_proba = log_reg.predict_proba(X_new)
zz = y_proba[:, 1].reshape(x0.shape)

plt.figure(figsize=(4, 4))
contour = plt.contourf(x0, x1, zz, alpha=0.5)
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='gray')

# decision boundary: the line where coef_[0][0] * x + coef_[0][1] * y + intercept_[0] = 0
x_bounds = np.array([-3, 3])
weights = log_reg.coef_[0]
offset = log_reg.intercept_[0]
boundary = -(weights[0] * x_bounds + offset) / weights[1]
plt.plot(x_bounds, boundary, "k-", linewidth=3)

plt.ylim(-3, 4)
plt.xlim(-3, 3)
plt.show()