In [5]:
import matplotlib.pyplot as plt
import numpy as np
import time
%matplotlib tk

In [13]:
## =============== ##
## Define our data ##
## =============== ##
# Matplotlib colour-cycle entries used for each class (reused by later cells)
color_c0 = 'C0'
color_c1 = 'C1'

# Inputs to our model: one scalar feature per observation, stored as a column vector (N, 1)
x_data = np.array([0,1.5,2,2.5,3,4,5]).reshape(-1,1)
# Binary class label (0 or 1) associated with each input, also a column vector (N, 1)
t_data = np.array([0,0,0,0,1,1,1]).reshape(-1,1)

## display
# boolean masks selecting the observations of each class
idx_class0 = (t_data == 0)
idx_class1 = (t_data == 1)

# class 0 is drawn with circles, class 1 with stars
plt.plot(
    x_data[idx_class0], t_data[idx_class0], 'o',
    color = color_c0, markersize = 8, label = 'data observations class 0',
)
plt.plot(
    x_data[idx_class1], t_data[idx_class1], '*',
    color = color_c1, markersize = 8, label = 'data observations class 1',
)
plt.xlabel('imagen')
plt.ylabel('clase asociada')
plt.legend()

<matplotlib.legend.Legend at 0x7fe7921ed190>

In [7]:
## ======================================================= ##
## ======== functionality for computational graph ======== ##
## ======================================================= ##

## activation function sigmoid
def activation_function_sigmoid(x):
    '''Element-wise logistic sigmoid, 1 / (1 + exp(-x)).

    The textbook formula evaluates exp(-x), which overflows (and makes NumPy
    emit a RuntimeWarning) for large negative x. Here the exponential is only
    ever evaluated on a non-positive argument, so it always lies in [0, 1].

    Parameters
    ----------
    x : scalar or array-like of real numbers.

    Returns
    -------
    Sigmoid of x, with the same shape as x (a NumPy scalar for scalar input).
    '''
    x = np.asarray(x)
    # exp(-|x|) is in [0, 1], so it cannot overflow
    e = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x))      x < 0: exp(x) / (1 + exp(x))   (same function)
    y = np.where(x >= 0, 1. / (1. + e), e / (1. + e))
    # indexing with () turns a 0-d result into a scalar and is a no-op for real arrays
    return y[()]
    
## function that implements the computational graph
def computation_graph_linear(x,w,b):
    ''' Forward pass of a one-layer network: sigmoid(x·w + b).

    x holds the inputs, w the weights and b the bias; the matrix product is
    taken with np.matmul and the sigmoid is applied element-wise to the result.
    '''
    # x multiplies w from the left (x·w), then the bias is added by broadcasting
    pre_activation = np.matmul(x,w) + b
    return activation_function_sigmoid(pre_activation)

## function that implements the computational graph using only the weight (no bias term)
def computation_graph_linear_just_weight(x,w):
    ''' Forward pass of a one-layer network without bias: sigmoid(x·w).

    x holds the inputs and w the weights; the matrix product is taken with
    np.matmul and the sigmoid is applied element-wise to the result.
    '''
    # x multiplies w from the left (x·w); there is no bias term in this variant
    pre_activation = np.matmul(x,w)
    return activation_function_sigmoid(pre_activation)

## function that initializes the values of a computational graph
def create_computation_graph_linear(n_in,n_out,rng=None):
    ''' Create the randomly initialised parameters of the linear computational graph.

    Parameters
    ----------
    n_in : int
        Number of inputs of the graph.
    n_out : int
        Number of outputs of the graph.
    rng : np.random.Generator or np.random.RandomState, optional
        Source of randomness. When None (default) NumPy's global random state
        is used, exactly as before, so np.random.seed(...) still controls it.
        Pass e.g. np.random.default_rng(seed) for a self-contained, reproducible draw.

    Returns
    -------
    w : ndarray of shape (n_in, n_out)
        Weights drawn from a Gaussian with mean 1 and standard deviation 1.
    b : ndarray of shape (n_out,)
        Biases drawn from a Gaussian with mean 0 and standard deviation 5.
    '''
    # np.random.standard_normal draws from the same global stream as np.random.randn
    draw = np.random.standard_normal if rng is None else rng.standard_normal

    # parameters
    w = draw(size=(n_in,n_out)) + 1 # standard normal shifted by 1 -> mean 1, std 1
    b = draw(size=(n_out,))*5 # standard normal scaled by 5 -> mean 0, std 5

    return w,b

## function implementing brier score loss function (yeah it is like squared loss)
def brier_loss_function_just_weight(x,t,w):
    ''' Per-observation Brier (squared error) loss of the bias-free model sigmoid(x·w).

    Returns (y_pred - t)**2 element-wise; nothing is summed or averaged here.
    '''
    prediction = activation_function_sigmoid(np.matmul(x,w))
    residual = prediction-t
    return residual**2

## function implementing binary cross entropy loss
def binary_cross_entropy_loss_function_just_weight(x,t,w):
    ''' Per-observation binary cross entropy of the bias-free model sigmoid(x·w).

    For targets equal to 1 the loss is -log(y_pred) and for targets equal to 0
    it is -log(1 - y_pred). Entries whose target is neither 0 nor 1 keep a
    zero loss. Nothing is summed or averaged here.
    '''
    probabilities = activation_function_sigmoid(np.matmul(x,w))

    # replicate the targets along every leading axis of the predictions
    # (e.g. one copy per candidate weight) so the boolean masks below line up
    leading_axes = probabilities.shape[0:-2]
    targets = np.tile(t, leading_axes+(1,1))

    is_positive = targets==1
    is_negative = targets==0

    log_likelihood = np.zeros(probabilities.shape)
    log_likelihood[is_positive] = targets[is_positive]*np.log(probabilities[is_positive])
    log_likelihood[is_negative] = (1-targets[is_negative])*np.log(1-probabilities[is_negative])

    # the loss is the negative log-likelihood
    return -1*log_likelihood

def grad_activation_function_sigmoid(x):
    ''' Derivative of the sigmoid with respect to its input: s(x) * (1 - s(x)). '''
    s = activation_function_sigmoid(x)
    return s * (1-s)

def grad_brier_loss_just_weight(x,t,w):
    ''' Gradient w.r.t. w of the Brier loss summed over observations, for y = sigmoid(x·w).

    The per-observation contributions are summed along axis 0 and that axis is
    kept, so the result has a leading dimension of size 1.
    '''
    ## forward operation
    pre_activation = np.matmul(x,w)
    prediction = activation_function_sigmoid(pre_activation)

    ## backward operation: local derivative of every node of the graph
    dloss_dpred = 2*(prediction-t)                                  # dC/dy of (y - t)^2
    dpred_dpre = grad_activation_function_sigmoid(pre_activation)   # dy/dz of the sigmoid
    dpre_dw = x                                                     # dz/dw of z = x*w

    # chain rule, dC/dw = dC/dy * dy/dz * dz/dw, evaluated per observation ...
    per_observation = dloss_dpred * dpred_dpre * dpre_dw

    # ... and accumulated over the observations
    return np.sum(per_observation, axis = 0, keepdims = True)

def grad_binary_cross_entropy_loss_just_weight(x,t,w):
    ''' Gradient w.r.t. w of the binary cross entropy summed over observations, for y = sigmoid(x·w).

    The loss of one observation is C = -[t*log(y) + (1-t)*log(1-y)], hence
        dC/dy = -1/y       where t == 1
        dC/dy =  1/(1-y)   where t == 0
    (the derivative of the logarithm, not the logarithm itself). Entries whose
    target is neither 0 nor 1 contribute nothing, matching
    binary_cross_entropy_loss_function_just_weight.

    The per-observation contributions are summed along axis 0 and that axis is
    kept, so the result has a leading dimension of size 1.
    '''
    # forward operation
    z = np.matmul(x,w)
    y_pred = activation_function_sigmoid(z)

    ## Backward operation (compute gradients / backpropagation / reverse mode autodiff)
    # applying chain rule (just one gradient missing for real reverse mode autodiff) :)
    is_positive = t==1
    is_negative = t==0

    dC_dy = np.zeros(y_pred.shape)
    dC_dy[is_positive] = -1. / y_pred[is_positive]
    dC_dy[is_negative] = 1. / (1. - y_pred[is_negative])

    dy_dz = grad_activation_function_sigmoid(z)
    dz_dw = x

    # compute the gradient by applying chain rule: dC/dW = dC/dy * dy/dz * dz/dw
    # (for 0/1 targets the product dC/dy * dy/dz simplifies to y_pred - t)
    grad_w = np.sum( dC_dy * dy_dz * dz_dw, axis = 0, keepdims = True)

    return grad_w


In [12]:
## ====================================== ##
## ========== Gradient Descent ========== ##
## ====================================== ##
## Animated gradient descent on the single weight of y = sigmoid(w*x).
##   left panel  (ax1): accumulated loss as a function of w, the tangent at the
##                      current w and the update produced by one descent step
##   right panel (ax2): the data, the current function and the loss contributed
##                      by every observation

def _refresh_figure(figure, pause = 0.):
    ''' Redraw figure, let the GUI backend process pending events, then wait pause seconds. '''
    figure.canvas.draw()
    figure.canvas.flush_events()
    if pause > 0:
        time.sleep(pause)

def _draw_loss_curve(ax, weights, losses, y_limits):
    ''' Label ax, fix its y-limits and plot the accumulated loss over the grid of weights. '''
    ax.set_xlabel('Weight')
    ax.set_ylabel('Loss')
    ax.set_ylim(y_limits)
    ax.plot(weights, losses, color = 'C0', label = 'loss', zorder = 20)

## select loss function to work at: 'brier' or 'bce'
loss_name = 'bce'
## pause, in seconds, between two consecutive frames of the animation
sleep_time_simulation = 0.5

## pick the loss and its gradient once, so nothing below has to branch on loss_name
if loss_name == 'brier':
    loss_function = brier_loss_function_just_weight
    grad_loss_function = grad_brier_loss_just_weight
elif loss_name == 'bce':
    loss_function = binary_cross_entropy_loss_function_just_weight
    grad_loss_function = grad_binary_cross_entropy_loss_just_weight
else:
    raise NotImplementedError(f"Unknown loss {loss_name}, choose from brier or bce")

## number of points in the domain used to plot the functions
N_points_domain = 100
x_range = np.linspace(-1,7, N_points_domain).reshape((N_points_domain,1))

## specify our computational graph
n_in = 1
n_out = 1

## first of all evaluate the loss function on a grid of candidate weights
w_range = np.linspace(-2,4,500).reshape((500,n_in,n_out))
loss_range = loss_function(x_data,t_data,w_range)

## accumulate loss per datapoint (axis 1 indexes the observations)
loss_acc_range = np.sum(loss_range, axis = 1)

## squeeze for display
loss_acc_range = np.squeeze(loss_acc_range)
w_range = np.squeeze(w_range)

## get some limits for plots
y_lim_min = np.min(loss_acc_range) - 0.5*np.std(loss_acc_range)
y_lim_max = np.max(loss_acc_range) + 0.5*np.std(loss_acc_range)
y_lim_inc = 0.1*np.std(loss_acc_range)

# plot grid
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 6))

# display loss function (redrawn at every iteration because the axes are cleared)
_draw_loss_curve(ax1, w_range, loss_acc_range, [y_lim_min,y_lim_max])

# Initialize parameters
w = np.array([-0.75]).reshape(n_in,n_out) # starting value of the weight

## gradient descent parameters
lr = 0.1
epochs = 20

## quantities that do not change between iterations
idx_class0 = t_data == 0
idx_class1 = t_data == 1
x_data_plot = np.squeeze(x_data)
t_data_plot = np.squeeze(t_data)

for e in range(epochs):

    ## forward plus backward
    grad_w = grad_loss_function(x_data,t_data,w)

    ## compute function at current parameter value
    function = computation_graph_linear_just_weight(x_range, w)

    ## compute predictions at current parameter value
    y_pred = computation_graph_linear_just_weight(x_data, w)

    ## compute loss at current parameter value
    loss = loss_function(x_data,t_data,w)
    loss_acc = np.sum(loss)

    ## straight line through (w, loss_acc) with slope grad_w, evaluated on the grid of weights
    gradient_function_w_at_current_w = grad_w * w_range + loss_acc - grad_w * w
    tangent_plot = np.squeeze(gradient_function_w_at_current_w)

    ## ============= ##
    ## ============= ##
    ## START DRAWING ##
    ## ============= ##
    ## ============= ##
    # Clear previous data
    ax1.clear()
    ax2.clear()

    w_plot = np.squeeze(w)
    grad_w_plot = np.squeeze(grad_w)
    y_pred_plot = np.squeeze(y_pred)
    loss_plot = np.squeeze(loss)

    # get new weight after grad descent. Just for illustration purposes, the real step is done at the end of the loop
    w_new_plot = np.squeeze(w-lr*grad_w)

    ## ================ ##
    ## function picture ##
    ax2.plot(x_range,function, color = 'C3', label = 'function: y = sigmoid(w*x)')
    ax2.plot(x_data[idx_class0],t_data[idx_class0],'o', color = color_c0, markersize = 8, label = 'data observations class 0')
    ax2.plot(x_data[idx_class1],t_data[idx_class1],'*', color = color_c1,markersize = 8, label = 'data observations class 1')

    ## plot loss associated at each point and draw line between dots to highlight what the loss measures
    for idx, (xi, ti, yi, sl) in enumerate(zip(x_data_plot,t_data_plot,y_pred_plot,loss_plot)):
        # only the first prediction is labelled, so the legend shows a single entry
        if idx == 0:
            ax2.plot(xi,yi, 'x', color = 'C2', label = 'network prediction')
        else:
            ax2.plot(xi,yi, 'x', color = 'C2')
        ax2.plot([xi,xi], [ti, yi], '--',color = "C1", alpha = 0.5)
        ax2.text(xi, yi, f'{sl:.2f}', fontsize=12, va='top', color = "C2" )

    # label function with the weight at that moment (scalar coordinates, not size-1 arrays)
    ax2.text(x_range[-20, 0],function[-20, 0], f'w = {w_plot}', color = 'k', fontsize = 12)

    # y = 1.65 lies above the y-limits set below, so this text is placed above the axes area
    ax2.text(1, 1.65, f"Iteration {e}, {loss_name} loss = {loss_acc:.3f}", fontsize=12, va='bottom', color = "C2" )
    ax2.set_xlabel('imagen')
    ax2.set_ylabel('clase asociada')
    ax2.set_ylim([-0.5,1.5])
    ax2.legend()

    ## draw
    _refresh_figure(fig)

    ## ===================== ##
    ## loss function picture ##

    ## 0. label and axis limits + 1. display loss function
    _draw_loss_curve(ax1, w_range, loss_acc_range, [y_lim_min,y_lim_max])

    ## 2. display current weight
    ax1.plot(w_plot, y_lim_min + 0.5*y_lim_inc, '*', color = 'C1', label = 'current weight', zorder = 50, markersize = 10)
    ax1.text(w_plot, y_lim_min + 0.5*y_lim_inc , f"w = {w_plot:.3f}", fontsize=12, va='bottom', color = "C1" , zorder = 50)
    ax1.legend()

    _refresh_figure(fig, sleep_time_simulation)

    ## animation: dotted vertical line from the bottom of the axes up to the loss at the current weight
    ax1.vlines(w_plot, ymin=y_lim_min, ymax=loss_acc, color='k', linestyles='dotted', zorder = -50)

    ## 3. display current loss
    ax1.plot(w_plot, loss_acc, 'o', color = 'C0', label = 'loss at current weight', zorder = 20)
    ax1.text(w_plot + 0.5, loss_acc , f"loss = {loss_acc:.3f}", fontsize=12, va='bottom', color = "C0" , zorder = 50)
    ax1.legend()

    _refresh_figure(fig, sleep_time_simulation)

    ## 4. display the gradient function
    ax1.plot(w_range, tangent_plot, color = 'C2', label = 'gradient function: f(w) = grad_w * w + loss - grad_w * w', zorder = 20)
    ax1.text(w_range[-1], tangent_plot[-1], f"grad_w = {grad_w_plot:.3f}", fontsize=12, va='bottom', color = "C2" , zorder = 200)
    ax1.legend()

    _refresh_figure(fig, sleep_time_simulation)

    ## draw rest of lines to show update: horizontal move from the current to the updated weight ...
    ax1.hlines(y = loss_acc, xmin=w_new_plot, xmax=w_plot, color='k', linestyles='dotted', zorder = -50)

    _refresh_figure(fig, sleep_time_simulation)

    ## ... then vertical drop onto the updated weight
    ax1.vlines(w_new_plot, ymin=y_lim_min, ymax=loss_acc, color='k', linestyles='dotted', zorder = -50)

    _refresh_figure(fig, sleep_time_simulation)

    ## 5. display new weight
    ax1.plot(w_new_plot, y_lim_min+ 0.5*y_lim_inc, '*', color = 'C3', label = 'updated weight: w_new = w - lr*grad_w', zorder = 200, markersize = 10)
    ax1.text(w_new_plot, y_lim_min+1.5*y_lim_inc, f"w_new = {w_plot:.3f} -{lr:.3f}*{grad_w_plot:.3f} = {w_plot-lr*grad_w_plot:.3f}", fontsize=12, va='bottom', color = "C3" , zorder = 200)
    ax1.legend()

    _refresh_figure(fig, sleep_time_simulation)

    ## wait to see
    time.sleep(sleep_time_simulation)

    ## update parameter with gradient descent, for the next update
    w = w-lr*grad_w