In [2]:
import matplotlib.pyplot as plt
import numpy as np
import time
%matplotlib tk

In [3]:
## =============== ##
## Define our data ##
## =============== ##

# Model inputs: time in seconds, stored as a column vector (one row per observation).
x_data = np.array([0, 1, 2]).reshape(-1, 1)
# Targets paired with each input: amount of rainfall in mm^3, also a column vector.
t_data = np.array([0.2, 1.3, 2.4]).reshape(-1, 1)

## Scatter the raw observations so they can be inspected before any fitting.
plt.plot(x_data, t_data, 'o', label='data observations', markersize=8)
plt.xlabel('tiempo')
plt.ylabel('cantidad de lluvia')
plt.legend()

<matplotlib.legend.Legend at 0x7f01bb876f50>

In [4]:
## ======================================================= ##
## ======== functionality for computational graph ======== ##
## ======================================================= ##

## identity (linear) activation function
def activation_function_linear(x):
    ''' Linear activation: hands the input back unchanged (the very same object, no copy is made)'''
    return x

## forward pass of the linear computational graph (weights plus bias)
def computation_graph_linear(x,w,b):
    ''' Forward pass of a one-layer linear network: activation(x w + b).

    x: inputs, multiplied on the left of w with np.matmul
    w: weights
    b: bias, added to the matrix product (NumPy broadcasting applies)

    Returns the linear activation of the affine map, i.e. the affine map itself.
    '''
    # affine map: inputs times weights, shifted by the bias
    pre_activation = np.matmul(x, w) + b
    return activation_function_linear(pre_activation)

## forward pass of the linear computational graph (weights only)
def computation_graph_linear_just_weight(x,w):
    ''' Forward pass of a one-layer linear network with no bias term: activation(x w).

    x: inputs, multiplied on the left of w with np.matmul
    w: weights

    Returns the linear activation of the matrix product, i.e. the product itself.
    '''
    # plain matrix product of inputs and weights; there is no bias to add
    pre_activation = np.matmul(x, w)
    return activation_function_linear(pre_activation)

## function that initializes the values of a computational graph
def create_computation_graph_linear(n_in,n_out,rng=None):
    ''' Create the parameters (weights and bias) of a linear computational graph.

    n_in:  number of input features (rows of w)
    n_out: number of outputs (columns of w, length of b)
    rng:   optional random generator exposing standard_normal(size), for example
           np.random.default_rng(seed). When None, the global NumPy random state is
           used, so np.random.seed(...) keeps controlling the result.

    Returns the tuple (w, b) with w of shape (n_in, n_out) and b of shape (n_out,).
    '''
    # np.random.standard_normal draws from the same global stream as np.random.randn,
    # so the default path yields the same values as a randn-based initialization.
    sampler = np.random if rng is None else rng

    # weights: Gaussian with mean 1 and standard deviation 1
    w = sampler.standard_normal((n_in, n_out)) + 1
    # bias: Gaussian with mean 0 and standard deviation 5
    b = sampler.standard_normal((n_out,)) * 5

    return w,b

## squared error loss of the weight-only linear model
def squared_loss_function_just_weight(x,t,w):
    ''' Element-wise squared error between the weight-only prediction x w and the targets t.

    No reduction is applied: the result keeps the broadcast shape of (prediction - t),
    so the caller decides how to sum or average it.
    '''
    prediction = activation_function_linear(np.matmul(x, w))
    residual = prediction - t
    return residual**2

def grad_squared_loss_just_weight(x,t,w):
    ''' Gradient with respect to w of the squared loss summed over datapoints (weight-only model).

    The per-datapoint term 2 * (prediction - t) * x is an element-wise product, so x and
    the residual must be broadcast-compatible. The terms are summed over axis 0 and that
    axis is kept with length 1 (keepdims).
    '''
    # forward pass: prediction of the weight-only linear model
    prediction = activation_function_linear(np.matmul(x, w))

    # backward pass: derivative of (prediction - t)**2 with respect to w, per datapoint
    residual = prediction - t
    per_datapoint_grad = 2 * residual * x

    # accumulate the contribution of every datapoint
    return np.sum(per_datapoint_grad, axis=0, keepdims=True)

In [9]:
## ====================================== ##
## ========== Gradient Descent ========== ##
## ====================================== ##

## resolution of the input domain used when drawing the fitted function
N_points_domain = 100
x_range = np.linspace(-1, 4, N_points_domain).reshape((N_points_domain, 1))

## size of the computational graph: one input feature, one output
n_in = 1
n_out = 1

## sweep of 500 candidate weights, each stored as its own (n_in, n_out) matrix
w_range = np.linspace(-10, 15, 500).reshape((500, n_in, n_out))

## squared loss of every datapoint under every candidate weight
loss_range = squared_loss_function_just_weight(x_data, t_data, w_range)

## add up the loss over datapoints (axis 1), then flatten both curves for plotting
loss_acc_range = np.squeeze(np.sum(loss_range, axis=1))
w_range = np.squeeze(w_range)

## starting weight and the total loss it produces
initial_w = np.array([9]).reshape(n_in, n_out)
initial_loss = np.sum(squared_loss_function_just_weight(x_data, t_data, initial_w))

## model output over the plotting domain at the starting weight
initial_function = computation_graph_linear_just_weight(x_range, initial_w)

# two panels: loss curve on the left, data and model on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 6))

# left panel: loss as a function of the weight, with a star at the starting point
ax1.plot(w_range, loss_acc_range, color='C0')
ax1.plot(np.squeeze(initial_w), initial_loss, '*', color='C1')
ax1.set(xlabel='Weight', ylabel='Loss', ylim=[-100, 900])

# right panel: observations and the function implemented by the initial weight;
# the loss caption is anchored at y = 13, above the upper y-limit of 10
ax2.plot(x_range, initial_function, color='C1')
ax2.plot(x_data, t_data, 'o', markersize=8, label='data observations')
ax2.text(1, 13, f"squared loss = {initial_loss:.2f}", fontsize=12, va='bottom', color="C1")
ax2.set(xlabel='tiempo', ylabel='cantidad de lluvia', ylim=[-20, 10])

# render the initial state and hold it briefly before the updates start
fig.canvas.draw()
fig.canvas.flush_events()
time.sleep(0.5)

## gradient descent parameters
lr = 0.01 # try 0.1, 0.01, 0.15, 0.21 to show: fast convergence, slow convergence, convergence with bumping, divergence
epochs = 100
## seconds each stage of the animated update stays on screen
step_pause = 3
## True:  stop after the first, fully annotated step (w is left at its initial value).
## False: apply the update and repeat for `epochs` steps; each step takes 4 * step_pause
##        seconds, so lower step_pause for long runs.
single_step_demo = True


def _refresh_and_wait(seconds):
    ''' Render everything added to the figure so far, then hold the frame for `seconds`.

    Artists added through Axes methods only reach the screen once the canvas is redrawn
    and its GUI events are flushed; sleeping alone leaves the window unchanged.
    '''
    fig.canvas.draw()
    fig.canvas.flush_events()
    time.sleep(seconds)


w = initial_w
for i in range(epochs):

    ## forward plus backward, and the weight gradient descent would move to
    grad_w = grad_squared_loss_just_weight(x_data,t_data,w)
    w_new = w - lr*grad_w

    ## function implemented by the current weight and its total loss
    function = computation_graph_linear_just_weight(x_range, w)
    loss = np.sum(squared_loss_function_just_weight(x_data,t_data,w))

    ## scalar views of the single-element arrays, for formatting and positioning.
    ## .item() is used because float() on an array with ndim > 0 is deprecated in NumPy.
    w_value = w.item()
    grad_value = grad_w.item()
    w_new_value = w_new.item()

    ## tangent line of the loss at the current weight: loss + grad_w * (w_range - w)
    gradient_function_w_at_current_w = grad_w * w_range + loss - grad_w * w

    # Clear previous data
    ax1.clear()
    ax2.clear()

    ## display loss function and its tangent at the current weight
    ax1.plot(w_range, loss_acc_range, color = 'C0', label = 'loss', zorder = 20)
    ax1.plot(w_range, np.squeeze(gradient_function_w_at_current_w), color = 'C2', label = 'gradient function', zorder = 20)
    ## display current weight and the arithmetic of the update
    ax1.plot(w_value, loss, '*', color = 'C1', label = 'current weight', zorder = 20)
    ax1.text(w_value + 2, loss , f"w = {w_value:.2f} \ngrad_w = {grad_value:.2f} \nw_new = {w_value:.2f} -{lr:.2f}*{grad_value:.2f} = {w_new_value:.2f}", fontsize=12, va='bottom', color = "C3" , zorder = 50)

    ax1.set_xlabel('Weight')
    ax1.set_ylabel('Loss')
    ax1.set_ylim([-100,900])
    ax1.legend()

    ax2.plot(x_range,function, color = 'C1')
    ax2.plot(x_data,t_data,'o', markersize = 8, label = 'data observations')
    ax2.text(1, 13, f"squared loss = {loss:.2f}", fontsize=12, va='bottom', color = "C1" )
    ax2.set_xlabel('tiempo')
    ax2.set_ylabel('cantidad de lluvia')
    ax2.set_ylim([-20,10])

    ## staged animation: each annotation is drawn, then held, before the next one is added
    _refresh_and_wait(step_pause)
    # 1) drop a vertical line from the current weight
    ax1.vlines(w_value, ymin=-100, ymax=loss, color='C3', linestyles='dotted')
    _refresh_and_wait(step_pause)
    # 2) horizontal segment covering the step from w to w_new
    ax1.hlines(y = loss, xmin=w_new_value, xmax=w_value, color='C3', linestyles='dotted')
    _refresh_and_wait(step_pause)
    # 3) vertical line at the updated weight
    ax1.vlines(w_new_value, ymin=-100, ymax=loss, color='C3', linestyles='dotted')
    _refresh_and_wait(step_pause)

    if single_step_demo:
        break

    ## update parameter with gradient descent
    w = w_new