In [None]:
# library containing all previously implemented optimization algorithms
# --> will be continuously expanded within this course
include("optimization_library.jl");
include("mplstyle.jl");

# Exercise 3.2a
Implement the Newton Method

In [None]:
"""
    newton_descent(f, df, Hf, x0; ls=true, eps=0.000001, maxiters=1000,
                   alpha=0.25, p=0.5)

Minimize `f` with Newton's method, starting from `x0`.

# Arguments
- `f`: objective function
- `df`: gradient of the objective function
- `Hf`: Hessian of the objective function
- `x0`: initial point (scalar or vector); it is not modified

# Keywords
- `ls`: if `true`, shorten the Newton step by backtracking line search
- `eps`: stopping criterion; the iteration stops once `λ^2 / 2 < eps`, where `λ` is
  the Newton decrement
- `maxiters`: maximum number of iterations
- `alpha`: sufficient-decrease constant used by the backtracking line search
- `p`: factor by which the step length is multiplied in every backtracking step

# Returns
The tuple `(x, trace)`. `x` is the last iterate. `trace` holds one column `[x; f(x)]`
for the initial point followed by one column per performed update.
"""
function newton_descent(f, df, Hf, x0; ls=true, eps=0.000001, maxiters=1000,
                        alpha=0.25, p=0.5)

    # make a copy of initial point to prevent changing x0 by manipulating x
    x = copy(x0)
    # store the trace of the descent path
    trace = [x; f(x)]

    for _ in 1:maxiters
        g = df(x)

        # 1. Newton step: solve Hf(x) * d_nt = -df(x) instead of inverting the Hessian
        d_nt = -(Hf(x) \ g)

        # 2. squared Newton decrement: λ^2 = df' * Hf^-1 * df = -df' * d_nt
        #    (sum of the elementwise product works for scalars and vectors alike)
        lambda_sq = -sum(g .* d_nt)
        # Stop on convergence. A non-finite or negative value (singular or indefinite
        # Hessian) also ends the iteration, because d_nt is then not a usable
        # descent direction.
        if !(isfinite(lambda_sq) && lambda_sq / 2 >= eps)
            break
        end

        # 3. backtracking line search: shrink the step until the sufficient-decrease
        #    condition f(x + s*d_nt) <= f(x) + alpha * s * df' * d_nt holds
        steplen = 1.0
        if ls
            fx = f(x)
            # the lower bound on steplen guarantees termination
            while steplen > 1e-12 && f(x + steplen * d_nt) > fx - alpha * steplen * lambda_sq
                steplen *= p
            end
        end

        # 4. update the point x (creates a new value, so x0 is never touched)
        x = x + steplen * d_nt
        trace = hcat(trace, [x; f(x)])
    end
    return x, trace
end;

# Exercise 3.2b: Marathon Training Reviewed

<img src="figures/laufbahn.jpeg" width="350">

Remember the Marathon training from Exercise 2 where we found the optimal model for the given data by applying gradient descent. Now we want to compare with the result from the Newton Method.

We still assume a linear model with velocity $v$ scaling linearly with time $t$:
\begin{equation}
    p(t) = vt
\end{equation}

In [None]:
# Measurements recorded during the marathon training

# measurement times [min]
t = collect(10:10:120)

# covered distances [km]
d = [
    1.88, 4.47, 5.63, 8.13, 8.54, 11.23,
    12.27, 14.23, 15.50, 16.93, 18.69, 21.31,
];

# Linear model p(t) = v * t.
# The times t are given in minutes while the velocity v is meant in km/h, so the
# velocity is first converted to km/min before it is multiplied with the times.
function m(t, v)
    v_per_minute = v / 60
    return v_per_minute .* t
end

In [None]:
# Quadratic loss of the linear model: sum of the squared differences between the
# measured distances d and the model prediction m(t, v) for a velocity v
function loss(v)
    residuals = d - m(t, v)
    return sum(residuals .^ 2)
end

# Derivative of the loss with respect to the velocity v, assembled with the chain rule
function dloss(v)
    # derivative of the squared residuals with respect to the residuals
    residual_term = 2 .* (d .- m(t, v))
    # derivative of the residuals d - t*v/60 with respect to v
    model_slope = -(t ./ 60)
    # the row-times-column product sums the contributions of all data points
    return model_slope' * residual_term
end

# Hessian (second derivative) of the loss function.
# loss(v) = sum((d - t*v/60).^2) is quadratic in v, so the second derivative is the
# constant 2 * sum((t/60).^2); the argument v is accepted only to match the
# Hf(x) calling convention of newton_descent.
H_loss(v) = 2 * sum(abs2, t ./ 60)

In [None]:
# starting guess for the velocity
x0 = 1

# Result from Gradient Descent
# NOTE(review): gradient_descent is not defined in this notebook; presumably it is
# provided by optimization_library.jl, which also defines the meaning of `p` - confirm
result_grd,trace_grd = gradient_descent(loss,dloss, x0,maxiters = 10000, eps = 0.0001, p = 0.5)
println("Optimal velocity [km/h]: ",result_grd)
# last row, last column of the trace
# NOTE(review): assumes the trace stores [x; f(x)] per column, like newton_descent - confirm
println("Optimal loss: ",trace_grd[end,end])
println("Gradient at optimum: ",dloss(result_grd))
# number of trace columns
# NOTE(review): if the trace also contains the initial point, this is iterations + 1 - confirm
println("Iterations: ", size(trace_grd,2))

In [None]:
# Result of Newton Method
# ==========================================================
# Perform the minimization using the Newton method.
# Print the optimal parameters and the number of Iterations.
# ==========================================================

In [None]:
# Test if the results from gradient descent and Newton method are the same
# ==========================================================
# uncomment the following line
#@assert abs(result_nwt - result_grd) < 0.0001
# ==========================================================

In [None]:
# plot the trace of the two algorithms

# velocities [km/h] at which the loss function is evaluated
test_velocities = range(1, 15, length = 100)

# evaluate the loss function for every test velocity
losses = [loss(v) for v in test_velocities]

# x and y are passed as one-dimensional sequences of equal length; transposing the
# velocities would turn them into a 1x100 row that no longer matches the 100 losses
plot(test_velocities, losses, color = "blue")
# first trace row: visited velocities, second trace row: corresponding loss values
plot(trace_grd[1,:],trace_grd[2,:], marker = "o", color = "green", label = "Gradient descent")

# =============================================================================================
# add the trace of the Newton Method to the plot
# =============================================================================================

xlabel("v [km/h]")
ylabel(L"$l(v)$")

legend()
title("Fit of linear Model")

# Exercise 3.2c

The linear model contains only one parameter to be estimated. However, the performance of gradient descent and the Newton method differs most strongly on high-dimensional problems, i.e. when the model contains a large number of parameters. Hence, we use the dataset from the marathon training and perform a fit of a higher-order polynomial (irrespective of whether this makes sense for the data or not). We are interested to see how many iterations each method needs to reach the optimal combination of parameters $\theta_j$.

A polynomial of degree $d$ is given by:
\begin{equation}
P_d(\mathbf{t}) = \sum_{j = 0}^d \theta_j \mathbf{t}^j
\end{equation}
Note that $\mathbf{t}$ is a vector containing all measured times during the marathon training. In this case $\mathbf{t}^j$ is the vector resulting from evaluating the $j$-th power of each element in $\mathbf{t}$:
\begin{equation}
\mathbf{t}^j = (t_1^j, t_2^j, \dots, t_n^j)
\end{equation}

In [None]:
# Polynomial model of arbitrary degree: evaluates sum_j theta[j+1] * (t/60)^j
function model_poly(t, theta)
    # number of coefficients; the polynomial degree is one less
    n_coeffs = size(theta, 1)
    # exponents 0, 1, ..., degree arranged as a row
    exponents = collect(0:n_coeffs-1)'
    # broadcasting the scaled times against the row of exponents gives, for a vector t,
    # one row per time and one column per power
    powers = (t ./ 60) .^ exponents
    # weight every power with its coefficient and sum them up
    return powers * theta
end

# Quadratic loss between the data points and the polynomial model:
# sum of the squared residuals d - model_poly(t, theta)
loss_poly(theta) = sum(abs2, d - model_poly(t, theta))


# Gradient of the quadratic loss of the polynomial model.
# With X[i, j+1] = (t[i]/60)^j and the residuals r = d - X*theta, the loss is
# sum(r.^2) and its gradient with respect to theta is -2 * X' * r.
function dloss_poly(theta)
    # get the number of parameters of the polynomial model
    n = size(theta, 1)
    # matrix of powers of the times; the times are divided by 60 to be consistent
    # with model_poly
    X = (t ./ 60) .^ collect(0:n-1)'
    residuals = d - X * theta
    return -2 .* (X' * residuals)
end

# Hessian of the quadratic loss of the polynomial model.
# With X[i, j+1] = (t[i]/60)^j the loss is quadratic in theta, so the Hessian is the
# constant matrix 2 * X' * X; theta is only used to determine the number of parameters.
function H_loss_poly(theta)
    # get the number of parameters of the polynomial model
    n = size(theta, 1)
    # matrix of powers of the times; the times are divided by 60 to be consistent
    # with model_poly
    X = (t ./ 60) .^ collect(0:n-1)'
    return 2 .* (X' * X)
end

In [None]:
# define the number of parameters in the polynomial model
# (n_theta = 3 corresponds to a polynomial of degree 2)
n_theta = 3

# create an array of random numbers to get a starting point
# (entries lie in [0, 10); this rebinds x0, which held the scalar start value of the
# linear model, and the start point differs on every run because no seed is set)
x0 = 10*rand(n_theta)

# Result from Gradient Descent
# ===============================================================================
# uncomment the following lines
# (they require loss_poly and dloss_poly to be implemented first)
#result_grd2,trace_grd2 = gradient_descent(loss_poly,dloss_poly, x0, maxiters = 10000, eps = 0.0001, p = 0.5)
#println("Minimizer: ",result_grd2)
#println("Optimal loss: ",trace_grd2[end,end])
#println("Gradient at optimum: ",dloss_poly(result_grd2))
#println("Iterations: ", size(trace_grd2,2))
# ===============================================================================

In [None]:
# Result of Newton Method
# ==========================================================
# Perform the minimization of the polynomial loss using the Newton method.
# Print the optimal parameters and the number of Iterations.
# ==========================================================

In [None]:
# Plot linear and higher order polynomial models in order to compare to data points

# create more points for which the model can be plotted in order to get a smooth curve
# NOTE(review): neither step nor length is given; presumably this yields the integer
# minutes 0, 1, ..., 140 - confirm for the Julia version in use
times_plot = range(0,stop = 140)
    
# plot data
# (zorder = 3 is meant to draw the measurements on top of the model curves)
scatter(t,d,color = "black",zorder=3, label = "data")

# plot models
# linear model evaluated with the velocity found by gradient descent
plot(times_plot,m(times_plot,result_grd),color = "red",label="linear model")

# ===============================================================================
# add the curve of the best fitting polynomial model evaluated by the newton method
# ===============================================================================

xlabel("t [min]")
ylabel("d [km]")

legend(loc = "upper left");