Cleanup ridge auto tuning #162

Merged (1 commit, Dec 3, 2021)
134 changes: 64 additions & 70 deletions docs/src/examples/autotuning-ridge.jl
# computed with DiffOpt, we can perform a gradient descent on top of the inner model
# to minimize the test loss.
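# In other words, this is a small bilevel problem: the inner problem fits the ridge
# weights for a fixed regularization parameter ``\alpha``, and the outer problem adjusts
# ``\alpha`` to reduce the loss on held-out data. A sketch of the formulation, using the
# same normalization constants as the code below (``w^\star(\alpha)`` denotes the inner
# ridge solution):
# ```math
# \min_{\alpha} \; \frac{1}{2 N_{test} D} \lVert X_{test} \, w^\star(\alpha) - y_{test} \rVert^2
# \quad \text{s.t.} \quad
# w^\star(\alpha) \in \arg\min_{w} \;
#     \frac{1}{2 N_{train} D} \lVert X_{train} \, w - y_{train} \rVert^2
#     + \frac{\alpha}{2 D} \lVert w \rVert^2
# ```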

# This tutorial uses the following packages

using JuMP # The mathematical programming modelling language
import DiffOpt # JuMP extension for differentiable optimization
import OSQP # Optimization solver that handles quadratic programs
import Plots # Graphing tool
import LinearAlgebra: norm, dot
import Random

# ## Generating a noisy regression dataset

Random.seed!(42)

N = 100
D = 20
noise = 5

w_real = 10 * randn(D)
X = 10 * randn(N, D)
y = X * w_real + noise * randn(N)
l = N ÷ 2 # train-test split index

X_train = X[1:l, :]
X_test = X[l+1:N, :]
y_train = y[1:l]
y_test = y[l+1:N];

# ## Defining the regression problem

# We implement the regularized regression problem as a function taking the problem data,
# building a JuMP model and solving it.

function fit_ridge(model, X, y, α)
    JuMP.empty!(model)
    set_silent(model)
    N, D = size(X)
    @variable(model, w[1:D])
    err_term = X * w - y
    ## Objective: scaled mean squared error plus the L2 penalty weighted by α
    @objective(
        model,
        Min,
        dot(err_term, err_term) / (2 * N * D) + α * dot(w, w) / (2 * D),
    )
    optimize!(model)
    @assert termination_status(model) == MOI.OPTIMAL
    return w
end
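
# As a quick sanity check, we can fit the weights once for a single value of α and
# inspect them (an illustrative one-off; `check_model` and `w_check` are throwaway names,
# separate from the sweep below):

check_model = Model(() -> DiffOpt.diff_optimizer(OSQP.Optimizer))
w_check = fit_ridge(check_model, X_train, y_train, 0.1)
value.(w_check)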

# We can solve the problem for several values of α
# to visualize the effect of regularization on the testing and training loss.

αs = 0.05:0.01:0.35
mse_test = Float64[]
mse_train = Float64[]
model = Model(() -> DiffOpt.diff_optimizer(OSQP.Optimizer))
(Ntest, D) = size(X_test)
(Ntrain, D) = size(X_train)
for α in αs
    w = fit_ridge(model, X_train, y_train, α)
    ŵ = value.(w)
    ŷ_test = X_test * ŵ
    ŷ_train = X_train * ŵ
    push!(mse_test, norm(ŷ_test - y_test)^2 / (2 * Ntest * D))
    push!(mse_train, norm(ŷ_train - y_train)^2 / (2 * Ntrain * D))
end
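
# As a reference point for the gradient-based tuning further below, the best α on this
# grid is simply the one minimizing the test MSE (an illustrative shortcut on top of the
# sweep):

αs[argmin(mse_test)]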

# Visualize the mean squared error metric

Plots.plot(
    αs, mse_test ./ sum(mse_test),
    label="MSE test", xaxis = "α", yaxis="MSE", legend=(0.8, 0.2)
)
Plots.plot!(
    αs, mse_train ./ sum(mse_train),
    label="MSE train"
)
Plots.title!("Normalized MSE on training and testing sets")

# ## Leveraging differentiable optimization: computing the derivative of the solution

# Using DiffOpt, we can compute `∂w_i/∂α`, the derivative of the learned solution `ŵ`
# w.r.t. the regularization parameter α.

function compute_dw_dα(model, w)
    D = length(w)
    dw_dα = zeros(D)
    ## The objective is linear in α with coefficient dot(w, w) / (2D),
    ## so that expression is passed as the forward-mode perturbation of the objective.
    MOI.set(
        model,
        DiffOpt.ForwardInObjective(),
        dot(w, w) / (2 * D),
    )
    DiffOpt.forward(model)
    for i in 1:D
        ## (loop body collapsed in the diff view: it stores the forward sensitivity of w[i] in dw_dα[i])
    end
    return dw_dα
end

# Using `∂w_i/∂α` computed with `compute_dw_dα`,
# we can compute the derivative of the test loss w.r.t. the parameter α
# by composing derivatives.
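# In terms of the test loss used below,
# ``L_{test}(\alpha) = \frac{1}{2 N D} \lVert X_{test} \, \hat{w}(\alpha) - y_{test} \rVert^2``
# with `(N, D) = size(X_test)`, the chain rule reads (a sketch in the notation of the
# functions above):
# ```math
# \frac{\partial L_{test}}{\partial \alpha}
#   = \sum_i \frac{\partial L_{test}}{\partial w_i} \, \frac{\partial w_i}{\partial \alpha}
#   = \frac{1}{N D} \sum_j \left( X_{test} \, \hat{w} - y_{test} \right)_j
#     \left( X_{test} \, \frac{\partial w}{\partial \alpha} \right)_j
# ```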

function d_testloss_dα(model, X_test, y_test, w, ŵ)
    N, D = size(X_test)
    dw_dα = compute_dw_dα(model, w)
    err_term = X_test * ŵ - y_test
    return sum(eachindex(err_term)) do i
        dot(X_test[i,:], dw_dα) * err_term[i]
    end
    ## (the remaining lines of this function are collapsed in the diff view)
end

# We can then run a simple fixed-step gradient descent on α, using this derivative to
# drive the test loss down.

function descent(α0, max_iters=100; fixed_step = 0.01, grad_tol=1e-3)
    α_s = Float64[]
    test_loss = Float64[]
    α = α0
    N, D = size(X_test)
    model = Model(() -> DiffOpt.diff_optimizer(OSQP.Optimizer))
    for iter in 1:max_iters
        w = fit_ridge(model, X_train, y_train, α)
        ŵ = value.(w)
        err_term = X_test * ŵ - y_test
        push!(α_s, α)
        push!(test_loss, norm(err_term)^2 / (2 * N * D))
        ∂α = d_testloss_dα(model, X_test, y_test, w, ŵ)
        α -= fixed_step * ∂α
        ## Stop once the gradient of the test loss w.r.t. α is small enough
        if abs(∂α) ≤ grad_tol
            break
        end
    end
    return α_s, test_loss
end

ᾱ_l, msē_l = descent(0.10, 500);
ᾱ_r, msē_r = descent(0.33, 500);
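
# The last entries of `ᾱ_l` and `ᾱ_r` are the values of α reached from each starting point:

(ᾱ_l[end], ᾱ_r[end])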

# Visualize gradient descent and convergence

Plots.plot(
    αs, mse_test,
    label="MSE test", xaxis = ("α"), legend=:topleft
)
Plots.plot!(ᾱ_l, msē_l, label="learned α, start = 0.10", lw = 2)
Plots.plot!(ᾱ_r, msē_r, label="learned α, start = 0.33", lw = 2)
Plots.title!("Regularizer learning")