---
# JuMP.jl

In [None]:
"""
    ncmjump(U, H; method=:PG, accelerate=true, τ=1.0, kmax=30, verbose=true)

Run `kmax` iterations of a proximal-gradient-type scheme for the loss
`f(X) = 0.5*norm(H .* (X .- U))^2` over matrices `X` with `diag(X) .== 1` and
`X` in the PSD cone. Every iteration minimizes a quadratic model of `f` around
the extrapolation point `Y` as a JuMP model solved with CSDP.

# Arguments
- `U`, `H`: symmetric matrices of identical size (target and elementwise weights).

# Keywords
- `method`: with `:IR` the extrapolation point is additionally shifted by
  `-(t/tnew*τ/L) * V`, where `V` is the subproblem residual; any other value uses
  the plain extrapolation.
- `accelerate`: if `true`, `t` follows `tnew = (1 + √(1 + 4t^2))/2`; otherwise
  `tnew = 1.0`, so `Y` is simply the latest iterate (unless `method == :IR`).
- `τ`: scales the curvature of the quadratic model, which uses `L/(2τ)`.
- `kmax`: number of outer iterations (there is no early termination).
- `verbose`: print one line of diagnostics per iteration.

# Returns
`(Xnew, y, fvals)`: the last iterate (rescaled to unit diagonal), the duals of
the diagonal constraints from the last subproblem, and the loss values, starting
with the loss at the zero matrix (`kmax + 1` entries).

# Throws
`ErrorException` if `U` or `H` is not symmetric or their sizes differ.
"""
function ncmjump(U, H; method=:PG, accelerate=true, τ=1.0, kmax=30, verbose=true)

    issymmetric(U) || error("U must be symmetric")
    issymmetric(H) || error("H must be symmetric")
    # Same guard as ncm2: fail early instead of inside a later broadcast
    size(U) == size(H) || error("U and H must be the same size")

    n = size(U, 1)
    H2 = Symmetric(H.*H)

    # Scratch buffer; only ever written and read inside a single norm(...) call
    M = Symmetric(zeros(n,n))

    # Loss function and gradient (f overwrites the scratch buffer M)
    f(X) = 0.5*norm(M.data .= H.*(X .- U))^2
    ∇f(X) = Symmetric(H2.*(X .- U))

    # Lipschitz constant of ∇f
    L = norm(H2)

    # Quadratic model of f at Y (note: L/2τ parses as L/(2τ))
    Q(X,Y,L,τ) = f(Y) + dot(∇f(Y), X - Y) + L/2τ*dot(X - Y, X - Y)

    # Initialize model with correlation matrix constraints
    m = Model(with_optimizer(CSDP.Optimizer))
    #m = Model(with_optimizer(COSMO.Optimizer))
    MOI.set(m, MOI.Silent(), true)
    @variable(m, X[1:n,1:n], Symmetric)
    @constraint(m, diagcon, diag(X) .== 1)
    @constraint(m, psdcon, X in PSDCone())

    # Start from the zero matrix
    #X0 = Symmetric(diagm(ones(n)))
    X0 = Symmetric(zeros(n,n))
    fvals = [f(X0)]

    y = zeros(n)
    Λ = Symmetric(zeros(n,n))
    Γ = copy(Λ)
    V = copy(Λ)
    ∇fY = copy(Λ)
    Y = copy(X0)
    Xold = copy(X0)
    Xnew = copy(X0)
    t = 1.0

    for k = 1:kmax

        # Solve quadratic subproblem.
        # NOTE(review): the objective is first reset to a feasibility objective,
        # presumably to discard the previous quadratic model — confirm it is needed.
        @objective(m, MOI.FEASIBILITY_SENSE, 0)
        @objective(m, Min, Q(X,Y,L,τ))
        optimize!(m)
        Xnew.data .= value.(X)

        # Ensure that diag(Xnew).==1 exactly by a symmetric diagonal rescaling
        d = diag(Xnew)
        for i=1:n
            d[i] = 1/sqrt(Xnew[i,i])
        end
        for j=1:n
            for i=1:n
                Xnew.data[i,j] = d[i]*d[j]*Xnew.data[i,j]
            end
        end

        # Evaluate loss function
        fnew = f(Xnew)
        push!(fvals, fnew)

        # Obtain dual multipliers
        y .= dual.(diagcon)
        copy!(Λ, dual.(psdcon))

        # Compute subgradient Γ ∈ ∂g(X; ε)
        Γ.data .= -Λ .- Diagonal(y)

        # Compute dual residual
        ∇fY.data .= H2.*(Y .- U)
        V.data .= ∇fY .+ (L/τ).*(Xnew .- Y) .+ Γ

        # Primal residual (diagonal constraint) and dual residual (stationarity)
        for i=1:n
            d[i] = 1 - Xnew[i,i]
        end
        Rp = norm(d)/(1 + √n)
        Rd = norm(M.data .= H2.*(Xnew .- U) .- Diagonal(y) .- Λ)

        ε = dot(Xnew, Λ)
        δ = norm(V)
        dist = norm(M.data .= Xnew .- Y)

        if verbose
            # Repeat the header every 20 iterations
            mod(k, 20)==1 &&
            @printf("%4s %10s %10s %10s %10s %10s %10s\n", 
                "k", "f(X)", "Rp", "Rd", "<X,Λ>", "||V||", "||X-Y||")
            @printf("%4d %10.2e %10.2e %10.2e %10.2e %10.2e %10.2e\n", 
                k, fvals[end], Rp, Rd, ε, δ, dist)
        end

        #=
        if method==:IR
            (τ*δ)^2 + 2τ*ε*L ≤ (1-τ)*(L*dist)^2 || 
                @warn("(τ*δ)^2 + 2τ*ε*L ≤ (1-τ)*(L*dist)^2 fails")
        end
        =#

        # Update Y, Xold, t
        if accelerate
            tnew = (1 + √(1 + 4t^2))/2
        else
            tnew = 1.0
        end

        if method==:IR
            Y.data .= Xnew .- (t/tnew*τ/L).*V .+ ((t - 1)/tnew).*(Xnew .- Xold)
        else
            Y.data .= Xnew .+ ((t - 1)/tnew).*(Xnew .- Xold)
        end
        Xold .= Xnew
        t = tnew
    end

    return Xnew, y, fvals
end

# Demo: 20 iterations of ncmjump on a 6×6 instance produced by `randncm`,
# with all other settings at their defaults (progress is printed).
n = 6
(U, H) = randncm(n)
ncmjump(U, H; kmax=20);

In [None]:
# Compare the ncmjump variants on one 6×6 instance produced by `randncm`.
n = 6
(U, H) = randncm(n)
kmax = 50
verbose = false

# Default method without / with acceleration, then method=:IR for three values of τ
@time X0, y0, fvals0 = ncmjump(U, H; accelerate=false, kmax=kmax, verbose=verbose)
@time X,  y,  fvals  = ncmjump(U, H; accelerate=true, kmax=kmax, verbose=verbose)
@time X1, y1, fvals1 = ncmjump(U, H; method=:IR, τ=0.1, kmax=kmax, verbose=verbose)
@time X2, y2, fvals2 = ncmjump(U, H; method=:IR, τ=0.5, kmax=kmax, verbose=verbose)
@time X3, y3, fvals3 = ncmjump(U, H; method=:IR, τ=0.9, kmax=kmax, verbose=verbose)

# Loss per outer iteration on a log axis; index 0 is the loss at the starting point
plot(;
    yaxis=:log,
    legend=:topright,
    size=(900,600),
    title="subproblem solver: JuMP with CSDP",
)
plot!(0:kmax, fvals0; label="PG", linestyle=:auto)
plot!(0:kmax, fvals; label="APG", linestyle=:auto)
plot!(0:kmax, fvals1; label=L"\tau = 0.1", markershape=:auto)
plot!(0:kmax, fvals2; label=L"\tau = 0.5", markershape=:auto)
plot!(0:kmax, fvals3; label=L"\tau = 0.9", markershape=:auto)
xlabel!(L"k"); ylabel!(L"f(x_k)")

---
# Optim.jl

In [None]:
"""
    ncm2(U, H, myproj; τ=1.0, t0=1.0, f_calls_limit=1000, kmax=300, tol=1e-2,
         g_tol=1e-2, algo=GradientDescent(), method=:PG, accelerate=true, verbose=true)

Outer extrapolated iteration for the loss `0.5*norm(H .* (X .- U))^2` in which each
subproblem is solved through its dual in `y` with Optim.jl (`Optim.only_fg!`).

# Arguments
- `U`, `H`: symmetric matrices of identical size (target and elementwise weights).
- `myproj`: callable applied in place to a `Symmetric` matrix; it must also expose
  `myproj.w` and `myproj.m[]`, read here as `myproj.w[1:myproj.m[]]`.
  NOTE(review): presumably a PSD projection that stores its retained eigenvalues
  there — confirm against `ProjPSD`.

# Keywords
- `τ`: scales the proximal step `τ/L` and appears in the inner stopping test.
- `t0`: initial value of the extrapolation parameter `t`.
- `f_calls_limit`: the outer loop stops once this many dual evaluations were recorded.
- `kmax`: maximum number of outer iterations.
- `tol`: the outer loop stops once `max(Rp, Rd) ≤ tol`.
- `g_tol`: gradient tolerance forwarded to `Optim.Options`.
- `algo`: Optim.jl algorithm used for the dual subproblem.
- `method`, `accelerate`: accepted but not read anywhere in the body; the update
  always uses the accelerated form that includes the residual `V`.
- `verbose`: print one line of diagnostics per outer iteration.

# Returns
`(Xnew, y, fvals)`: the unit-diagonal iterate from the most recent dual evaluation,
the dual vector, and the loss recorded at every dual evaluation.

# Throws
`ErrorException` if `U` or `H` is not symmetric or their sizes differ.
"""
function ncm2(U, H, myproj; 
        τ=1.0, 
        t0=1.0,
        f_calls_limit=1000, 
        kmax=300,
        tol=1e-2,
        g_tol=1e-2, 
        algo=GradientDescent(), 
        method=:PG, 
        accelerate=true, 
        verbose=true,
    )

    # Check for valid input
    issymmetric(U) || error("U must be symmetric")
    issymmetric(H) || error("H must be symmetric")
    size(U) == size(H) || error("U and H must be the same size")

    n = size(U, 1)

    H2 = Symmetric(H.*H)

    # Lipschitz constant of ∇f
    L = norm(H2)

    # Memory allocation (all buffers below are shared with the closures)
    y = zeros(n)
    d = zeros(n)
    M = Symmetric(zeros(n,n))
    #Y = Symmetric(triu(zeros(n,n),1) + I)
    Y = copy(M)
    Xold = copy(M)
    Xnew = copy(M)
    Λ = copy(M)
    Γ = copy(M)
    V = copy(M)
    X = copy(M)
    ∇fY = copy(M)
    fvals = Float64[]

    # Evaluates dual objective function and its gradient.
    # Side effects: refreshes X, Xnew, Λ, Γ, V and appends the loss to fvals.
    function dualobj!(ff, gg, y)
        ∇fY.data .= H2.*(Y .- U)
        M.data .= Y .- (τ/L).*(∇fY .+ Diagonal(y))
        X .= M
        myproj(X)

        # Update Λ and Γ
        Λ.data .= (L/τ).*(X .- M)
        Γ.data .= Diagonal(y) .- Λ

        # Ensure that diag(Xnew).==1 exactly
        for i=1:n
            d[i] = 1/sqrt(X[i,i])
        end
        for j=1:n
            for i=1:n
                Xnew.data[i,j] = d[i]*d[j]*X[i,j]
            end
        end

        # Loss at the rescaled iterate (M is reused as scratch)
        M.data .= H.*(Xnew .- U)
        push!(fvals, 0.5*dot(M,M))

        V.data .= ∇fY .+ (L/τ).*(Xnew .- Y) .+ Γ

        if gg !== nothing
            for i=1:n
                gg[i] = 1 - X[i,i]
            end
        end

        if ff !== nothing
            normW = norm(view(myproj.w, 1:myproj.m[]))
            # L/2τ parses as L/(2τ)
            return sum(y) + (L/2τ)*(normW^2 - dot(Y,Y))
        end
        return nothing
    end

    # The callback function: returns true once the inexactness test holds
    function cb(os)
        # Compute ε and δ
        ε = max(0.0, dot(Xnew, Λ))
        δ = norm(V)
        dist = norm(M.data .= Xnew .- Y)

        stop_optimize = false
        if (τ*δ)^2 + 2τ*ε*L ≤ (1-τ)*(L*dist)^2
            stop_optimize = true
        end
        return stop_optimize
    end

    # Define subproblem
    prob = Optim.only_fg!(dualobj!)
    opts = Optim.Options(
            g_tol=g_tol, 
            store_trace=true, 
            extended_trace=true, 
            show_trace=false, 
            callback=cb)

    Rp = Rd = Inf
    k = 0
    t = t0
    while max(Rp, Rd) > tol && k < kmax && length(fvals) < f_calls_limit
        k = k + 1

        res = optimize(prob, y, algo, opts)

        gnorm = Optim.g_norm_trace(res)[end]

        y .= res.minimizer

        # Compute ε and δ
        ε = dot(Xnew, Λ)
        δ = norm(V)
        dist = norm(M.data .= Xnew .- Y)

        for i=1:n
            d[i] = 1 - Xnew[i,i]
        end
        Rp = norm(d)/(1 + √n)
        # Stationarity residual ∇f(Xnew) + Γ with Γ = Diagonal(y) - Λ as in dualobj!.
        # y enters with a plus sign here (unlike a JuMP dual), so it must be added.
        Rd = norm(M.data .= H2.*(Xnew .- U) .+ Diagonal(y) .- Λ)

        if verbose
            # Repeat the header every 20 iterations
            mod(k, 20)==1 &&
            @printf("%4s %8s %8s %10s %10s %10s %10s %10s %10s %10s %10s\n", 
                "k", "f_calls", "g_calls", "||g||", "g_tol", "f(X)", "Rp", "Rd", "<X,Λ>", "||V||", "||X-Y||")
            @printf("%4d %8d %8d %10.2e %10.2e %10.2e %10.2e %10.2e %10.2e %10.2e %10.2e\n", 
                k, res.f_calls, res.g_calls, gnorm, g_tol, fvals[end], Rp, Rd, ε, δ, dist)
        end

        # Update Y, Xold, t
        tnew = (1 + √(1 + 4t^2))/2
        Y.data .= Xnew .- (t/tnew*τ/L).*V .+ ((t - 1)/tnew).*(Xnew .- Xold)
        Xold .= Xnew
        t = tnew
    end

    return Xnew, y, fvals
end

# Demo: ncm2 on a 6×6 instance from `randncm`, solving the dual subproblems with
# BFGS + Hager–Zhang line search and stopping after 200 recorded evaluations.
n = 6
myproj = ProjPSD(n)
(U, H) = randncm(n)
@time X, y, fvals = ncm2(
    U, H, myproj;
    algo=BFGS(linesearch=HagerZhang()),
    g_tol=1e-4,
    τ=1.0,
    f_calls_limit=200,
);

In [None]:
#InitialStatic()
#InitialPrevious()
#InitialQuadratic()
#InitialConstantChange(αmin = 1e-12, αmax = 1.0, α0 = 1.0, ρ = 0.25, snap2one = (0.75, Inf))
#InitialHagerZhang()

In [None]:
# Sweep τ for ncm2 on a 100×100 instance from `randncm` and plot the recorded
# loss against the number of dual evaluations.
n = 100
myproj = ProjPSD(n)
(U, H) = randncm(n)
X = copy(U)

f_calls_limit = 1000
tol = 1e-1

alphaguess = InitialStatic(alpha=1.0)

plt = plot(;
    yaxis=:log,
    legend=:topright,
    size=(900,600),
    title="subproblem solver: Optim.jl",
)

# The first iterator varies fastest, so τ is the innermost loop, then g_tol,
# then the line search, then the algorithm.
for (τ, g_tol, ls, algo) in Iterators.product(
        0.85:0.05:0.95,
        [0.0],
        [BackTracking],  # others: BackTracking, StrongWolfe, HagerZhang, MoreThuente, Static
        [BFGS],          # others: BFGS, LBFGS, ConjugateGradient, GradientDescent
    )
    label = "\\tau=$τ"
    println(label)
    @time X, y, fvals = ncm2(
        U, H, myproj;
        τ=τ,
        f_calls_limit=f_calls_limit,
        tol=tol,
        g_tol=g_tol,
        algo=algo(linesearch=ls(), alphaguess=alphaguess),
        verbose=false,
    )
    plot!(1:length(fvals), fvals; label=label)
end
xlabel!("function evaluations"); ylabel!("objective function value")

---
# LineSearches.jl

In [None]:
# Global state for stepping through the dual line search by hand on a 6×6
# instance from `randncm`.
n = 6
(U, H) = randncm(n)

H2 = Symmetric(H .* H)

# Loss function and gradient
f(X) = 0.5 * norm(H .* (X .- U))^2
∇f(X) = Symmetric(H2 .* (X .- U))

# Lipschitz constant of ∇f
L = norm(H2)

# Iteration counter, proximal parameter, current step length, objective flag
k = 0
τ = 1.0
α = 1.0
ff = 0.0

# Dual iterate y, gradient buffers g and gg, search direction d
y = zeros(n)
g = zeros(n)
gg = zeros(n)
d = similar(g)

# Work matrix handed to dualobj! and the (fixed) extrapolation point Y
M = Symmetric(zeros(n, n))
Y = copy(M)

myproj = ProjPSD(n)

"""
    dualobj!(ff, gg, y, M)

Evaluate the dual objective and/or its gradient at `y`, following the
`(F, G, x)` calling convention in which `nothing` means "not requested".

Reads the globals `Y`, `τ`, `L`, `∇f` and `myproj`. `M` is overwritten with
`Y - (τ/L)*(∇f(Y) + Diagonal(y))` and then passed to `myproj`, which is expected
to modify it in place. If `gg` is not `nothing` it receives `1 .- diag(M)`.
If `ff` is not `nothing` the objective value is returned; otherwise `nothing`.

NOTE(review): `myproj.w[1:myproj.m[]]` is presumably the set of eigenvalues kept
by the projection — confirm against `ProjPSD`.
"""
function dualobj!(ff, gg, y, M)
    copy!(M.data, Y .- (τ/L).*(∇f(Y) .+ Diagonal(y)))
    myproj(M)
    # Identity comparison is the idiomatic (and cheapest) test against nothing
    if gg !== nothing
        copy!(gg, 1 .- diag(M))
    end
    if ff !== nothing
        # A view avoids copying the slice just to take its norm
        normW = norm(view(myproj.w, 1:myproj.m[]))
        # L/2τ parses as L/(2τ)
        return sum(y) + (L/2τ)*(normW^2 - dot(Y,Y))
    end
    return nothing
end

In [None]:
# One steepest-descent step on the dual with a StrongWolfe line search.
# Re-running this cell performs the next step, since it updates k, α and y.
k += 1

# Objective and gradient at the current y; search along the negative gradient
dobj = dualobj!(ff, g, y, M)
d .= -g
@show norm(g)

# Restriction of the dual objective to the ray y + t*d, and its derivative
function ϕ(t)
    return dualobj!(ff, nothing, y .+ t .* d, M)
end
function dϕ(t)
    dualobj!(nothing, gg, y .+ t .* d, M)
    return dot(d, gg)
end
ϕdϕ(t) = (ϕ(t), dϕ(t))

# The previous step length is the initial guess for this search
α0 = α
ϕ0, dϕ0 = ϕdϕ(0.0)
α, ϕα = StrongWolfe()(ϕ, dϕ, ϕdϕ, α0, ϕ0, dϕ0)

xticks = [0.0, α, α0, 2α0]
yticks = [ϕ0, ϕ(α), ϕ(α0)]

# ϕ in blue, tangent at 0 in red, tangent at the accepted step in black
plt = plot(ϕ, 0.0, α + 10.0;
    c=:blue, title="Iter. $k", legend=false, xticks=xticks, yticks=yticks)
plot!(t -> ϕ0 + t*dϕ0, 0.0, α + 5.0; c=:red, s=:dash)
plot!(t -> ϕ(α) + (t - α)*dϕ(α), α, α + 5.0; c=:black, s=:dash)
scatter!([0.0], [ϕ0]; c=:red, ms=3)
scatter!([α], [ϕ(α)]; c=:black, ms=3)
display(plt)

# Take the step
y .+= α .* d;