In [7]:
const chol_lib = "./chol.so"

"""
    call_chol(D, V, w, m, n) -> (u, run_time)

Invoke the C function `chol` from the shared library `chol_lib` and return the two
output buffers it fills.

# Arguments
- `D`: `n`-by-`n` matrix (the notebook driver passes a dense diagonal matrix).
- `V`: `n`-by-`m` matrix.
- `w`: vector of length `n`.
- `m`, `n`: dimensions forwarded to C as `Cint`.

# Returns
- `u`: length-`n` `Vector{Float64}` written by the C routine. The driver in this notebook
  compares it with the solution of `(D + V*V') * u = w`; the C source is not visible
  here, so treat that as the presumed contract.
- `run_time`: 1-element `Vector{Float64}` written by the C routine; the driver prints it
  as seconds.

# Throws
- `DimensionMismatch` if the array sizes disagree with `m` and `n`.
"""
function call_chol(
    D::Array{Float64,2}, V::Array{Float64,2}, w::Array{Float64,1}, m::Int, n::Int
)
    # The C side only receives raw pointers plus `m` and `n`; inconsistent sizes would
    # make it read or write out of bounds, so reject them before crossing the boundary.
    # NOTE(review): expected shapes are taken from how the notebook driver calls this
    # wrapper -- confirm against the C source.
    size(D) == (n, n) || throw(
        DimensionMismatch("call_chol: D has size $(size(D)), expected ($n, $n)")
    )
    size(V) == (n, m) || throw(
        DimensionMismatch("call_chol: V has size $(size(V)), expected ($n, $m)")
    )
    length(w) == n || throw(
        DimensionMismatch("call_chol: w has length $(length(w)), expected $n")
    )

    u = Vector{Float64}(undef, n)          # output buffer, filled by C
    run_time = Vector{Float64}(undef, 1)   # 1-element out-parameter, filled by C
    ccall(
        (:chol, chol_lib),
        Cvoid,
        (Ptr{Cdouble}, Ptr{Cdouble}, Ptr{Cdouble}, Cint, Cint, Ptr{Cdouble}, Ptr{Cdouble}),
        D, V, w, m, n, u, run_time
    )
    return u, run_time
end

const prodLDL_lib = "./prodLDL.so"

"""
    call_prodLDL(D, V, w, m, n) -> (u, run_time)

Invoke the C function `prodLDL` from the shared library `prodLDL_lib` and return the two
output buffers it fills.

# Arguments
- `D`: `n`-by-`n` matrix (the notebook driver passes a dense diagonal matrix).
- `V`: `n`-by-`m` matrix.
- `w`: vector of length `n`.
- `m`, `n`: dimensions forwarded to C as `Cint`.

# Returns
- `u`: length-`n` `Vector{Float64}` written by the C routine. The driver in this notebook
  compares it with the solution of `(D + V*V') * u = w`; the C source is not visible
  here, so treat that as the presumed contract.
- `run_time`: 1-element `Vector{Float64}` written by the C routine; the driver prints it
  as seconds.

# Throws
- `DimensionMismatch` if the array sizes disagree with `m` and `n`.
"""
function call_prodLDL(
    D::Array{Float64,2}, V::Array{Float64,2}, w::Array{Float64,1}, m::Int, n::Int
)
    # The C side only receives raw pointers plus `m` and `n`; inconsistent sizes would
    # make it read or write out of bounds, so reject them before crossing the boundary.
    # NOTE(review): expected shapes are taken from how the notebook driver calls this
    # wrapper -- confirm against the C source.
    size(D) == (n, n) || throw(
        DimensionMismatch("call_prodLDL: D has size $(size(D)), expected ($n, $n)")
    )
    size(V) == (n, m) || throw(
        DimensionMismatch("call_prodLDL: V has size $(size(V)), expected ($n, $m)")
    )
    length(w) == n || throw(
        DimensionMismatch("call_prodLDL: w has length $(length(w)), expected $n")
    )

    u = Vector{Float64}(undef, n)          # output buffer, filled by C
    run_time = Vector{Float64}(undef, 1)   # 1-element out-parameter, filled by C
    ccall(
        (:prodLDL, prodLDL_lib),
        Cvoid,
        (Ptr{Cdouble}, Ptr{Cdouble}, Ptr{Cdouble}, Cint, Cint, Ptr{Cdouble}, Ptr{Cdouble}),
        D, V, w, m, n, u, run_time
    )
    return u, run_time
end


const vecLDL_AVX_OpenMP_lib = "./vecLDL_AVX_OpenMP.so"

"""
    call_vecLDL_AVX_OpenMP(D, V, w, m, n) -> (u, run_time)

Invoke the C function `vecLDL_AVX_OpenMP` from the shared library
`vecLDL_AVX_OpenMP_lib` and return the two output buffers it fills.

# Arguments
- `D`: `n`-by-`n` matrix (the notebook driver passes a dense diagonal matrix).
- `V`: `n`-by-`m` matrix.
- `w`: vector of length `n`.
- `m`, `n`: dimensions forwarded to C as `Cint`.

# Returns
- `u`: length-`n` `Vector{Float64}` written by the C routine. The driver in this notebook
  compares it with the solution of `(D + V*V') * u = w`; the C source is not visible
  here, so treat that as the presumed contract.
- `run_time`: 1-element `Vector{Float64}` written by the C routine; the driver prints it
  as seconds.

# Throws
- `DimensionMismatch` if the array sizes disagree with `m` and `n`.
"""
function call_vecLDL_AVX_OpenMP(
    D::Array{Float64,2}, V::Array{Float64,2}, w::Array{Float64,1}, m::Int, n::Int
)
    # The C side only receives raw pointers plus `m` and `n`; inconsistent sizes would
    # make it read or write out of bounds, so reject them before crossing the boundary.
    # NOTE(review): expected shapes are taken from how the notebook driver calls this
    # wrapper -- confirm against the C source.
    size(D) == (n, n) || throw(
        DimensionMismatch(
            "call_vecLDL_AVX_OpenMP: D has size $(size(D)), expected ($n, $n)"
        ),
    )
    size(V) == (n, m) || throw(
        DimensionMismatch(
            "call_vecLDL_AVX_OpenMP: V has size $(size(V)), expected ($n, $m)"
        ),
    )
    length(w) == n || throw(
        DimensionMismatch(
            "call_vecLDL_AVX_OpenMP: w has length $(length(w)), expected $n"
        ),
    )

    u = Vector{Float64}(undef, n)          # output buffer, filled by C
    run_time = Vector{Float64}(undef, 1)   # 1-element out-parameter, filled by C
    ccall(
        (:vecLDL_AVX_OpenMP, vecLDL_AVX_OpenMP_lib),
        Cvoid,
        (Ptr{Cdouble}, Ptr{Cdouble}, Ptr{Cdouble}, Cint, Cint, Ptr{Cdouble}, Ptr{Cdouble}),
        D, V, w, m, n, u, run_time
    )
    return u, run_time
end

const vecLDL_AVX_Unroll_lib = "./vecLDL_AVX_Unroll.so"

"""
    call_vecLDL_AVX_Unroll(D, V, w, m, n) -> (u, run_time)

Invoke the C function `vecLDL_AVX_Unroll` from the shared library
`vecLDL_AVX_Unroll_lib` and return the two output buffers it fills.

# Arguments
- `D`: `n`-by-`n` matrix (the notebook driver passes a dense diagonal matrix).
- `V`: `n`-by-`m` matrix.
- `w`: vector of length `n`.
- `m`, `n`: dimensions forwarded to C as `Cint`.

# Returns
- `u`: length-`n` `Vector{Float64}` written by the C routine. The driver in this notebook
  compares it with the solution of `(D + V*V') * u = w`; the C source is not visible
  here, so treat that as the presumed contract.
- `run_time`: 1-element `Vector{Float64}` written by the C routine; the driver prints it
  as seconds.

# Throws
- `DimensionMismatch` if the array sizes disagree with `m` and `n`.
"""
function call_vecLDL_AVX_Unroll(
    D::Array{Float64,2}, V::Array{Float64,2}, w::Array{Float64,1}, m::Int, n::Int
)
    # The C side only receives raw pointers plus `m` and `n`; inconsistent sizes would
    # make it read or write out of bounds, so reject them before crossing the boundary.
    # NOTE(review): expected shapes are taken from how the notebook driver calls this
    # wrapper -- confirm against the C source.
    size(D) == (n, n) || throw(
        DimensionMismatch(
            "call_vecLDL_AVX_Unroll: D has size $(size(D)), expected ($n, $n)"
        ),
    )
    size(V) == (n, m) || throw(
        DimensionMismatch(
            "call_vecLDL_AVX_Unroll: V has size $(size(V)), expected ($n, $m)"
        ),
    )
    length(w) == n || throw(
        DimensionMismatch(
            "call_vecLDL_AVX_Unroll: w has length $(length(w)), expected $n"
        ),
    )

    u = Vector{Float64}(undef, n)          # output buffer, filled by C
    run_time = Vector{Float64}(undef, 1)   # 1-element out-parameter, filled by C
    ccall(
        (:vecLDL_AVX_Unroll, vecLDL_AVX_Unroll_lib),
        Cvoid,
        (Ptr{Cdouble}, Ptr{Cdouble}, Ptr{Cdouble}, Cint, Cint, Ptr{Cdouble}, Ptr{Cdouble}),
        D, V, w, m, n, u, run_time
    )
    return u, run_time
end

call_vecLDL_AVX_Unroll (generic function with 1 method)

In [8]:
using LinearAlgebra

# Problem size: V has m columns and the system has n = 16m unknowns.
# Original author's note: when m >= 150, vecLDL_AVX_OpenMP is more efficient than
# vecLDL_AVX_Unroll.
m = 50
n = 16 * m

# Random test problem. D is a dense matrix whose diagonal holds whole-number values drawn
# from 1.0:1000.0, V is uniform on [0, 1), and w has entries in [-1e3, 1e3).
# The three `rand` calls are kept in this order so the random stream is consumed as before.
D = Matrix(Diagonal(rand(1:1e3, n)))
V = rand(n, m)
w = 1.0e3 * (2.0 * rand(n) .- 1.0)

# Reference solution from Julia's dense solver; every C kernel is compared against it.
u = (D + V * V') \ w;

# Relative 2-norm error of `x` with respect to the reference `ref`.
relative_error(x, ref) = norm(x - ref) / norm(ref)

# --- timings: each kernel is preceded by a one-second pause ---------------------------

sleep(1)
u_chol, run_time_chol = call_chol(D, V, w, m, n)
println("chol takes $run_time_chol seconds")

sleep(1)
u_prodLDL, run_time_prodLDL = call_prodLDL(D, V, w, m, n)
println("prodLDL takes $run_time_prodLDL seconds")

sleep(1)
u_vecLDL_AVX_OpenMP, run_time_vecLDL_AVX_OpenMP = call_vecLDL_AVX_OpenMP(D, V, w, m, n)
println("vecLDL_AVX_OpenMP takes $run_time_vecLDL_AVX_OpenMP seconds")

sleep(1)
u_vecLDL_AVX_Unroll, run_time_vecLDL_AVX_Unroll = call_vecLDL_AVX_Unroll(D, V, w, m, n)
println("vecLDL_AVX_Unroll takes $run_time_vecLDL_AVX_Unroll seconds")

# --- accuracy: relative error of each kernel against the reference solution ------------

rel_err_chol = relative_error(u_chol, u)
println("chol has the relative error $rel_err_chol")

rel_err_prodLDL = relative_error(u_prodLDL, u)
println("prodLDL has the relative error $rel_err_prodLDL")

rel_err_vecLDL_AVX_OpenMP = relative_error(u_vecLDL_AVX_OpenMP, u)
println("vecLDL_AVX_OpenMP has the relative error $rel_err_vecLDL_AVX_OpenMP")

rel_err_vecLDL_AVX_Unroll = relative_error(u_vecLDL_AVX_Unroll, u)
println("vecLDL_AVX_Unroll has the relative error $rel_err_vecLDL_AVX_Unroll")

chol takes [0.045838536] seconds
prodLDL takes [0.003928442] seconds
vecLDL_AVX_OpenMP takes [0.003588751] seconds
vecLDL_AVX_Unroll takes [0.001376369] seconds
chol has the relative error 5.905318106074778e-15
prodLDL has the relative error 3.628436964376391e-15
vecLDL_AVX_OpenMP has the relative error 4.4620611378547815e-15
vecLDL_AVX_Unroll has the relative error 4.4620611378547815e-15
