In [1]:
using Pkg
Pkg.activate(".")
# for pkg in ["BlackBoxOptim", "ForwardDiff", "Integrals", "Roots", "StatsPlots", "DelimitedFiles"]
#     Pkg.add(pkg)
# end
# Pkg.instantiate()

using BlackBoxOptim, Distributions, ForwardDiff, Integrals, Roots, DelimitedFiles, Random, StatProfilerHTML

[32m[1m  Activating[22m[39m project at `C:\Users\jbrig\Documents\research\mapinator_2024`


In [2]:
Random.seed!(0) # for reproducibility

# Alternative inputs (kept for reference): a precomputed matrix, or another literal adjacency matrix.
#res = (;placements = placement_rates)
#res = (;placements = [496 66 33 3; 455 196 84 12; 725 618 363 43; 84 148 83 63; 126 66 41 29; 359 258 131 34; 447 206 94 18; 394 523 508 143])

# Observed placement counts; `chisquare` indexes this as placements[i, t]
# with i in 1:K (rows) and t in 1:k (columns).
res = (;
    placements = [
         696   150    25    16
        1321   670   212    85
         847   829   499   165
          40    79    96   177
         725   530   236   125
         782   438   186   118
         254   167    88    72
         838  1003   622   477
    ],
)

(placements = [696 150 25 16; 1321 670 212 85; … ; 254 167 88 72; 838 1003 622 477],)

Helper functions for the mathematics:

In [3]:
# p_vec layout: k-1 value ratios, k α weights, then K entries each of μ, σ and n; e.g. for k = 4:
# [v2/v1 v3/v2 v4/v3  α1 α2 α3 α4  μ1 … μK  σ1 … σK  n1 … nK]

"""
    F(x, ρ, normals, K)

Evaluate the mixture CDF `ρ[1] * cdf(normals[1], x) + … + ρ[K] * cdf(normals[K], x)`.

# Arguments
- `x`: point at which the CDF is evaluated.
- `ρ`: mixture weights, indexed `1:K`.
- `normals`: component distributions, indexed `1:K`; each must support `cdf`.
- `K`: number of components to include.

# Returns
The weighted sum of the component CDF values at `x` (`0.0` when `K == 0`).
"""
function F(x, ρ, normals, K)
    # A generator avoids allocating a temporary vector on every call (this function
    # is evaluated inside the integrand and the root finder); `init` keeps the
    # reduction well defined for an empty range without relying on type inference.
    return sum((ρ[i] * cdf(normals[i], x) for i in 1:K); init = 0.0)
end

"""
    G(Fim1, Fx, αsum)

Return the difference `Fim1 - Fx` divided by `αsum`.
"""
G(Fim1, Fx, αsum) = (Fim1 - Fx) / αsum

"""
    κ(i, t, v_rel)

Return `-log(v_rel[t]) - log(v_rel[t+1]) - … - log(v_rel[i-1])`.

The range `t:i-1` is empty whenever `t >= i`; the result is then `0.0`.

# Arguments
- `i`, `t`: integer indices delimiting the range `t:i-1`.
- `v_rel`: indexable collection of positive ratios.
"""
function κ(i, t, v_rel)
    # `init` makes the empty-range case (hit whenever t == i) independent of the
    # element type inferred for `v_rel`, and the generator avoids a temporary array.
    return sum((-log(v_rel[j]) for j in t:i-1); init = 0.0)
end

"""
    f_integrand(x, p)

Vector-valued integrand evaluated at `x` for the segment `p.s`.

`p` must provide the fields `s`, `K`, `normals`, `Fx_vec`, `ρ`, `α` and `v_rel`.

# Returns
A `Vector{Float64}` of length `p.s * p.K`. Entry `(t - 1) * p.K + i` holds
`exp(-(g + κ(p.s, t, p.v_rel))) * pdf(p.normals[i], x)`, where
`g = G(p.Fx_vec[p.s], F(x, p.ρ, p.normals, p.K), sum(p.α[1:p.s]))`.
"""
function f_integrand(x, p)
    segment = p.s
    densities = [pdf(p.normals[i], x) for i in 1:p.K]
    g_value = G(p.Fx_vec[segment], F(x, p.ρ, p.normals, p.K), sum(p.α[1:segment]))

    out = zeros(segment * p.K)
    for t in 1:segment
        # the exponential factor depends on t only, so compute it once per t
        weight = exp(-(g_value + κ(segment, t, p.v_rel)))
        offset = (t - 1) * p.K
        for i in 1:p.K
            out[offset + i] = weight * densities[i]
        end
    end
    return out
end

"""
    get_integrals(Fx_vec, x_vec, ρ, normals, α, v_rel, k, K)

Integrate `f_integrand` over each of the `k` segments and collect the results.

# Arguments
- `Fx_vec`: `[F(x0) = 1, F(x1), …, F(x_{k-1})]`.
- `x_vec`: `[x0 = 1, x1, …, xk = 0]`; because `x_vec[s]` stores `x_{s-1}`, segment `s`
  is integrated from `x_vec[s+1]` up to `x_vec[s]`.
- `ρ`, `normals`, `α`, `v_rel`, `K`: forwarded unchanged to `f_integrand`.
- `k`: number of segments.

# Returns
A `K × k × k` array `A` in which `A[i, t, s]` is component `(t - 1) * K + i` of the
integral over segment `s`, for `t <= s`; entries with `t > s` stay zero.
"""
function get_integrals(Fx_vec, x_vec, ρ, normals, α, v_rel, k, K)
    all_integrals = zeros(K, k, k)
    for s in 1:k
        params = (; s, Fx_vec, ρ, normals, α, v_rel, K)
        prob = IntegralProblem(f_integrand, x_vec[s+1], x_vec[s], params)
        sol = solve(prob, HCubatureJL(); reltol = 1e-3, abstol = 1e-3)
        # sol.u is laid out with i varying fastest and t slowest, so it reshapes
        # directly into the K × s slab belonging to this segment
        all_integrals[:, 1:s, s] .= reshape(sol.u, K, s)
    end
    return all_integrals
end

"""
    q(i, t, all_integrals, α, k)

Return the sum over `s in t:k` of `(α[t] / sum(α[1:s])) * all_integrals[i, t, s]`.
"""
function q(i, t, all_integrals, α, k)
    terms = map(t:k) do s
        share = α[t] / sum(α[1:s])
        return share * all_integrals[i, t, s]
    end
    return sum(terms)
end

"""
    Fx(t, α, v_rel)

Return `1` minus the sum over `j in 1:t` of `-log(v_rel[j]) * sum(α[1:j])`.
"""
function Fx(t, α, v_rel)
    terms = map(1:t) do j
        return -log(v_rel[j]) * sum(α[1:j])
    end
    return 1 - sum(terms)
end

"""
    pi(t, α)

Return `α[t]` divided by the cumulative sum `α[1] + … + α[t]`.

Note: defining this function shadows `Base.pi` in the enclosing module.
"""
function pi(t, α)
    cumulative = sum(α[1:t])
    return α[t] / cumulative
end

pi (generic function with 1 method)

In [4]:
"""
    chisquare(p_vec, placements, k, K, M)

Chi-square distance between the observed `placements` and the counts expected under
the parameters packed in `p_vec`.

# Arguments
- `p_vec`: parameter vector laid out as
  `[v_rel (k-1 entries); α weights (k); μ (K); σ (K); n (K)]`.
  The α weights are normalised to sum to one; `ρ = n / sum(n)`.
- `placements`: matrix of observed counts, indexed `[i, t]` with `i in 1:K`, `t in 1:k`.
- `k`, `K`: sizes of the two index ranges above.
- `M`: not used by this function; kept so existing callers continue to work.

# Returns
The sum over all cells of `(observed - expected)^2 / expected`, where
`expected = n[i] * q(i, t, …)`. If any expected count is not strictly positive and
finite the statistic is undefined, and `Inf` is returned so that an optimizer never
receives a `NaN` or a spuriously small (negative-term) objective.
"""
function chisquare(p_vec, placements, k, K, M)
    v_rel = p_vec[1:k-1]
    α = 1.0 * p_vec[k:2k-1] / sum(p_vec[k:2k-1]) # change 1.0 to scale number of graduates vs departments
    μ = p_vec[2k:2k+K-1]
    σ = p_vec[2k+K:2k+2K-1]
    n = p_vec[2k+2K:2k+3K-1]
    ρ = n / sum(n)

    ## compute the cutoffs x and the CDF values F(x)

    normals = [truncated(Normal(μ[i], σ[i]), 0, 1) for i in 1:K]

    Fx_vec = ones(k) # sets F(x0) = 1 by default; Fx_vec = [F(x0)=1, F(x1), F(x2), F(x3), ..., F(xk-1)]
    x_vec = ones(k+1) # x_vec = [x0 = 1, x1, x2, x3, ..., xk = 0]
    x_vec[k+1] = 0.0
    for t in 1:k-1
        Fx_vec_candidate = Fx(t, α, v_rel)
        if Fx_vec_candidate <= 0.0 # TODO: if this case occurs, can we speed up q()?
            # no probability mass is left: this cutoff and every later one collapse to 0
            Fx_vec[t+1:k] .= 0.0
            x_vec[t+1:k] .= 0.0
            break
        end
        Fx_vec[t+1] = Fx_vec_candidate
        # there is no simple closed-form for F^{-1}(x) so this numerically computes x1, x2, x3
        x_vec[t+1] = find_zero(x -> F(x, ρ, normals, K) - Fx_vec[t+1], 0.5)
    end

    ## compute the q_i^t values: probability that a department, conditional on being in tier i,
    ##     successfully hires an applicant from tier t.

    all_integrals = get_integrals(Fx_vec, x_vec, ρ, normals, α, v_rel, k, K)

    objective = 0.0
    for i in 1:K, t in 1:k
        expectation = n[i] * q(i, t, all_integrals, α, k)
        # Dividing by a zero, negative or NaN expectation would yield NaN or a negative
        # contribution; reject such parameter vectors outright instead.
        if !(isfinite(expectation) && expectation > 0)
            return Inf
        end
        objective += (placements[i, t] - expectation) ^ 2 / expectation
    end

    return objective
end

chisquare (generic function with 1 method)

In [5]:
NUMBER_OF_TYPES = 4
numtotal = 8

k = NUMBER_OF_TYPES
K = numtotal
M = sum(res.placements)

# Box constraints, in the same order as the parameter vector unpacked by `chisquare`:
#   k-1 value ratios | k α weights | K μ | K σ | K n
upper = vcat(
    # value ratios, which should all be less than 1; if any ratio turns out to be 1.0 or
    # close to it at optimality, this could indicate that a lower tier has a higher value
    # than a higher one
    ones(k - 1),
    ones(k),           # variables proportionate to alpha
    ones(K),           # mu parameter of each truncated normal, kept within [0, 1]
    fill(5.0, K),      # sigma parameter of each truncated normal
    fill(30000.0, K),  # n_i
)
lower = vcat(
    zeros(k - 1),
    zeros(k),
    zeros(K),
    zeros(K),
    # n_i can be no smaller than the number of placements observed in row i
    Float64[sum(res.placements[i, :]) for i in 1:K],
)

# @profilehtml
sol_res = bboptimize(
    p -> chisquare(p, res.placements, k, K, M);
    SearchRange = collect(zip(lower, upper)),
    MaxFuncEvals = 100000,
    TraceInterval = 5,
)
sol = best_candidate(sol_res)

# benchmark: 176.23s
# with tol=1e-3: 149.40s
# vector-valued integrand with no optimization: 175.10s
# with variable caching: 42.05s
# with distribution caching: 9.62s

Starting optimization with optimizer DiffEvoOpt{FitPopulation{Float64}, RadiusLimitedSelector, BlackBoxOptim.AdaptiveDiffEvoRandBin{3}, RandomBound{ContinuousRectSearchSpace}}
0.00 secs, 0 evals, 0 steps
5.00 secs, 57012 evals, 56920 steps, improv/step: 0.161 (last = 0.1607), fitness=26.879170898
10.00 secs, 91503 evals, 91419 steps, improv/step: 0.154 (last = 0.1428), fitness=21.415978833

Optimization stopped after 99917 steps and 11.47 seconds
Termination reason: Max number of function evaluations (100000) reached
Steps per second = 8709.64
Function evals per second = 8716.96
Improvements/step = Inf
Total function evaluations = 100001


Best candidate found: [0.514154, 0.59031, 0.615137, 0.90581, 0.76886, 0.553001, 0.968584, 0.999803, 0.577634, 0.401146, 0.195121, 0.436056, 0.471932, 0.252498, 0.264101, 0.398673, 0.282766, 0.20521, 0.0111713, 0.284025, 0.374591, 0.465707, 0.259308, 1283.26, 3829.43, 4508.92, 800.476, 2927.3, 2652.33, 1060.02, 5864.53]

Fitness: 20.595520225



31-element Vector{Float64}:
    0.5141540346234011
    0.5903096583937368
    0.6151374449458451
    0.9058099795551449
    0.7688599636150314
    0.5530011326186408
    0.9685842035880219
    0.9998030687524304
    0.5776337189310451
    0.40114646079216915
    0.19512058120493728
    0.43605631568770997
    0.4719318312192995
    ⋮
    0.2840251855614333
    0.3745911084008378
    0.46570745496169785
    0.25930832550030497
 1283.2631203697513
 3829.432523730666
 4508.923744803454
  800.476052796814
 2927.3005393941235
 2652.328187273871
 1060.0190775517506
 5864.526315673987

In [6]:
"""
    print_metrics_chisquare(p_vec, placements, k, K, M)

Recompute the model quantities for the parameter vector `p_vec` and print a report
comparing them with the observed `placements`.

# Arguments
- `p_vec`: parameter vector laid out as
  `[v_rel (k-1 entries); α weights (k); μ (K); σ (K); n (K)]`, unpacked exactly as in
  `chisquare`.
- `placements`: matrix of observed counts, indexed `[i, t]` with `i in 1:K`, `t in 1:k`.
- `k`, `K`: sizes of the two index ranges above.
- `M`: total number of observed placements, used in the printed summaries.

# Returns
`nothing`; all results are written with `println` / `display`.
"""
function print_metrics_chisquare(p_vec, placements, k, K, M)
    v_rel = p_vec[1:k-1]
    α = 1.0 * p_vec[k:2k-1] / sum(p_vec[k:2k-1]) # change 1.0 to scale number of graduates vs departments
    μ = p_vec[2k:2k+K-1]
    σ = p_vec[2k+K:2k+2K-1]
    n = p_vec[2k+2K:2k+3K-1]
    total_n = sum(n)
    ρ = n / total_n

    normals = [truncated(Normal(μ[i], σ[i]), 0, 1) for i in 1:K]

    # cutoffs x_t and CDF values F(x_t), computed as in `chisquare`
    Fx_vec = ones(k)
    x_vec = ones(k+1)
    x_vec[k+1] = 0.0
    for t in 1:k-1
        Fx_vec_candidate = Fx(t, α, v_rel)
        if Fx_vec_candidate <= 0.0
            Fx_vec[t+1:k] .= 0.0
            x_vec[t+1:k] .= 0.0
            break
        end
        Fx_vec[t+1] = Fx_vec_candidate
        x_vec[t+1] = find_zero(x -> F(x, ρ, normals, K) - Fx_vec[t+1], 0.5)
    end

    # hiring probabilities q_i^t
    q_it = zeros(K, k)
    all_integrals = get_integrals(Fx_vec, x_vec, ρ, normals, α, v_rel, k, K)
    for i in 1:K, t in 1:k
        q_it[i, t] = q(i, t, all_integrals, α, k)
    end

    round_1_failure = [1 - sum(q_it[i, :]) for i in 1:K]

    # TODO: div by zero and negative floating point in mean
    objective = 0.0
    exp_placements = zeros(K, k)
    for i in 1:K, t in 1:k
        expectation = n[i] * q_it[i, t]
        exp_placements[i, t] = expectation
        objective += (placements[i, t] - expectation) ^ 2 / expectation
    end

    println("objective value = ", objective)
    println("success sample size (departments) = ", M)
    println("estimated total samples (departments) = ", total_n)
    println("estimated unmatched departments = ", total_n - M)
    println("estimated probability of any success: ", M / total_n)
    println("estimated probability of no success: ", 1 - (M / total_n))
    println()
    println("estimated fraction of departments of each tier:")
    for i in 1:K
        println("ρ_", i, " = ", ρ[i])
    end
    println()
    println("estimated departments of each tier:")
    for i in 1:K
        observed = sum(placements[i, :])
        println("n_", i, " = ", n[i])
        println("  Successful: ", observed)
        println("  Unsuccessful: ", n[i] - observed)
    end
    println()
    #println("fractions observed among successful departments in data:")
    #display(sum(placements, dims = 2) ./ M)
    #println()

    for i in 1:k
        println("pi_", i, " = ", pi(i, α))
    end
    println()

    offer_targets = zeros(k, k)
    for t in 1:k, j in 1:t
        # the product is empty when j == t; `init` makes that case evaluate to 1.0
        # without depending on the inferred element type of an empty comprehension
        offer_targets[t, j] = pi(j, α) * prod((1 - pi(i, α) for i in j+1:t); init = 1.0)
    end
    println("Tier selection probabilities for making offers:")
    display(offer_targets)
    println()

    println("Hiring probabilities (q_i^t):")
    display(q_it)
    println()

    println("Probabilities of failing (1 - sum_t q_i^t):")
    display(round_1_failure)
    println()

    for i in 1:k+1
        println("x_", i - 1, " = ", x_vec[i])
    end
    println()
    for i in 1:k
        println("F(x_", i - 1, ") = ", Fx_vec[i])
    end
    println()
    for i in 1:k
        est_graduates = α[i] * (total_n - 1)
        observed = sum(placements[:, i])
        println("α_", i, " = ", α[i])
        println("  Est. graduates: ", est_graduates)
        println("  Successful: ", observed)
        println("  Unsuccessful: ", est_graduates - observed)
    end
    println()
    println("Total estimated graduates: ", sum(α) * (total_n - 1))
    println("Total successful graduates: ", M)
    println("Total estimated unsuccessful graduates: ", (sum(α) * (total_n - 1)) - M)
    println()
    println("estimated placement rates:")
    display(exp_placements)
    println()
    println("actual placement rates:")
    display(placements)
    println()
    println("difference between estimated and actual placement rates:")
    display(exp_placements - placements)
    println()
    println("chi-square p-value")
    # NOTE(review): degrees of freedom follow the (rows - 1) * (cols - 1) convention;
    # confirm this is the intended count given the number of fitted parameters.
    dof = (size(placements, 1) - 1) * (size(placements, 2) - 1)
    # ccdf evaluates the upper tail directly, avoiding the cancellation in 1 - cdf
    # when the p-value is very small
    println(ccdf(Chisq(dof), objective))
    println()
end

print_metrics_chisquare (generic function with 1 method)

In [7]:
# Report the fit for the best candidate; use the metrics function that matches
# the objective the optimizer minimised.
print_metrics_chisquare(sol, res.placements, k, K, M)

# relative values v_{i+1}/v_i are the first k-1 entries of the solution
for (i, ratio) in enumerate(sol[1:k-1])
    println("v_", i + 1, "/v_", i, " = ", ratio)
end

# absolute values with v1 normalised to 1, obtained by chaining the ratios
v_base = 1
for i in 1:k
    println("v", i, ": ", v_base)
    i == k && continue
    v_base = sol[i] * v_base
end

println()

# μ for entry i sits at sol[2k-1+i], the matching σ a further K positions along
for select_type in 1:k
    dist = truncated(Normal(sol[2k-1+select_type], sol[2k-1+select_type+K]), 0, 1)
    println("mean for type ", select_type, ": ", mean(dist))
    println("stddev for type ", select_type, ": ", std(dist))
    println()
end
for select_type in k+1:K
    dist = truncated(Normal(sol[2k-1+select_type], sol[2k-1+select_type+K]), 0, 1)
    println("mean for sink ", select_type - k, ": ", mean(dist))
    println("stddev for sink ", select_type - k, ": ", std(dist))
    println()
end

objective value = 20.595520225470928
success sample size (departments) = 12568
estimated total samples (departments) = 22926.269561594418
estimated unmatched departments = 10358.269561594418
estimated probability of any success: 0.5481921062750496
estimated probability of no success: 0.45180789372495045

estimated fraction of departments of each tier:
ρ_1 = 0.055973481290625904
ρ_2 = 0.16703251758609902
ρ_3 = 0.19667062418025058
ρ_4 = 0.03491523340272304
ρ_5 = 0.12768324700752332
ρ_6 = 0.11568947927389779
ρ_7 = 0.04623600340665414
ρ_8 = 0.2557994138522262

estimated departments of each tier:
n_1 = 1283.2631203697513
  Successful: 887
  Unsuccessful: 396.2631203697513
n_2 = 3829.432523730666
  Successful: 2288
  Unsuccessful: 1541.4325237306662
n_3 = 4508.923744803454
  Successful: 2340
  Unsuccessful: 2168.9237448034537
n_4 = 800.476052796814
  Successful: 392
  Unsuccessful: 408.476052796814
n_5 = 2927.3005393941235
  Successful: 1616
  Unsuccessful: 1311.3005393941235
n_6 = 2652.3281

4×4 Matrix{Float64}:
 1.0       0.0       0.0       0.0
 0.540889  0.459111  0.0       0.0
 0.406617  0.345141  0.248242  0.0
 0.283397  0.24055   0.173015  0.303037


Hiring probabilities (q_i^t):


8×4 Matrix{Float64}:
 0.54013    0.116327   0.0277185  0.00925432
 0.346526   0.172122   0.0578898  0.0215909
 0.182001   0.197936   0.0996499  0.0399291
 0.0579468  0.0956635  0.116559   0.221357
 0.247861   0.178654   0.0817063  0.0427034
 0.295068   0.162305   0.0724696  0.0439292
 0.240752   0.155273   0.0846148  0.0676648
 0.144977   0.166135   0.109336   0.0810258


Probabilities of failing (1 - sum_t q_i^t):


8-element Vector{Float64}:
 0.30657000971465964
 0.40187131563044387
 0.48048450954439126
 0.5084735539840186
 0.44907563908294534
 0.4262279842202772
 0.4516950237863232
 0.4985254641426784


x_0 = 1.0
x_1 = 0.6725980061595509
x_2 = 0.4451007668644862
x_3 = 0.20266183831789475
x_4 = 0.0

F(x_0) = 1.0
F(x_1) = 0.8114749680460555
F(x_2) = 0.5352980341309334
F(x_3) = 0.19663714760289586

α_1 = 0.283397257221503
  Est. graduates: 6496.9585148194665
  Successful: 5503
  Unsuccessful: 993.9585148194665
α_2 = 0.24055023657714003
  Est. graduates: 5514.679016636244
  Successful: 3866
  Unsuccessful: 1648.6790166362443
α_3 = 0.17301532083083715
  Est. graduates: 3966.4228683326833
  Successful: 1964
  Unsuccessful: 2002.4228683326833
α_4 = 0.3030371853705199
  Est. graduates: 6947.209161806025
  Successful: 1235
  Unsuccessful: 5712.209161806025

Total estimated graduates: 22925.269561594418
Total successful graduates: 12568
Total estimated unsuccessful graduates: 10357.269561594418

estimated placement rates:


8×4 Matrix{Float64}:
  693.129  149.278    35.5702   11.8757
 1327.0    659.13    221.685    82.6809
  820.627  892.478   449.314   180.037
   46.385   76.5763   93.3025  177.191
  725.563  522.973   239.179   125.006
  782.617  430.487   192.213   116.515
  255.202  164.593    89.6933   71.7259
  850.224  974.303   641.205   475.178


actual placement rates:


8×4 Matrix{Int64}:
  696   150   25   16
 1321   670  212   85
  847   829  499  165
   40    79   96  177
  725   530  236  125
  782   438  186  118
  254   167   88   72
  838  1003  622  477


difference between estimated and actual placement rates:


8×4 Matrix{Float64}:
  -2.87069    -0.722055   10.5702   -4.12428
   5.99784   -10.8702      9.6849   -2.31905
 -26.3733     63.4779    -49.6861   15.0373
   6.38501    -2.42367    -2.69747   0.191275
   0.562939   -7.02656     3.17899   0.00580697
   0.617204   -7.51326     6.21313  -1.48538
   1.20199    -2.40748     1.6933   -0.27407
  12.2241    -28.697      19.2053   -1.82185


chi-square p-value
0.48387102982785446

v_2/v_1 = 0.5141540346234011
v_3/v_2 = 0.5903096583937368
v_4/v_3 = 0.6151374449458451
v1: 1
v2: 0.5141540346234011
v3: 0.3035100925403015
v4: 0.18670042284051805

mean for type 1: 0.6917928364836161
stddev for type 1: 0.22392245431369764

mean for type 2: 0.5524440119250077
stddev for type 2: 0.23092747594049745

mean for type 3: 0.4124079597512885
stddev for type 3: 0.19046305573527805

mean for type 4: 0.19512058120493728
stddev for type 4: 0.011171308259030512

mean for sink 1: 0.45693557813302466
stddev for sink 1: 0.2320852921330966

mean for sink 2: 0.4869306713453784
stddev for sink 2: 0.25550422597238753

mean for sink 3: 0.41993954859567845
stddev for sink 3: 0.2607174656974663

mean for sink 4: 0.3349255141016291
stddev for sink 4: 0.20330555976385428

