# Structural Estimation of Market Parameters

### Simultaneous Estimator, Truncated Normal with Variance

In [1]:
using Pkg
Pkg.activate(".")
#for pkg in ["BlackBoxOptim", "Cubature", "Distributions", "Integrals", "Roots", "FiniteDiff", "PythonPlot", "PrettyTables", "JLD", "StatProfilerHTML"] # 
#    Pkg.add(pkg)
#end

#Pkg.instantiate()

using BlackBoxOptim, Cubature, Distributions, Integrals, Random, Roots, FiniteDiff, PrettyTables, JLD

[32m[1m  Activating[22m[39m project at `C:\Users\jbrig\Documents\research\mapinator_2024`


## 1. Load placements

In [2]:
placement_matrices = load("placement_matrices.jld")

Dict{String, Any} with 4 entries:
  "successful_tier_1_range" => 2018:2023
  "placement_matrix"        => [761 115 … 8 0; 879 569 … 48 4; … ; 165 244 … 20…
  "successful_tier_1"       => 2098
  "unmatched"               => [300, 544, 667, 364, 273]

In [3]:
placement_rates_raw = placement_matrices["placement_matrix"]

10×5 Matrix{Int64}:
 761  115   41    8    0
 879  569  148   48    4
 839  993  443   60   14
 176  306  217  140   11
   3   14   16   15   46
 582  596  301   79   19
 700  438  244   68   24
 218  187  114   39    9
 165  244  315  201  104
 676  967  815  309  123

In [4]:
unmatched_raw = placement_matrices["unmatched"]

5-element Vector{Int64}:
 300
 544
 667
 364
 273

In [5]:
k = size(placement_rates_raw)[2]
K = size(placement_rates_raw)[1]

10

## 2. Rescale placements (successes and failures) using registrations

In [6]:
# conduct adjustment using the relative m_t values
adjustment_matrix = load("adjustment_matrix.jld")
m_t_values = adjustment_matrix["data"][:, 1]

5-element Vector{Any}:
 2664
 3635
 4481
 2284
 1347

In [7]:
m_t_placements = zeros(Int, k)
for t in 1:k
    m_t_placements[t] = sum(placement_rates_raw[:, t]) + unmatched_raw[t]
end
m_t_placements

5-element Vector{Int64}:
 5299
 4973
 3321
 1331
  627

In [8]:
placement_rates_rescaled = zeros(Int, K, k)
unmatched_rescaled = zeros(Int, k)

for i in 1:K
    for t in 1:k
        placement_rates_rescaled[i, t] = round(m_t_values[t] * (placement_rates_raw[i, t] / m_t_placements[t]))
    end
end

for t in 1:k
    unmatched_rescaled[t] = round(m_t_values[t] * (unmatched_raw[t] / m_t_placements[t]))
end

placement_rates_rescaled

10×5 Matrix{Int64}:
 383   84    55   14    0
 442  416   200   82    9
 422  726   598  103   30
  88  224   293  240   24
   2   10    22   26   99
 293  436   406  136   41
 352  320   329  117   52
 110  137   154   67   19
  83  178   425  345  223
 340  707  1100  530  264

In [9]:
unmatched_rescaled

5-element Vector{Int64}:
 151
 398
 900
 625
 586

## 3. Collapse sinks and delete graduating tier 5 (tier 5 becomes part of the sinks)

In [10]:
println("original dimensions: ", K, " by ", k)

original dimensions: 10 by 5


In [11]:
k -= 1
K = k + 3

7

In [12]:
println("new dimensions: ", K, " by ", k)

new dimensions: 7 by 4


In [13]:
placement_rates = zeros(Int, K, k)
for i in 1:4 # NOTE: hardcoded
    for t in 1:k
        placement_rates[i, t] = placement_rates_rescaled[i, t]
    end
end

for i in 5:6 # NOTE: hardcoded
    for t in 1:k
        placement_rates[i, t] = placement_rates_rescaled[i+1, t]
    end
end

for t in 1:k
    # NOTE: hardcoded
    for original_sink_tier in [5, 8, 9, 10]
        placement_rates[K, t] += placement_rates_rescaled[original_sink_tier, t] 
    end
end

unmatched = zeros(Int, k)
for t in 1:k
    unmatched[t] = unmatched_rescaled[t]
end

placement_rates

7×4 Matrix{Int64}:
 383    84    55   14
 442   416   200   82
 422   726   598  103
  88   224   293  240
 293   436   406  136
 352   320   329  117
 535  1032  1701  968

In [14]:
unmatched

4-element Vector{Int64}:
 151
 398
 900
 625

## 3a. Latex version of the new table

In [15]:
presentation_table = zeros(Int, K+1, k)
for i in 1:K, t in 1:k
    presentation_table[i, t] = placement_rates[i, t]
end

for t in 1:k
    presentation_table[K+1, t] = unmatched[t]
end

In [16]:
include("type_allocation_base_by_applicant.jl")

Main.SBM

In [17]:
presentation_table

8×4 Matrix{Int64}:
 383    84    55   14
 442   416   200   82
 422   726   598  103
  88   224   293  240
 293   436   406  136
 352   320   329  117
 535  1032  1701  968
 151   398   900  625

In [18]:
sink_names_to_present = [
    "Public Sector",
    "Private Sector",
    "Other",
    "Unmatched"
]
SBM.nice_table(presentation_table, k, (K - k) + 1, sink_names_to_present)

┌────────────────┬────────┬────────┬────────┬────────┬────────────┐
│[1m                [0m│[1m Tier 1 [0m│[1m Tier 2 [0m│[1m Tier 3 [0m│[1m Tier 4 [0m│[1m Row Totals [0m│
├────────────────┼────────┼────────┼────────┼────────┼────────────┤
│         Tier 1 │    383 │     84 │     55 │     14 │        536 │
│         Tier 2 │    442 │    416 │    200 │     82 │       1140 │
│         Tier 3 │    422 │    726 │    598 │    103 │       1849 │
│         Tier 4 │     88 │    224 │    293 │    240 │        845 │
│  Public Sector │    293 │    436 │    406 │    136 │       1271 │
│ Private Sector │    352 │    320 │    329 │    117 │       1118 │
│          Other │    535 │   1032 │   1701 │    968 │       4236 │
│      Unmatched │    151 │    398 │    900 │    625 │       2074 │
│  Column Totals │   2666 │   3636 │   4482 │   2285 │      13069 │
└────────────────┴────────┴────────┴────────┴────────┴────────────┘
\begin{tabular}{rrrrrr}
  \hline
   & \textbf{Tier 1} & \textbf{Tier

## 4. Compute $m$ and $\gamma_t$

In [19]:
res = (; placements = placement_rates) 

M = sum(res.placements)

successful_graduates = [sum(res.placements[:, t]) for t in 1:k]
println("successful graduates: ", M)
println(successful_graduates)
println()

estimated_m_val = sum(m_t_values[1:k])

successful graduates: 10995
[2515, 3238, 3582, 1660]



13064

In [20]:
estimated_γ = m_t_values[1:k] / estimated_m_val

4-element Vector{Float64}:
 0.2039191671769749
 0.27824556031843234
 0.3430036742192284
 0.17483159828536435

In [21]:
function Fx(t, α, v_rel)
    return 1 - sum([-log(v_rel[j])*sum(α[1:j]) for j in 1:t])
end

function F(x, ρ, normals, K)
    sum_base = 0.0
    for i in 1:K
        sum_base += ρ[i] * cdf(normals[i], x)
    end
    return sum_base
end

function f_integrand(integrals, x, p) 
    base_exp = exp(F(x, p.ρ, p.normals, p.K) / sum(p.α[1:p.s]))
    for i in 1:p.K
        integrals[i] = base_exp * pdf(p.normals[i], x)
    end
end

function get_integrals(x_vec, ρ, normals, α, k, K)
    # Fx_vec = [F(x0)=1, F(x1), F(x2), F(x3), ..., F(xk-1)]
    # x_vec = [x0 = 1, x1, x2, x3, ..., xk = 0]
    # x_vec[s] = x_{s-1}, so the limits of integration are and must be offset by 1 below
    all_integrals = zeros(K, k)
    for s in 1:k
        # https://docs.sciml.ai/Integrals/stable/basics/SampledIntegralProblem/ might be faster
        # if f_integrand(x) can be vectorized/sped up
        prob = IntegralProblem(IntegralFunction(f_integrand, zeros(K)), (x_vec[s+1], x_vec[s]), (; s, ρ, normals, α, K))
        sol = solve(prob, CubatureJLh(); reltol = 1e-3, abstol = 1e-3)
        integrals_result = sol.u
        # NOTE: result may be inf if alpha_1 is too small
        # NOTE: some parameter values for μ and σ may cause the cdf F_i to be NaN
        for i in 1:K 
            all_integrals[i, s] = integrals_result[i]
        end
    end
    return all_integrals
end      

function q(i, t, all_integrals, Fx_vec, α, v_rel, k)
    return α[t] * sum([(1/sum(α[1:s])) * prod([v_rel[j] for j in t:(s-1)]) * exp(-Fx_vec[s] / sum(α[1:s])) * all_integrals[i, s] for s in t:k])
end

q (generic function with 1 method)

In [22]:
function estimate_likelihood_2(p_vec, γ, placements, k, K, counter)   
    μ = p_vec[1:K]
    σ = p_vec[K+1:2K]
    ρ_vec = p_vec[2K+1:3K]
    ρ = ρ_vec / sum(ρ_vec)

    v_rel = p_vec[3K+1:3K+k-1]
    τ = p_vec[3K+k]
    α = γ * τ

    ## compute the cutoffs x and the CDF values F(x)
    normals = [truncated(Normal(μ[i], σ[i]), 0, 1) for i in 1:K]

    Fx_vec = ones(k) # sets F(x0) = 1 by default; Fx_vec = [F(x0)=1, F(x1), F(x2), F(x3), ..., F(xk-1)]
    x_vec = ones(k+1) # x_vec = [x0 = 1, x1, x2, x3, ..., xk = 0]
    x_vec[k+1] = 0.0
    for t in 1:k-1
        Fx_vec_candidate = Fx(t, α, v_rel)
        if Fx_vec_candidate <= 0.0 # TODO: if this case occurs, can we speed up q()?
            Fx_vec[t+1:k] .= 0.0
            x_vec[t+1:k] .= 0.0
            break
        end
        Fx_vec[t+1] = Fx_vec_candidate
        # there is no simple closed-form for F^{-1}(x) so this numerically computes x1, x2, x3
        x_vec[t+1] = find_zero(x -> F(x, ρ, normals, K) - Fx_vec[t+1], 0.5) 
    end 

    ρ_q_it = zeros(K, k)
    all_integrals = get_integrals(x_vec, ρ, normals, α, k, K)
    for i in 1:K, t in 1:k
        prob = q(i, t, all_integrals, Fx_vec, α, v_rel, k)
        ρ_q_it[i, t] = ρ[i] * prob
    end

    normalizer = sum(ρ_q_it)
    likelihood = 0.0
    for i in 1:K, t in 1:k
        likelihood += placements[i, t] * log(ρ_q_it[i, t] / normalizer)
    end

    counter[1] += 1
    #if isnan(-likelihood) || isinf(-likelihood)
    #    println(all_integrals)
    #end
    return -likelihood
end

estimate_likelihood_2 (generic function with 1 method)

In [23]:
Random.seed!(0)

# upper/lower bound on the mu parameter of truncated normal
upper_1 = [4.0 for _ in 1:K]
lower_1 = [-4.0 for _ in 1:K]

# upper/lower bound on the sigma parameter of truncated normal
append!(upper_1, [10.0 for _ in 1:K])
append!(lower_1, [0.12 for _ in 1:K])

# upper/lower bound on values proportional to ρ_i
append!(upper_1, [1.0 for _ in 1:K])
append!(lower_1, [0.0 for _ in 1:K])

# upper/lower bound on v_rel
append!(upper_1, [1.0 for _ in 1:k-1])
append!(lower_1, [0.0001 for _ in 1:k-1])

# upper/lower bound on τ
append!(upper_1, [10.0 for _ in 1:1])
append!(lower_1, [0.25 for _ in 1:1])

search_range_1 = [(lower_1[i], upper_1[i]) for i in eachindex(upper_1)]
counter = [0]
sol_res_1 = bboptimize(p -> estimate_likelihood_2(p, estimated_γ, res.placements, k, K, counter), SearchRange = search_range_1, MaxFuncEvals = 1000000, TraceInterval = 5) 
# MaxTime = 60.0, MaxFuncEvals = 500000,
sol_1 = best_candidate(sol_res_1)

Starting optimization with optimizer DiffEvoOpt{FitPopulation{Float64}, RadiusLimitedSelector, BlackBoxOptim.AdaptiveDiffEvoRandBin{3}, RandomBound{ContinuousRectSearchSpace}}
0.00 secs, 0 evals, 0 steps
5.00 secs, 36061 evals, 35996 steps, improv/step: 0.168 (last = 0.1679), fitness=32973.232113007
10.00 secs, 49880 evals, 49816 steps, improv/step: 0.157 (last = 0.1291), fitness=32955.174785747
15.01 secs, 60605 evals, 60542 steps, improv/step: 0.143 (last = 0.0778), fitness=32953.068546136
20.01 secs, 69948 evals, 69887 steps, improv/step: 0.135 (last = 0.0843), fitness=32951.268841479
25.01 secs, 89910 evals, 89853 steps, improv/step: 0.130 (last = 0.1134), fitness=32950.304563676
30.01 secs, 116912 evals, 116860 steps, improv/step: 0.126 (last = 0.1135), fitness=32949.872470094
35.01 secs, 159047 evals, 158999 steps, improv/step: 0.124 (last = 0.1187), fitness=32949.256561193
40.02 secs, 227253 evals, 227211 steps, improv/step: 0.127 (last = 0.1319), fitness=32949.247787355
45.02 s

25-element Vector{Float64}:
  3.9999999999751794
  0.30218723471728925
  0.29002218954441433
 -0.9950554866470221
  0.16036886184425492
 -1.6864828563188423
  0.037579602143257444
  2.285704998718138
  0.24361646350630772
  0.12000000000001639
  0.12000000105376678
  0.2059137972911278
  0.7548998541809302
  0.12000000000002958
  0.06862449169088886
  0.16844407984663684
  0.2780475138256293
  0.2808693037722956
  0.2142173808334343
  0.19325959897661246
  0.9326663872843058
  0.6953793932131229
  0.7284879812397554
  0.7375172104690145
  0.6555941144733998

In [24]:
estimated_v_rel = sol_1[3K+1:3K+k-1]
estimated_τ = sol_1[3K+k]

0.6555941144733998

In [25]:
println("estimated value ratios:")
display(estimated_v_rel)
println()
estimated_v_base = 1
for i in 1:k
    println("v", i, ": ", estimated_v_base)
    if i != k
        estimated_v_base = estimated_v_rel[i] * estimated_v_base
    end
end
println()

println("estimated market balance ratio (m / (n-1)): ", estimated_τ)

#estimated_n_val = (estimated_m_val / estimated_τ) + 1
#println("estimated total departments: ", estimated_n_val)
estimated_α = estimated_τ * estimated_γ
println("estimated α_t: ", estimated_α)

estimated value ratios:


3-element Vector{Float64}:
 0.6953793932131229
 0.7284879812397554
 0.7375172104690145


v1: 1
v2: 0.6953793932131229
v3: 0.506575530357554
v4: 0.3736081720411648

estimated market balance ratio (m / (n-1)): 0.6555941144733998
estimated α_t: [0.13368820582954202, 0.1824161517231176, 0.22487119006087755, 0.1146185668598626]


In [26]:
estimated_μ = sol_1[1:K]
estimated_σ = sol_1[K+1:2K]
est_ρ_vec = sol_1[2K+1:3K]
estimated_ρ = est_ρ_vec / sum(est_ρ_vec)

7-element Vector{Float64}:
 0.03212563451091255
 0.07885483464205365
 0.13016421084858856
 0.13148519392998606
 0.10028299099887637
 0.09047188677788753
 0.43661524829169535

In [27]:
function estimate_likelihood_2_for_variance(p_vec, γ, placements, k, K, show_m = false)   
    μ = p_vec[1:K]
    σ = p_vec[K+1:2K]
    ρ = p_vec[2K+1:3K]

    v_rel = p_vec[3K+1:3K+k-1]
    τ = p_vec[3K+k]
    α = γ * τ

    ## compute the cutoffs x and the CDF values F(x)
    normals = [truncated(Normal(μ[i], σ[i]), 0, 1) for i in 1:K]

    Fx_vec = ones(k) # sets F(x0) = 1 by default; Fx_vec = [F(x0)=1, F(x1), F(x2), F(x3), ..., F(xk-1)]
    x_vec = ones(k+1) # x_vec = [x0 = 1, x1, x2, x3, ..., xk = 0]
    x_vec[k+1] = 0.0
    for t in 1:k-1
        Fx_vec_candidate = Fx(t, α, v_rel)
        if Fx_vec_candidate <= 0.0 # TODO: if this case occurs, can we speed up q()?
            Fx_vec[t+1:k] .= 0.0
            x_vec[t+1:k] .= 0.0
            break
        end
        Fx_vec[t+1] = Fx_vec_candidate
        # there is no simple closed-form for F^{-1}(x) so this numerically computes x1, x2, x3
        x_vec[t+1] = find_zero(x -> F(x, ρ, normals, K) - Fx_vec[t+1], 0.5) 
    end 

    ρ_q_it = zeros(K, k)
    all_integrals = get_integrals(x_vec, ρ, normals, α, k, K)
    for i in 1:K, t in 1:k
        prob = q(i, t, all_integrals, Fx_vec, α, v_rel, k)
        ρ_q_it[i, t] = ρ[i] * prob
    end

    normalizer = sum(ρ_q_it)
    if show_m
        println("n: ", sum(placements) / normalizer)
        println("m: ", τ * ((sum(placements) / normalizer) - 1))
    end
    
    likelihood_probability_vector = zeros(k * K)
    total_placements_vector = zeros(k * K)
    vector_counter = 1
    for i in 1:K
        for t in 1:k
            likelihood_probability_vector[vector_counter] = (ρ_q_it[i, t] / normalizer)
            total_placements_vector[vector_counter] = placements[i, t]
            vector_counter += 1
        end
    end

    actual_likelihood = loglikelihood(Multinomial(M, likelihood_probability_vector), total_placements_vector)
    return actual_likelihood
end

estimate_likelihood_2_for_variance (generic function with 2 methods)

In [28]:
sol_1_for_variance = zeros(3K + k)
sol_1_for_variance[1:K] = estimated_μ
sol_1_for_variance[K+1:2K] = estimated_σ
sol_1_for_variance[2K+1:3K] = estimated_ρ
sol_1_for_variance[3K+1:3K+k-1] = estimated_v_rel
sol_1_for_variance[3K+k] = estimated_τ
sol_1_for_variance

25-element Vector{Float64}:
  3.9999999999751794
  0.30218723471728925
  0.29002218954441433
 -0.9950554866470221
  0.16036886184425492
 -1.6864828563188423
  0.037579602143257444
  2.285704998718138
  0.24361646350630772
  0.12000000000001639
  0.12000000105376678
  0.2059137972911278
  0.7548998541809302
  0.12000000000002958
  0.03212563451091255
  0.07885483464205365
  0.13016421084858856
  0.13148519392998606
  0.10028299099887637
  0.09047188677788753
  0.43661524829169535
  0.6953793932131229
  0.7284879812397554
  0.7375172104690145
  0.6555941144733998

In [29]:
estimate_likelihood_2_for_variance(sol_1_for_variance, estimated_γ, res.placements, k, K, true)

n: 21509.765105789513
m: 14101.019812946437


-112.72556205940418

In [30]:
likelihood_hessian = FiniteDiff.finite_difference_hessian(p -> estimate_likelihood_2_for_variance(p, estimated_γ, res.placements, k, K), sol_1_for_variance)

25×25 LinearAlgebra.Symmetric{Float64, Matrix{Float64}}:
   -0.782598      10.4674     …     -0.270985        8.87827
   10.4674     -4824.19            216.911         495.882
   13.7087      3250.4            2782.43        -2223.76
 3727.88      424023.0               3.63545e6      -1.22469e7
    3.57472      855.237             8.12067       678.276
    0.684861     108.63       …    -34.5997        154.763
   -0.290091     444.052         -2055.14         -537.546
    2.37516      -31.4819            0.84042       -27.5887
   16.1034     -3884.85            -84.7104       -194.872
    9.65041     2100.84           -697.796       -1327.13
 3719.45           4.22908e5  …      3.62883e6      -1.22006e7
    8.14189     1641.48             90.6688        666.83
    3.94291      613.475          -180.242         833.56
   -0.641809    1039.78          -5249.46         -218.125
 -180.633        758.104          1057.62         3748.03
    1.21431    -9981.64       …   1046.47        -47

In [31]:
estimator_variance = inv((-1 * likelihood_hessian))

25×25 LinearAlgebra.Symmetric{Float64, Matrix{Float64}}:
 144.62          0.29512       0.246443     …   0.0212805     0.0172418
   0.29512       0.0192983     0.0133937        0.00206248    0.00199769
   0.246443      0.0133937     0.00981409       0.00141111    0.00151692
   5.42319e-8   -1.19673e-8   -1.24896e-8       1.65184e-8    2.85302e-8
   0.223896      0.0209974     0.0148077        0.0022435     0.00232819
   1.03611       0.86904       0.586765     …   0.0777463     0.0794091
  -0.0409561     0.00787579    0.00606912       0.00104414    0.0100787
  41.8226       -0.106938     -0.0731462       -0.0129025    -0.0142849
   0.134291     -0.00157589   -0.000270054     -0.000365082  -0.000312076
   0.0823872     0.00103891    0.000905471     -7.11175e-5   -7.6622e-5
   4.85613e-8   -1.24687e-8   -1.28851e-8   …   1.66248e-8    2.86376e-8
   0.109971     -0.00139949   -0.00061641      -0.000270965  -0.000372995
   0.24961      -0.138571     -0.0923059       -0.0123011    -0.012828

In [32]:
println([estimator_variance[i, i] for i in 1:3K+k]) # variance estimates

[144.61985953992237, 0.01929831068796792, 0.009814087572233801, 7.1248474422429995e-12, 0.027456533271218334, 73.27226857622561, 0.06312834427449932, 14.746111140032031, 0.002824511275983939, 0.0007141672255513622, 7.123026268521696e-12, 0.0028061498115818403, 2.1591657317399746, 0.01002331746031883, 3.654720111913517e-6, 1.2555139432970545e-5, 1.7740406218748176e-5, 9.777207391558121e-6, 3.0950688396778336e-5, 3.347318207323866e-5, 0.0003512128770310934, 0.001088399176626173, 0.0012869821223731128, 0.0011367578748721705, 0.0021110111772602346]


In [33]:
println([sqrt(estimator_variance[i, i]) for i in 1:3K+k]) # standard deviations

[12.02579974637539, 0.1389183597944056, 0.09906607679843692, 2.6692409861687273e-6, 0.16570013057091518, 8.559922229566435, 0.2512535457948789, 3.8400665541149195, 0.05314613133600544, 0.026723907378064352, 2.66889982362053e-6, 0.05297310460584541, 1.4694099944331311, 0.10011651941772062, 0.001911732228088839, 0.003543323218811762, 0.004211936160336262, 0.003126852633489164, 0.005563334287707178, 0.0057856012715394286, 0.018740674401714934, 0.03299089535957114, 0.03587453306139486, 0.03371584011814285, 0.04594574166623317]


In [34]:
println("standard deviations:")
println("      μ: ", [sqrt(estimator_variance[i, i]) for i in 1:K])
println("      σ: ", [sqrt(estimator_variance[i, i]) for i in K+1:2K])
println("      ρ: ", [sqrt(estimator_variance[i, i]) for i in 2K+1:3K])
println("  v_rel: ", [sqrt(estimator_variance[i, i]) for i in 3K+1:3K+k-1])
println("      τ: ", sqrt(estimator_variance[3K+k, 3K+k]))

standard deviations:
      μ: [12.02579974637539, 0.1389183597944056, 0.09906607679843692, 2.6692409861687273e-6, 0.16570013057091518, 8.559922229566435, 0.2512535457948789]
      σ: [3.8400665541149195, 0.05314613133600544, 0.026723907378064352, 2.66889982362053e-6, 0.05297310460584541, 1.4694099944331311, 0.10011651941772062]
      ρ: [0.001911732228088839, 0.003543323218811762, 0.004211936160336262, 0.003126852633489164, 0.005563334287707178, 0.0057856012715394286, 0.018740674401714934]
  v_rel: [0.03299089535957114, 0.03587453306139486, 0.03371584011814285]
      τ: 0.04594574166623317
