# Structural Estimation of Market Parameters

### Sequential Estimator, Truncated Normal

### Bootstrap Resampling

In [1]:
using Pkg
Pkg.activate(".")
#for pkg in ["BlackBoxOptim", "Cubature", "Dates", "Distributions", "Integrals", "Roots", "JLD", "StatProfilerHTML"] # 
#    Pkg.add(pkg)
#end

#Pkg.instantiate()

using Base.Threads, BlackBoxOptim, Cubature, Dates, Distributions, Integrals, Random, Roots, JLD

[32m[1m  Activating[22m[39m project at `C:\Users\jbrig\Documents\research\mapinator_2024`


## 1. Load placements

In [2]:
placement_matrices = load("placement_matrices.jld")

Dict{String, Any} with 4 entries:
  "successful_tier_1_range" => 2018:2023
  "placement_matrix"        => [761 115 … 8 0; 879 569 … 48 4; … ; 165 244 … 20…
  "successful_tier_1"       => 2098
  "unmatched"               => [300, 544, 667, 364, 273]

In [3]:
placement_rates_raw = placement_matrices["placement_matrix"]

10×5 Matrix{Int64}:
 761  115   41    8    0
 879  569  148   48    4
 839  993  443   60   14
 176  306  217  140   11
   3   14   16   15   46
 582  596  301   79   19
 700  438  244   68   24
 218  187  114   39    9
 165  244  315  201  104
 676  967  815  309  123

In [4]:
unmatched_raw = placement_matrices["unmatched"]

5-element Vector{Int64}:
 300
 544
 667
 364
 273

In [5]:
k = size(placement_rates_raw)[2]
K = size(placement_rates_raw)[1]

10

## 2. Rescale placements (successes and failures) using registrations

In [6]:
# conduct adjustment using the relative m_t values
adjustment_matrix = load("adjustment_matrix.jld")
m_t_values = adjustment_matrix["data"][:, 1]

5-element Vector{Any}:
 2664
 3635
 4481
 2284
 1347

In [7]:
m_t_placements = zeros(Int, k)
for t in 1:k
    m_t_placements[t] = sum(placement_rates_raw[:, t]) + unmatched_raw[t]
end
m_t_placements

5-element Vector{Int64}:
 5299
 4973
 3321
 1331
  627

In [8]:
placement_rates_rescaled = zeros(Int, K, k)
unmatched_rescaled = zeros(Int, k)

for i in 1:K
    for t in 1:k
        placement_rates_rescaled[i, t] = round(m_t_values[t] * (placement_rates_raw[i, t] / m_t_placements[t]))
    end
end

for t in 1:k
    unmatched_rescaled[t] = round(m_t_values[t] * (unmatched_raw[t] / m_t_placements[t]))
end

placement_rates_rescaled

10×5 Matrix{Int64}:
 383   84    55   14    0
 442  416   200   82    9
 422  726   598  103   30
  88  224   293  240   24
   2   10    22   26   99
 293  436   406  136   41
 352  320   329  117   52
 110  137   154   67   19
  83  178   425  345  223
 340  707  1100  530  264

In [9]:
unmatched_rescaled

5-element Vector{Int64}:
 151
 398
 900
 625
 586

## 3. Collapse sinks and delete graduating tier 5 (tier 5 becomes part of the sinks)

In [10]:
println("original dimensions: ", K, " by ", k)

original dimensions: 10 by 5


In [11]:
k -= 1
K = k + 3

7

In [12]:
println("new dimensions: ", K, " by ", k)

new dimensions: 7 by 4


In [13]:
placement_rates = zeros(Int, K, k)
for i in 1:4 # NOTE: hardcoded
    for t in 1:k
        placement_rates[i, t] = placement_rates_rescaled[i, t]
    end
end

for i in 5:6 # NOTE: hardcoded
    for t in 1:k
        placement_rates[i, t] = placement_rates_rescaled[i+1, t]
    end
end

for t in 1:k
    # NOTE: hardcoded
    for original_sink_tier in [5, 8, 9, 10]
        placement_rates[K, t] += placement_rates_rescaled[original_sink_tier, t] 
    end
end

unmatched = zeros(Int, k)
for t in 1:k
    unmatched[t] = unmatched_rescaled[t]
end

placement_rates

7×4 Matrix{Int64}:
 383    84    55   14
 442   416   200   82
 422   726   598  103
  88   224   293  240
 293   436   406  136
 352   320   329  117
 535  1032  1701  968

In [14]:
unmatched

4-element Vector{Int64}:
 151
 398
 900
 625

## 3a. Latex version of the new table

In [15]:
presentation_table = zeros(Int, K+1, k)
for i in 1:K, t in 1:k
    presentation_table[i, t] = placement_rates[i, t]
end

for t in 1:k
    presentation_table[K+1, t] = unmatched[t]
end

In [16]:
include("type_allocation_base_by_applicant.jl")

Main.SBM

In [17]:
presentation_table

8×4 Matrix{Int64}:
 383    84    55   14
 442   416   200   82
 422   726   598  103
  88   224   293  240
 293   436   406  136
 352   320   329  117
 535  1032  1701  968
 151   398   900  625

In [18]:
sink_names_to_present = [
    "Public Sector",
    "Private Sector",
    "Other",
    "Unmatched"
]
SBM.nice_table(presentation_table, k, (K - k) + 1, sink_names_to_present)

┌────────────────┬────────┬────────┬────────┬────────┬────────────┐
│[1m                [0m│[1m Tier 1 [0m│[1m Tier 2 [0m│[1m Tier 3 [0m│[1m Tier 4 [0m│[1m Row Totals [0m│
├────────────────┼────────┼────────┼────────┼────────┼────────────┤
│         Tier 1 │    383 │     84 │     55 │     14 │        536 │
│         Tier 2 │    442 │    416 │    200 │     82 │       1140 │
│         Tier 3 │    422 │    726 │    598 │    103 │       1849 │
│         Tier 4 │     88 │    224 │    293 │    240 │        845 │
│  Public Sector │    293 │    436 │    406 │    136 │       1271 │
│ Private Sector │    352 │    320 │    329 │    117 │       1118 │
│          Other │    535 │   1032 │   1701 │    968 │       4236 │
│      Unmatched │    151 │    398 │    900 │    625 │       2074 │
│  Column Totals │   2666 │   3636 │   4482 │   2285 │      13069 │
└────────────────┴────────┴────────┴────────┴────────┴────────────┘
\begin{tabular}{rrrrrr}
  \hline
   & \textbf{Tier 1} & \textbf{Tier

## 4. Functions for bootstrap

In [19]:
res = (; placements = placement_rates) 
estimated_m_val = sum(m_t_values[1:k])
estimated_γ = m_t_values[1:k] / estimated_m_val

4-element Vector{Float64}:
 0.2039191671769749
 0.27824556031843234
 0.3430036742192284
 0.17483159828536435

In [20]:
function Fx(t, α, v_rel)
    return 1 - sum([-log(v_rel[j])*sum(α[1:j]) for j in 1:t])
end

function ρ_q(t, α, v_rel, k)
    Fx_km1 = Fx(k-1, α, v_rel)
    if Fx_km1 <= 0
        return -1
    end
    last_term = prod([v_rel[j] for j in t:(k-1)]) * (1 - exp((0 - Fx_km1) / sum(α[1:k])))
    return α[t] * (sum([prod([v_rel[j] for j in t:(s-1)]) * (1 - v_rel[s]) for s in t:(k-1)]) + last_term)
end

ρ_q (generic function with 1 method)

In [21]:
function estimate_parameters_stage_1(p_vec, placements, γ, m_val, k)
    v_rel = p_vec[1:k-1]
    τ = p_vec[k]
    α = τ * γ
    n_val = (m_val / τ) + 1
    objective = 0.0
    ρ_q_t = zeros(k)
    for t in 1:k
        prob = ρ_q(t, α, v_rel, k)
        if prob < 0
            return Inf
        end
        ρ_q_t[t] = prob
    end

    for t in 1:k
        prob = ρ_q_t[t]
        expectation = n_val * prob
        objective += ((sum(placements[:, t]) - expectation) ^ 2) / expectation
    end
    
    return objective
end

estimate_parameters_stage_1 (generic function with 1 method)

In [22]:
function F(x, ρ, normals, K)
    sum_base = 0.0
    for i in 1:K
        sum_base += ρ[i] * cdf(normals[i], x)
    end
    return sum_base
end

function f_integrand(integrals, x, p) 
    base_exp = exp(F(x, p.ρ, p.normals, p.K) / sum(p.α[1:p.s]))
    for i in 1:p.K
        integrals[i] = base_exp * pdf(p.normals[i], x)
    end
end

function get_integrals(x_vec, ρ, normals, α, k, K)
    # Fx_vec = [F(x0)=1, F(x1), F(x2), F(x3), ..., F(xk-1)]
    # x_vec = [x0 = 1, x1, x2, x3, ..., xk = 0]
    # x_vec[s] = x_{s-1}, so the limits of integration are and must be offset by 1 below
    all_integrals = zeros(K, k)
    for s in 1:k
        # https://docs.sciml.ai/Integrals/stable/basics/SampledIntegralProblem/ might be faster
        # if f_integrand(x) can be vectorized/sped up
        prob = IntegralProblem(IntegralFunction(f_integrand, zeros(K)), (x_vec[s+1], x_vec[s]), (; s, ρ, normals, α, K))
        sol = solve(prob, CubatureJLh(); reltol = 1e-3, abstol = 1e-3)
        integrals_result = sol.u
        # NOTE: result may be inf if alpha_1 is too small
        # NOTE: some parameter values for μ and σ may cause the cdf F_i to be NaN
        for i in 1:K 
            all_integrals[i, s] = integrals_result[i]
        end
    end
    return all_integrals
end      

function q(i, t, all_integrals, Fx_vec, α, v_rel, k)
    return α[t] * sum([(1/sum(α[1:s])) * prod([v_rel[j] for j in t:(s-1)]) * exp(-Fx_vec[s] / sum(α[1:s])) * all_integrals[i, s] for s in t:k])
end

q (generic function with 1 method)

In [23]:
function estimate_likelihood_2(p_vec, v_rel, α, placements, k, K, counter)   
    μ = p_vec[1:K]
    σ = p_vec[K+1:2K]
    ρ_vec = p_vec[2K+1:3K]
    ρ = ρ_vec / sum(ρ_vec)

    ## compute the cutoffs x and the CDF values F(x)
    normals = [truncated(Normal(μ[i], σ[i]), 0, 1) for i in 1:K]

    Fx_vec = ones(k) # sets F(x0) = 1 by default; Fx_vec = [F(x0)=1, F(x1), F(x2), F(x3), ..., F(xk-1)]
    x_vec = ones(k+1) # x_vec = [x0 = 1, x1, x2, x3, ..., xk = 0]
    x_vec[k+1] = 0.0
    for t in 1:k-1
        Fx_vec_candidate = Fx(t, α, v_rel)
        if Fx_vec_candidate <= 0.0 # TODO: if this case occurs, can we speed up q()?
            Fx_vec[t+1:k] .= 0.0
            x_vec[t+1:k] .= 0.0
            break
        end
        Fx_vec[t+1] = Fx_vec_candidate
        # there is no simple closed-form for F^{-1}(x) so this numerically computes x1, x2, x3
        x_vec[t+1] = find_zero(x -> F(x, ρ, normals, K) - Fx_vec[t+1], 0.5) 
    end 

    ρ_q_it = zeros(K, k)
    all_integrals = get_integrals(x_vec, ρ, normals, α, k, K)
    for i in 1:K, t in 1:k
        prob = q(i, t, all_integrals, Fx_vec, α, v_rel, k)
        ρ_q_it[i, t] = ρ[i] * prob
    end

    normalizer = sum(ρ_q_it)
    likelihood = 0.0
    for i in 1:K, t in 1:k
        likelihood += placements[i, t] * log(ρ_q_it[i, t] / normalizer)
    end

    counter[1] += 1
    if isnan(-likelihood) || isinf(-likelihood)
        println(all_integrals)
    end
    return -likelihood
end

estimate_likelihood_2 (generic function with 1 method)

## 5. Bootstrap

In [24]:
placements_list = []
for i in 1:K
    for t in 1:k
        for placement in 1:(res.placements[i, t])
            push!(placements_list, (i, t))
        end
    end
end

placements_list

10995-element Vector{Any}:
 (1, 1)
 (1, 1)
 (1, 1)
 (1, 1)
 (1, 1)
 (1, 1)
 (1, 1)
 (1, 1)
 (1, 1)
 (1, 1)
 (1, 1)
 (1, 1)
 (1, 1)
 ⋮
 (7, 4)
 (7, 4)
 (7, 4)
 (7, 4)
 (7, 4)
 (7, 4)
 (7, 4)
 (7, 4)
 (7, 4)
 (7, 4)
 (7, 4)
 (7, 4)

In [25]:
NUM_BOOTSTRAP_ROUNDS = 1000
NUM_BOOTSTRAP_SAMPLES = length(placements_list)
NUM_ESTIMATIONS_PER_ROUND = 5

# upper/lower bound on v_rel
upper_1 = [1.0 for _ in 1:k-1]
lower_1 = [0.0 for _ in 1:k-1]

# upper/lower bound on τ
append!(upper_1, [10.0 for _ in 1:1])
append!(lower_1, [0.25 for _ in 1:1])

search_range_1 = [(lower_1[i], upper_1[i]) for i in eachindex(upper_1)]

# upper/lower bound on the mu parameter of truncated normal
upper_2 = [4.0 for _ in 1:K]
lower_2 = [-4.0 for _ in 1:K]

# upper/lower bound on the sigma parameter of truncated normal
append!(upper_2, [10.0 for _ in 1:K])
append!(lower_2, [0.12 for _ in 1:K])

# upper/lower bound on values proportional to ρ_i
append!(upper_2, [1.0 for _ in 1:K])
append!(lower_2, [0.0 for _ in 1:K])

search_range_2 = [(lower_2[i], upper_2[i]) for i in eachindex(upper_2)]

Random.seed!(0)

bootstrap_placements_output = zeros(Int, NUM_BOOTSTRAP_ROUNDS, K, k)
bootstrap_stage_1_estimates_output = zeros(NUM_BOOTSTRAP_ROUNDS, length(upper_1))
bootstrap_stage_2_estimates_output = zeros(NUM_BOOTSTRAP_ROUNDS, length(upper_2))
bootstrap_stage_1_fitness_output = zeros(NUM_BOOTSTRAP_ROUNDS)
bootstrap_stage_2_fitness_output = zeros(NUM_BOOTSTRAP_ROUNDS)
bootstrap_counter_output = zeros(Int, NUM_BOOTSTRAP_ROUNDS)

max_evals_stage_1 = 1000000
max_evals_stage_2 = 100000

Threads.@threads for bootstrap_round in 1:NUM_BOOTSTRAP_ROUNDS
    println("Starting round $bootstrap_round")
    start_time = Dates.datetime2unix(Dates.now())

    bootstrap_placements_list = rand(placements_list, NUM_BOOTSTRAP_SAMPLES)
    bootstrap_placements = zeros(Int, K, k)
    
    for placement in bootstrap_placements_list
        bootstrap_placements[placement[1], placement[2]] += 1
    end
    
    bootstrap_placements_output[bootstrap_round, :, :] = bootstrap_placements[:, :]
    
    sol_res_1 = bboptimize(p -> estimate_parameters_stage_1(p, bootstrap_placements, estimated_γ, estimated_m_val, k), SearchRange = search_range_1, MaxFuncEvals = max_evals_stage_1, TraceMode = :silent) 
    # MaxTime = 60.0, MaxFuncEvals = 500000,
    sol_1 = best_candidate(sol_res_1)
    sol_1_fitness = best_fitness(sol_res_1)

    bootstrap_v_rel = sol_1[1:k-1]
    bootstrap_τ = sol_1[k]
    bootstrap_α = bootstrap_τ * estimated_γ

    bootstrap_stage_1_estimates_output[bootstrap_round, :] = sol_1[:]
    bootstrap_stage_1_fitness_output[bootstrap_round] = sol_1_fitness

    println("  round $bootstrap_round s1 best estimate: fitness $sol_1_fitness")

    best_fitness_val = Inf
    best_counter = 0
    best_sol = nothing
    stage_2_start_time = Dates.datetime2unix(Dates.now())
    for bootstrap_round_estimation in 1:NUM_ESTIMATIONS_PER_ROUND
        counter = [0]
        sol_res_2 = bboptimize(p -> estimate_likelihood_2(p, bootstrap_v_rel, bootstrap_α, bootstrap_placements, k, K, counter), SearchRange = search_range_2, MaxFuncEvals = max_evals_stage_2, TraceMode = :silent) 
        # MaxTime = 60.0, MaxFuncEvals = 500000,
        sol_2 = best_candidate(sol_res_2)
        sol_2_fitness = best_fitness(sol_res_2)
        if sol_2_fitness < best_fitness_val
            time_so_far = Dates.datetime2unix(Dates.now())
            println("    improvement in round $bootstrap_round est. $bootstrap_round_estimation: $sol_2_fitness (counter = $(counter[1])) < $best_fitness_val (counter = $best_counter); ($(time_so_far - start_time)s total)")
            best_fitness_val = sol_2_fitness
            best_counter = counter[1]
            best_sol = sol_2
        end
    end
    stage_2_end_time = Dates.datetime2unix(Dates.now())
    bootstrap_stage_2_estimates_output[bootstrap_round, :] = best_sol[:]
    bootstrap_stage_2_fitness_output[bootstrap_round] = best_fitness_val
    bootstrap_counter_output[bootstrap_round] = best_counter
    end_time = Dates.datetime2unix(Dates.now())
    println("round $bootstrap_round s2 best estimate: fitness $best_fitness_val after $best_counter iterations (runtime: $(end_time - start_time)s, avg s2 runtime: $((stage_2_end_time - stage_2_start_time)/NUM_ESTIMATIONS_PER_ROUND)s)")
end
save("estimated_parameters_truncated_normal_bootstrap.jld", "base_placements", res.placements, "estimated_m_val", estimated_m_val, "estimated_γ", estimated_γ, "base_m_t_values", m_t_values[1:k], "placements", bootstrap_placements_output, "stage_1_estimates", bootstrap_stage_1_estimates_output, "stage_2_estimates", bootstrap_stage_2_estimates_output, "stage_1_fitness", bootstrap_stage_1_fitness_output, "stage_2_fitness", bootstrap_stage_2_fitness_output, "counter", bootstrap_counter_output, "k", k, "K", K, "max_evals_stage_1", max_evals_stage_1, "max_evals_stage_2", max_evals_stage_2, "bootstrap_rounds", NUM_BOOTSTRAP_ROUNDS, "estimations_per_round", NUM_ESTIMATIONS_PER_ROUND, "bootstrap_samples", NUM_BOOTSTRAP_SAMPLES);

Starting round 897
Starting round 213
Starting round 319
Starting round 425
Starting round 689
Starting round 741
Starting round 637
Starting round 372
Starting round 949
Starting round 1
Starting round 54
Starting round 793
Starting round 266
Starting round 584
Starting round 107
Starting round 531
Starting round 845
Starting round 160
Starting round 478
  round 949 s1 best estimate: fitness 0.0
  round 793 s1 best estimate: fitness 0.0
  round 1 s1 best estimate: fitness 0.0
  round 107 s1 best estimate: fitness 0.0
  round 584 s1 best estimate: fitness 0.0
  round 372 s1 best estimate: fitness 0.0
  round 425 s1 best estimate: fitness 0.0
  round 845 s1 best estimate: fitness 2.995294802118437e-29
  round 319 s1 best estimate: fitness 0.0
  round 689 s1 best estimate: fitness 0.0
  round 478 s1 best estimate: fitness 0.0
  round 637 s1 best estimate: fitness 0.0
  round 531 s1 best estimate: fitness 6.398364886703493e-29
  round 213 s1 best estimate: fitness 0.0
  round 897 s1 best 