# Structural Estimation of Market Parameters

### Sequential Estimator, Truncated Normal

### Test of Tier Order Consistency

In [1]:
using Pkg
Pkg.activate(".")
#for pkg in ["BlackBoxOptim", "Combinatorics", "Cubature", "Distributions", "Integrals", "Roots", "JLD", "StatProfilerHTML"] # 
#    Pkg.add(pkg)
#end

#Pkg.instantiate()

using Base.Threads, BlackBoxOptim, Combinatorics, Cubature, Distributions, Integrals, Random, Roots, JLD

[32m[1m  Activating[22m[39m project at `C:\Users\jbrig\Documents\research\mapinator_2024`


In [2]:
# Load the raw placement counts (K rows by k columns) and the per-column unmatched counts.
placement_matrices = load("placement_matrices.jld")
placement_rates_raw = placement_matrices["placement_matrix"]
unmatched_raw = placement_matrices["unmatched"]
K, k = size(placement_rates_raw)

# Column totals: every placement in the column plus that column's unmatched count.
m_t_placements = Int[sum(view(placement_rates_raw, :, t)) + unmatched_raw[t] for t in 1:k]

# conduct adjustment using the relative m_t values
adjustment_matrix = load("adjustment_matrix.jld")
m_t_values = adjustment_matrix["data"][:, 1]

# Rescale each cell so that every column is expressed relative to m_t_values[t],
# then round to whole counts.
placement_rates_rescaled = Int[round(m_t_values[t] * (placement_rates_raw[i, t] / m_t_placements[t])) for i in 1:K, t in 1:k]

println("original dimensions: ", K, " by ", k)
# Drop the last column and collapse the rows down to k + 3 categories.
k -= 1
K = k + 3
println("new dimensions: ", K, " by ", k)

placement_rates = zeros(Int, K, k)

# NOTE: hardcoded -- rows 1:4 are carried over unchanged
placement_rates[1:4, :] .= view(placement_rates_rescaled, 1:4, 1:k)

# NOTE: hardcoded -- original rows 6:7 become rows 5:6
placement_rates[5:6, :] .= view(placement_rates_rescaled, 6:7, 1:k)

# NOTE: hardcoded -- original rows 5, 8, 9 and 10 are pooled into the last row
placement_rates[K, :] .+= vec(sum(view(placement_rates_rescaled, [5, 8, 9, 10], 1:k); dims = 1))

estimated_m_val = sum(view(m_t_values, 1:k))

placement_rates

original dimensions: 10 by 5
new dimensions: 7 by 4


7×4 Matrix{Int64}:
 383    84    55   14
 442   416   200   82
 422   726   598  103
  88   224   293  240
 293   436   406  136
 352   320   329  117
 535  1032  1701  968

In [3]:
"""
    Fx(t, α, v_rel)

Return `1 - Σ_{j=1}^{t} (-log(v_rel[j])) * (α[1] + ... + α[j])`.

Only the first `t` entries of `α` and `v_rel` are read.
"""
function Fx(t, α, v_rel)
    weighted_logs = map(1:t) do j
        cumulative_α = sum(view(α, 1:j))
        -log(v_rel[j]) * cumulative_α
    end
    return 1 - sum(weighted_logs)
end

"""
    ρ_q(t, α, v_rel, k)

Return

    α[t] * ( Σ_{s=t}^{k-1} (Π_{j=t}^{s-1} v_rel[j]) * (1 - v_rel[s])
             + (Π_{j=t}^{k-1} v_rel[j]) * (1 - exp(-Fx(k-1, α, v_rel) / Σ_{j=1}^{k} α[j])) )

# Arguments
- `t`: index of the term to evaluate, `1 <= t <= k`.
- `α`: vector with at least `k` entries.
- `v_rel`: vector with at least `k - 1` entries.
- `k`: number of columns/tiers.

# Returns
The value above, or a negative sentinel (floating-point `-1`) when
`Fx(k - 1, α, v_rel) <= 0`. Callers detect the sentinel with `< 0`.
"""
function ρ_q(t, α, v_rel, k)
    Fx_km1 = Fx(k - 1, α, v_rel)
    if Fx_km1 <= 0
        # Use a float sentinel so the return type does not switch between Int and
        # floating point depending on the argument values; it still compares `< 0`
        # and `== -1` exactly like the integer -1 did.
        return -one(Fx_km1)
    end

    # `reach` is the running product v_rel[t] * ... * v_rel[s-1] (1 when s == t),
    # updated incrementally instead of being rebuilt from a temporary array per s.
    reach = one(Fx_km1)
    interior = zero(Fx_km1)
    for s in t:(k - 1)
        interior += reach * (1 - v_rel[s])
        reach *= v_rel[s]
    end

    # After the loop `reach` equals the product over j = t:(k-1).
    last_term = reach * (1 - exp(-Fx_km1 / sum(view(α, 1:k))))
    return α[t] * (interior + last_term)
end

ρ_q (generic function with 1 method)

In [4]:
"""
    F(x, ρ, normals, K)

Evaluate the mixture CDF `Σ_{i=1}^{K} ρ[i] * cdf(normals[i], x)` at `x`.

The sum is accumulated left to right starting from `0.0`.
"""
function F(x, ρ, normals, K)
    return mapfoldl(+, 1:K; init = 0.0) do i
        ρ[i] * cdf(normals[i], x)
    end
end

"""
    f_integrand(integrals, x, p)

In-place integrand: for every `i in 1:p.K` write
`exp(F(x) / (p.α[1] + ... + p.α[p.s])) * pdf(p.normals[i], x)` into `integrals[i]`.

`p` must provide the fields `s`, `ρ`, `normals`, `α` and `K`.
"""
function f_integrand(integrals, x, p)
    α_cum = sum(view(p.α, 1:p.s))
    # The exponential factor does not depend on i, so evaluate it once per x.
    weight = exp(F(x, p.ρ, p.normals, p.K) / α_cum)
    for i in Base.OneTo(p.K)
        integrals[i] = weight * pdf(p.normals[i], x)
    end
end

"""
    get_integrals(x_vec, ρ, normals, α, k, K)

Numerically integrate `f_integrand` over each interval `(x_vec[s+1], x_vec[s])`,
`s = 1:k`, and return the results as a `K`-by-`k` matrix whose column `s` holds the
`K` integrals for that interval.
"""
function get_integrals(x_vec, ρ, normals, α, k, K)
    # x_vec = [x0 = 1, x1, x2, x3, ..., xk = 0]
    # x_vec[s] = x_{s-1}, so interval s runs from x_vec[s+1] (lower) up to x_vec[s] (upper)
    all_integrals = zeros(K, k)
    for s in 1:k
        lower, upper = x_vec[s+1], x_vec[s]
        params = (; s, ρ, normals, α, K)
        # https://docs.sciml.ai/Integrals/stable/basics/SampledIntegralProblem/ might be faster
        # if f_integrand(x) can be vectorized/sped up
        integrand = IntegralFunction(f_integrand, zeros(K))
        prob = IntegralProblem(integrand, (lower, upper), params)
        sol = solve(prob, CubatureJLh(); reltol = 1e-3, abstol = 1e-3)
        # NOTE: result may be inf if alpha_1 is too small
        # NOTE: some parameter values for μ and σ may cause the cdf F_i to be NaN
        all_integrals[:, s] .= sol.u[1:K]
    end
    return all_integrals
end

"""
    q(i, t, all_integrals, Fx_vec, α, v_rel, k)

Return

    α[t] * Σ_{s=t}^{k} (1 / A_s) * (Π_{j=t}^{s-1} v_rel[j]) * exp(-Fx_vec[s] / A_s) * all_integrals[i, s]

where `A_s = α[1] + ... + α[s]`. The empty product (when `s == t`) is 1.
"""
function q(i, t, all_integrals, Fx_vec, α, v_rel, k)
    tier_terms = map(t:k) do s
        α_cum = sum(view(α, 1:s))
        reach = prod(view(v_rel, t:(s - 1)))
        (1 / α_cum) * reach * exp(-Fx_vec[s] / α_cum) * all_integrals[i, s]
    end
    return α[t] * sum(tier_terms)
end

q (generic function with 1 method)

In [5]:
"""
    estimate_parameters_stage_1(p_vec, placements, γ, m_val, k, counter)

Stage-1 objective. Unpacks `p_vec` as `v_rel = p_vec[1:k-1]` and `τ = p_vec[k]`, sets
`α = τ * γ` and `n_val = m_val / τ + 1`, and returns

    Σ_{t=1}^{k} (observed_t - expected_t)^2 / expected_t

with `observed_t = sum(placements[:, t])` and `expected_t = n_val * ρ_q(t, α, v_rel, k)`.

Returns `Inf` as soon as `ρ_q` reports a negative value. `counter[1]` is incremented
once per call, including calls that return `Inf`.
"""
function estimate_parameters_stage_1(p_vec, placements, γ, m_val, k, counter)
    v_rel = p_vec[1:k-1]
    τ = p_vec[k]
    α = τ * γ
    n_val = (m_val / τ) + 1

    counter[1] += 1

    # Collect all k probabilities first; a negative one marks the candidate as infeasible.
    ρ_q_t = zeros(k)
    for t in 1:k
        prob = ρ_q(t, α, v_rel, k)
        prob < 0 && return Inf
        ρ_q_t[t] = prob
    end

    objective = 0.0
    for (t, prob) in enumerate(ρ_q_t)
        expected = n_val * prob
        observed = sum(view(placements, :, t))
        objective += ((observed - expected) ^ 2) / expected
    end

    return objective
end

estimate_parameters_stage_1 (generic function with 1 method)

In [6]:
"""
    estimate_likelihood_2(p_vec, v_rel, α, placements, k, K, counter, get_full_likelihood = false)

Stage-2 objective: negative log-likelihood of the `K`-by-`k` count matrix `placements`.

# Arguments
- `p_vec`: length-`3K` parameter vector laid out as `μ = p_vec[1:K]`,
  `σ = p_vec[K+1:2K]`, and unnormalised weights `p_vec[2K+1:3K]` (rescaled here to sum to 1).
- `v_rel`, `α`: values held fixed in this stage and passed through to `Fx`, `get_integrals` and `q`.
- `placements`: `K`-by-`k` matrix of counts.
- `counter`: one-element vector; `counter[1]` is incremented once per completed call.
- `get_full_likelihood`: when `true`, evaluate the log-likelihood through
  `loglikelihood(Multinomial(...), ...)`; when `false`, use the kernel
  `Σ placements[i, t] * log(probability[i, t])` only.

# Returns
The negated log-likelihood (smaller is better).
"""
function estimate_likelihood_2(p_vec, v_rel, α, placements, k, K, counter, get_full_likelihood = false)   
    μ = p_vec[1:K]
    σ = p_vec[K+1:2K]
    ρ_vec = p_vec[2K+1:3K]
    ρ = ρ_vec / sum(ρ_vec)

    ## compute the cutoffs x and the CDF values F(x)
    normals = [truncated(Normal(μ[i], σ[i]), 0, 1) for i in 1:K]

    Fx_vec = ones(k) # sets F(x0) = 1 by default; Fx_vec = [F(x0)=1, F(x1), F(x2), F(x3), ..., F(xk-1)]
    x_vec = ones(k+1) # x_vec = [x0 = 1, x1, x2, x3, ..., xk = 0]
    x_vec[k+1] = 0.0
    for t in 1:k-1
        Fx_vec_candidate = Fx(t, α, v_rel)
        if Fx_vec_candidate <= 0.0 # TODO: if this case occurs, can we speed up q()?
            # once the CDF target is non-positive, this and all later cutoffs are pinned to 0
            Fx_vec[t+1:k] .= 0.0
            x_vec[t+1:k] .= 0.0
            break
        end
        Fx_vec[t+1] = Fx_vec_candidate
        # there is no simple closed-form for F^{-1}(x) so this numerically computes x1, x2, x3
        x_vec[t+1] = find_zero(x -> F(x, ρ, normals, K) - Fx_vec[t+1], 0.5) 
    end 

    # unnormalised cell probabilities ρ[i] * q(i, t)
    ρ_q_it = zeros(K, k)
    all_integrals = get_integrals(x_vec, ρ, normals, α, k, K)
    for i in 1:K, t in 1:k
        prob = q(i, t, all_integrals, Fx_vec, α, v_rel, k)
        ρ_q_it[i, t] = ρ[i] * prob
    end

    normalizer = sum(ρ_q_it)
    likelihood = 0.0
    
    if get_full_likelihood
        # flatten probabilities and counts in the same (i outer, t inner) order
        likelihood_probability_vector = zeros(k * K)
        total_placements_vector = zeros(Int, k * K)
        vector_counter = 1
        for i in 1:K
            for t in 1:k
                likelihood_probability_vector[vector_counter] = (ρ_q_it[i, t] / normalizer)
                total_placements_vector[vector_counter] = placements[i, t]
                vector_counter += 1
            end
        end
        M = sum(total_placements_vector)
        likelihood = loglikelihood(Multinomial(M, likelihood_probability_vector), total_placements_vector)
    else
        for i in 1:K, t in 1:k
            n_it = placements[i, t]
            # A cell with no observations contributes nothing to the multinomial kernel.
            # Skipping it avoids 0 * log(0) = NaN (and a log of a bad probability that
            # no observation depends on), which would otherwise poison the objective.
            if n_it != 0
                likelihood += n_it * log(ρ_q_it[i, t] / normalizer)
            end
        end
    end
    
    counter[1] += 1
    #if isnan(-likelihood) || isinf(-likelihood)
    #    println(all_integrals)
    #end
    return -likelihood
end

estimate_likelihood_2 (generic function with 2 methods)

In [7]:
# Test of tier order consistency: run the two-stage estimator once for every permutation
# of the k tiers and report the permutation with the best stage-1 fitness and the one
# with the best stage-2 (full) negative log-likelihood.
Random.seed!(0)

# upper/lower bound on v_rel
upper_1 = [1.0 for _ in 1:k-1]
lower_1 = [0.0 for _ in 1:k-1]

# upper/lower bound on τ
append!(upper_1, [10.0 for _ in 1:1])
append!(lower_1, [0.25 for _ in 1:1])

search_range_1 = [(lower_1[i], upper_1[i]) for i in eachindex(upper_1)]

# upper/lower bound on the mu parameter of truncated normal
upper_2 = [4.0 for _ in 1:K]
lower_2 = [-4.0 for _ in 1:K]

# upper/lower bound on the sigma parameter of truncated normal
append!(upper_2, [10.0 for _ in 1:K])
append!(lower_2, [0.12 for _ in 1:K])

# upper/lower bound on values proportional to ρ_i
append!(upper_2, [1.0 for _ in 1:K])
append!(lower_2, [0.0 for _ in 1:K])

search_range_2 = [(lower_2[i], upper_2[i]) for i in eachindex(upper_2)]

# Running best results, shared by all threads of the loop below.
best_fitness_stage_1 = [Inf]
best_fitness_stage_2 = [Inf]
best_indices_stage_1 = zeros(Int, k)
best_indices_stage_2 = zeros(Int, k)
best_v_rel_stage_1 = zeros(k-1)
best_v_rel_stage_2 = zeros(k-1)

# Guards every compare-and-update of the best_* arrays. Without it two threads can both
# pass the `<` check and interleave their writes, leaving a fitness paired with another
# permutation's indices / v_rel, or overwriting a better result with a worse one.
best_results_lock = ReentrantLock()

Threads.@threads for permutation_indices in collect(permutations(1:k))
    println("testing ", permutation_indices)

    # Reorder the data according to the candidate tier order. Everything built here is
    # local to the iteration, so it needs no synchronisation.
    new_placement_rates = zeros(Int, K, k)
    for (i, i_old) in enumerate(permutation_indices) # placements
        for (t, t_old) in enumerate(permutation_indices)
            new_placement_rates[i, t] = placement_rates[i_old, t_old]
        end
    end
    for i in k+1:K # sinks
        for (t, t_old) in enumerate(permutation_indices)
            new_placement_rates[i, t] = placement_rates[i, t_old]
        end
    end

    estimated_γ = zeros(k)
    for (t, t_old) in enumerate(permutation_indices)
        estimated_γ[t] = m_t_values[t_old] / estimated_m_val
    end

    # Stage 1: fit v_rel and τ.
    counter_1 = [0]
    sol_res_1 = bboptimize(p -> estimate_parameters_stage_1(p, new_placement_rates, estimated_γ, estimated_m_val, k, counter_1), SearchRange = search_range_1, MaxFuncEvals = 1000000, TraceMode = :silent) 
    sol_1 = best_candidate(sol_res_1)
    estimated_v_rel = sol_1[1:k-1]
    estimated_τ = sol_1[k]
    estimated_α = estimated_τ * estimated_γ
    fitness_stage_1 = best_fitness(sol_res_1)
    println(" s1: ", permutation_indices, " ", fitness_stage_1, " ", estimated_v_rel, " ", counter_1)
    lock(best_results_lock) do
        if fitness_stage_1 < best_fitness_stage_1[1]
            best_fitness_stage_1[1] = fitness_stage_1
            best_indices_stage_1[:] = permutation_indices[:]
            best_v_rel_stage_1[:] = estimated_v_rel[:]
        end
    end

    # Stage 2: fit the truncated-normal mixture with v_rel and α held at their stage-1 values.
    counter_2 = [0]
    sol_res_2 = bboptimize(p -> estimate_likelihood_2(p, estimated_v_rel, estimated_α, new_placement_rates, k, K, counter_2), SearchRange = search_range_2, MaxFuncEvals = 100000, TraceMode = :silent) 
    sol_2 = best_candidate(sol_res_2)

    # Re-evaluate the winner through the Multinomial branch for the reported likelihood.
    full_likelihood = estimate_likelihood_2(sol_2, estimated_v_rel, estimated_α, new_placement_rates, k, K, [0], true)
    println(" s2: ", permutation_indices, " ", full_likelihood, " ", estimated_v_rel, " ", counter_2)
    lock(best_results_lock) do
        if full_likelihood < best_fitness_stage_2[1]
            best_fitness_stage_2[1] = full_likelihood
            best_indices_stage_2[:] = permutation_indices[:]
            best_v_rel_stage_2[:] = estimated_v_rel[:]
        end
    end
end
println()
println("stage 1 best results:")
println("  indices: ", best_indices_stage_1)
println("  fitness: ", best_fitness_stage_1[1])
println("    v_rel: ", best_v_rel_stage_1)
println()
println("stage 2 best results:")
println("     indices: ", best_indices_stage_2)
println("  likelihood: ", best_fitness_stage_2[1])
println("       v_rel: ", best_v_rel_stage_2)

testing [4, 2, 3, 1]
testing [4, 3, 1, 2]
testing [3, 2, 4, 1]
testing [3, 4, 2, 1]
testing [4, 2, 1, 3]
testing [4, 1, 3, 2]
testing [2, 3, 1, 4]
testing [2, 1, 3, 4]
testing [2, 4, 3, 1]
testing [3, 2, 1, 4]
testing [2, 4, 1, 3]
testing [3, 1, 4, 2]
testing [1, 2, 3, 4]
testing [1, 3, 2, 4]
testing [3, 1, 2, 4]
testing [4, 3, 2, 1]
testing [3, 4, 1, 2]
testing [1, 4, 2, 3]
testing [4, 1, 2, 3]
 s1: [1, 3, 2, 4] 19.941354081857387 [0.3531339188181636, 0.9999999999999999, 0.5800498350373798] [20289]
 s1: [3, 2, 1, 4] 44.24529056453281 [0.9999999999999999, 1.0, 0.483202091540371] [21253]
 s1: [2, 4, 1, 3] 68.51899978868244 [0.7315889861112264, 0.9999999999999999, 0.7442308141581153] [24586]
 s1: [3, 1, 2, 4] 44.245290564532766 [0.9999999999999999, 0.9999999999999998, 0.4832020886975329] [36135]
 s1: [4, 2, 1, 3] 72.39909221885024 [0.9999999999999999, 1.0, 0.6585040623235685] [24068]
 s1: [2, 3, 1, 4] 40.92281131079786 [0.7595015046558261, 0.9999999999999999, 0.5264594457796675] [35427]
