# Placement Outcome Processor

In [1]:
using Pkg
# Activate the project environment found in the current working directory.
Pkg.activate(".")
# One-time setup: uncomment the loop below to add the listed packages to the environment.
# for pkg in ["HTTP", "JSON", "Distributions", "Random", "PrettyTables"]
#     Pkg.add(pkg)
# end
# Make sure the dependencies recorded for the active project are installed.
Pkg.instantiate()

[32m[1m  Activating[22m[39m project at `C:\Users\jbrig\Documents\research\mapinator_2024\notes\december_presentation`


In [2]:
"""
SBM API Data Filter (Julia Version)
Adapted from James Yuming Yu (5 June 2023)

Silas Kwok, 31 July 2023

Adapted and modified for use with full estimation by James Yu, 17 September 2023
Updated by James Yu, 19 November 2024, 16 December 2024
"""

using HTTP, JSON
# Verbosity switch: the summary blocks guarded by `if DEBUG_LEVEL > 0` in the later cells
# only print when this value is positive.
DEBUG_LEVEL = 1

1

In [3]:
# Return `true` when at least one entry of `keywords` occurs inside `phrase`
# (tested with `occursin`); an empty `keywords` collection yields `false`.
function matches(keywords, phrase)
    return any(keyword -> occursin(keyword, phrase), keywords)
end

matches (generic function with 1 method)

### STEP 1a
Retrieve the placement outcomes.

In [4]:
# Download every placement outcome from the mapinator API and parse the JSON response body.
# NOTE: the request is issued with `timeout = 120`. If the data takes longer than 120s to download, adjust the timeout.
# NOTE(review): HTTP.jl documents `readtimeout` / `connect_timeout`; confirm that `timeout`
#   is the keyword that is actually honored.
# `placements` is bound to the value of the `try` expression instead of being assigned inside
# the `try` body, so the result does not rely on how assignments inside a top-level `try`
# block are scoped.
placements = nothing
placements = try
    mapinator_data = HTTP.get("https://support.econjobmarket.org/api/mapinator", timeout = 120)
    JSON.parse(String(mapinator_data.body))
catch e
    # Abort with a readable message; `placements` keeps the value `nothing` in that case.
    error("Failed to retrieve data from the API: $e")
end

29915-element Vector{Any}:
 Dict{String, Any}("disappeared" => 0, "to_shortname" => "Bates White", "created_at" => "2024-06-06 11:33:28", "to_name" => "Bates White", "to_department" => "All departments", "to_latitude" => 38.9030567, "name" => "Econometrics", "category_id" => 2, "from_shortname" => "Economics, UCLA", "to_oid" => 3070…)
 Dict{String, Any}("disappeared" => 1, "to_shortname" => "Ocean and Crow Studios Inc", "created_at" => "2024-06-27 10:14:19", "to_name" => "Ocean and Crow Studios Inc", "to_department" => "All departments", "to_latitude" => 49.2714425, "name" => "Health; Education; Welfare", "category_id" => 20, "from_shortname" => "Economics, UCLA", "to_oid" => 893…)
 Dict{String, Any}("disappeared" => 0, "to_shortname" => "App Econ, U Autònoma Barcelona", "created_at" => "2024-07-04 19:30:12", "to_name" => "Universitat Autònoma de Barcelona", "to_department" => "Department of Applied Economics", "to_latitude" => 41.50174815758906, "name" => "Labor; Demographic Economics

### STEP 1b
Group placements by applicant ID. The filter that eliminated "oid 893" positions (Ocean and Crow) is currently commented out, so every outcome is kept at this step.

In [5]:
# TODO: are the json fields strictly typed? is there a way to easily compensate if the variable types change?

# applicant id => every placement outcome downloaded for that applicant
applicant_outcomes = Dict{Any, Vector}()
# every applicant id that appears in the download
applicant_ids = Set{Any}()
# number of outcomes copied into `applicant_outcomes`
num_outcomes_selected = 0

# The former per-outcome filter on `to_oid` 893 (which kept such outcomes only when their start
# year was <= MAX_893_YEAR) is disabled: every outcome is grouped and counted.
for record in placements
    aid = record["aid"]
    push!(applicant_ids, aid)
    bucket = get!(() -> Vector(), applicant_outcomes, aid)
    push!(bucket, record)
    num_outcomes_selected += 1
end

# Summary counts. The Ocean and Crow removal figures that were reported while the 893 filter
# was active are no longer printed.
if DEBUG_LEVEL > 0
    println("  ", length(placements), " total placement outcomes")
    println()
    println("  ", length(applicant_ids), " total applicants with placements")
end

  29915 total placement outcomes

  21224 total applicants with placements


### STEP 2a
Determine the first placement outcome of each individual that occurred after the individual graduated.\
We need to know what the first outcome is BEFORE we filter on types of outcomes, as otherwise we will get incorrectly-identified "first-time positions".

### STEP 2b
Remove postdoc outcomes so applicants with postdoc positions aren't automatically removed from the data.\
Postdocs are concurrent so the placements are redundant on top of e.g. concurrently-awarded assistant professor positions.

In [6]:
postdoc_counter = 0
finalized_applicant_outcomes = Dict{Any, Any}()

# TODO: if an applicant received an assistant professor position starting two years from now, 
# but received a lower-tier position immediately at the same time, 
# what would their position received on the job market be?

# Keep exactly one non-postdoc outcome per applicant: the earliest `startdate` wins, and
# outcomes sharing the same `startdate` are settled by the 893 / position-priority rules below.
for (applicant_id, outcomes) in applicant_outcomes
    for candidate in outcomes
        # if you wish to display postdocs in the sinks, remove this postdoc guard
        #   and set Post-Doc to have higher priority than Assistant Professor below
        # alternatively, to only include postdocs in the sinks that did not receive professorships,
        #   do not alter the below code, and instead conduct a second pass 
        #   to fill in postdoc outcomes for individuals with no professorships
        if candidate["position_name"] == "Post-Doc"
            postdoc_counter += 1
            continue
        end

        # first non-postdoc outcome seen for this applicant: take it as-is
        if !haskey(finalized_applicant_outcomes, applicant_id)
            finalized_applicant_outcomes[applicant_id] = candidate
            continue
        end

        incumbent = finalized_applicant_outcomes[applicant_id]
        if candidate["startdate"] < incumbent["startdate"]
            # the earlier of the two outcomes wins outright
            finalized_applicant_outcomes[applicant_id] = candidate
        elseif candidate["startdate"] == incumbent["startdate"]
            # same start date: fall back to the priority rules
            incumbent_is_893 = incumbent["to_oid"] == 893
            candidate_is_893 = candidate["to_oid"] == 893

            if incumbent_is_893 && !candidate_is_893
                # case 1: stored outcome is 893 and the new one is not -> use the new outcome
                finalized_applicant_outcomes[applicant_id] = candidate
            elseif incumbent_is_893 == candidate_is_893
                # cases 2 and 3: both are 893, or neither is -> compare by position priority:
                #   Assistant Professor, then Consultant, then
                #   Other Academic / Other Non-Academic / Lecturer
                # a new outcome replaces the stored one unless the stored one ranks strictly higher
                new_name = candidate["position_name"]
                old_name = incumbent["position_name"]
                takes_over =
                    new_name == "Assistant Professor" ||
                    (new_name == "Consultant" && old_name != "Assistant Professor") ||
                    (new_name in ("Other Academic", "Other Non-Academic", "Lecturer") &&
                        !(old_name in ("Assistant Professor", "Consultant")))
                if takes_over
                    finalized_applicant_outcomes[applicant_id] = candidate
                end
            end
            # case 4: stored outcome is not 893 and the new one is 893 -> keep the stored outcome
        end
    end
end

if DEBUG_LEVEL > 0
    println("  -", length(applicant_outcomes) - length(finalized_applicant_outcomes), " total applicants removed due to only being postdocs (", 
        postdoc_counter, " total postdoc placements detected)")
    println("  ", length(finalized_applicant_outcomes), " applicants remaining")
    println()
end

  -2244 total applicants removed due to only being postdocs (4425 total postdoc placements detected)
  18980 applicants remaining



### STEP 3
Eliminate everything except the following position types (outcomes at oid 893 are kept regardless of their position type):
- Assistant Professor
- Consultant
- Other Academic
- Other Non-Academic
- Lecturer

In [7]:
# Position labels that are kept as sinks ("Lecturer" is currently included).
valid_labels = Set(["Assistant Professor", "Consultant", "Other Academic", "Other Non-Academic", "Lecturer"])
irrelevant_counter = 0
removed_labels = Set()

# Drop applicants whose selected outcome has a label outside `valid_labels`,
# but do not eliminate 893 positions: those are kept whatever their label and only counted.
included_893 = 0
# iterate over a copy of the keys because entries are deleted inside the loop
for applicant_id in copy(keys(finalized_applicant_outcomes))
    outcome = finalized_applicant_outcomes[applicant_id]
    if !(outcome["position_name"] in valid_labels)
        if outcome["to_oid"] != 893
            push!(removed_labels, outcome["position_name"])
            delete!(finalized_applicant_outcomes, applicant_id)
            irrelevant_counter += 1
        else
            included_893 += 1
        end
    end
end

# Tally the surviving outcomes by position label, separately for regular and 893 outcomes.
# This is done regardless of DEBUG_LEVEL because the totals are printed unconditionally at the
# end of this cell (building them only inside the debug branch left them undefined
# whenever DEBUG_LEVEL was 0).
maintained_labels = Dict{Any, Int}()
oid_893_labels = Dict{Any, Int}()
for outcome in values(finalized_applicant_outcomes)
    tally = outcome["to_oid"] != 893 ? maintained_labels : oid_893_labels
    position_name = outcome["position_name"]
    tally[position_name] = get(tally, position_name, 0) + 1
end

if DEBUG_LEVEL > 0
    println("  -", irrelevant_counter, " irrelevant applicants removed from the following classes of positions:")
    println(removed_labels)
    println("  ", length(finalized_applicant_outcomes), " applicants remaining after irrelevant-position applicants removed")
    println()
    println(included_893, " applicants with initial placements in 893")
    println()
end

println(maintained_labels, " ", sum(values(maintained_labels)), " total regular")
println()
println(oid_893_labels, " ", sum(values(oid_893_labels)), " total in 893")

  -1121 irrelevant applicants removed from the following classes of positions:
Set(Any["Tenured Professor", "Untenured Professor", "Associate Professor", "Assistant, Associate or Full Professor", "Professor Any Level", "Full Professor", "Temporary Lecturer", "Visiting Professor/Lecturer/Instructor", "Assistant or Associate Professor"])
  17859 applicants remaining after irrelevant-position applicants removed

67 applicants with initial placements in 893

Dict{Any, Int64}("Other Non-Academic" => 2948, "Consultant" => 519, "Other Academic" => 1486, "Assistant Professor" => 8756, "Lecturer" => 1033) 14742 total regular
Dict{Any, Int64}("Tenured Professor" => 1, "Associate Professor" => 17, "Assistant, Associate or Full Professor" => 1, "Other Non-Academic" => 1108, "Assistant Professor" => 1485, "Full Professor" => 4, "Other Academic" => 187, "Professor Any Level" => 18, "Temporary Lecturer" => 4, "Consultant" => 176, "Assistant or Associate Professor" => 2, "Visiting Professor/Lecturer/I

### STEP 4
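Filter by year: drop outcomes whose start date matches a year listed for removal, then group the remaining outcomes by start year, keeping oid 893 outcomes in a separate table.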

In [8]:
# start year => (applicant id => outcome) for outcomes outside oid 893
sorted_by_year = Dict{Any, Dict}()
removed_year_placed = 0

# start year => (applicant id => outcome) for outcomes at oid 893
by_year_893 = Dict{Any, Dict}()

# Year strings whose outcomes should be dropped. The list is empty, so nothing is removed
# right now; to drop e.g. all 2022+ entries, list those years here.
# candidate values:
# "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019",
# "2021", "2022", "2023", "2024", "2025", "2026"
remove_years = []

# iterate over a copy of the keys because entries may be deleted inside the loop
for applicant_id in copy(keys(finalized_applicant_outcomes))
    entry = finalized_applicant_outcomes[applicant_id]

    if matches(remove_years, entry["startdate"])
        removed_year_placed += 1
        delete!(finalized_applicant_outcomes, applicant_id)
        continue
    end

    # route the outcome to the regular table or to the 893 table,
    # keyed by the leading (year) component of its start date
    year_table = entry["to_oid"] != 893 ? sorted_by_year : by_year_893
    start_year = parse(Int, first(split(entry["startdate"], "-")))
    push!(get!(year_table, start_year, Dict()), applicant_id => entry)
end

if DEBUG_LEVEL > 0
    println("  -", removed_year_placed, " applicants removed due to placement in years to remove")
    println("  ", length(finalized_applicant_outcomes), " applicants remaining after year corrections")
    println()
end

# per-year counts in ascending year order, regular outcomes first
for year_key in sort!(collect(keys(sorted_by_year)))
    println("Year ", year_key, " has ", length(sorted_by_year[year_key]), " successful placement outcomes")
end
println()

for year_key in sort!(collect(keys(by_year_893)))
    println("Year ", year_key, " has ", length(by_year_893[year_key]), " unsuccessful placement outcomes")
end

  -0 applicants removed due to placement in years to remove
  17859 applicants remaining after year corrections

Year 2003 has 35 successful placement outcomes
Year 2004 has 30 successful placement outcomes
Year 2005 has 229 successful placement outcomes
Year 2006 has 28 successful placement outcomes
Year 2007 has 193 successful placement outcomes
Year 2008 has 141 successful placement outcomes
Year 2009 has 303 successful placement outcomes
Year 2010 has 333 successful placement outcomes
Year 2011 has 407 successful placement outcomes
Year 2012 has 468 successful placement outcomes
Year 2013 has 502 successful placement outcomes
Year 2014 has 550 successful placement outcomes
Year 2015 has 802 successful placement outcomes
Year 2016 has 1637 successful placement outcomes
Year 2017 has 1114 successful placement outcomes
Year 2018 has 1155 successful placement outcomes
Year 2019 has 1399 successful placement outcomes
Year 2020 has 1370 successful placement outcomes
Year 2021 has 1210 su

### STEP 5
Save to disk.

In [9]:
# Sanity check: the two per-year tables together must hold exactly one entry per applicant
# left in `finalized_applicant_outcomes`.
# `init = 0` keeps the sums defined when a table is empty (for example when no oid 893
# outcome survived the earlier steps); summing an empty generator would throw instead.
total_check = sum(length, values(sorted_by_year); init = 0) + sum(length, values(by_year_893); init = 0)
println("Total " * "$total_check" * " applicants in JSON file (compare to" * " $(length(finalized_applicant_outcomes)) " 
    * "applicants in finalized_applicant_outcomes: " * "$(length(finalized_applicant_outcomes) == total_check ? "SUCCESS" : "FAIL")" * ")")

# Regular outcomes, serialized with a 4-space indent.
json_str = JSON.json(sorted_by_year, 4)  
open("to_from_by_year_mapinator_api.json", "w") do f
    write(f, json_str)
end

# oid 893 outcomes go to their own file; the value of this last expression is the number of
# bytes written to it.
json_str_893 = JSON.json(by_year_893, 4)  
open("to_from_by_year_893_mapinator_api.json", "w") do f
    write(f, json_str_893)
end

Total 17859 applicants in JSON file (compare to 17859 applicants in finalized_applicant_outcomes: SUCCESS)


3559405

In [10]:
# Display the per-year table of outcomes outside oid 893.
sorted_by_year

Dict{Any, Dict} with 24 entries:
  2024 => Dict{Any, Any}("62687"=>Dict{String, Any}("disappeared"=>0, "to_short…
  2004 => Dict{Any, Any}("24943"=>Dict{String, Any}("disappeared"=>nothing, "to…
  2023 => Dict{Any, Any}("56205"=>Dict{String, Any}("disappeared"=>0, "to_short…
  2010 => Dict{Any, Any}("5422"=>Dict{String, Any}("disappeared"=>nothing, "to_…
  2006 => Dict{Any, Any}("2139"=>Dict{String, Any}("disappeared"=>nothing, "to_…
  2020 => Dict{Any, Any}("45907"=>Dict{String, Any}("disappeared"=>0, "to_short…
  2003 => Dict{Any, Any}("17827"=>Dict{String, Any}("disappeared"=>nothing, "to…
  2018 => Dict{Any, Any}("37196"=>Dict{String, Any}("disappeared"=>nothing, "to…
  2017 => Dict{Any, Any}("27626"=>Dict{String, Any}("disappeared"=>0, "to_short…
  2025 => Dict{Any, Any}("62504"=>Dict{String, Any}("disappeared"=>0, "to_short…
  2011 => Dict{Any, Any}("59924"=>Dict{String, Any}("disappeared"=>0, "to_short…
  2009 => Dict{Any, Any}("1886"=>Dict{String, Any}("disappeared"=>nothing, "

In [11]:
# Display the per-year table of outcomes at oid 893.
by_year_893

Dict{Any, Dict} with 21 entries:
  2024 => Dict{Any, Any}("62884"=>Dict{String, Any}("disappeared"=>0, "to_short…
  2023 => Dict{Any, Any}("41022"=>Dict{String, Any}("disappeared"=>1, "to_short…
  2010 => Dict{Any, Any}("1495"=>Dict{String, Any}("disappeared"=>0, "to_shortn…
  2020 => Dict{Any, Any}("51250"=>Dict{String, Any}("disappeared"=>0, "to_short…
  2018 => Dict{Any, Any}("30648"=>Dict{String, Any}("disappeared"=>0, "to_short…
  2017 => Dict{Any, Any}("40521"=>Dict{String, Any}("disappeared"=>0, "to_short…
  2001 => Dict{Any, Any}("52049"=>Dict{String, Any}("disappeared"=>1, "to_short…
  2025 => Dict{Any, Any}("50537"=>Dict{String, Any}("disappeared"=>1, "to_short…
  2009 => Dict{Any, Any}("8071"=>Dict{String, Any}("disappeared"=>0, "to_shortn…
  2011 => Dict{Any, Any}("51564"=>Dict{String, Any}("disappeared"=>0, "to_short…
  2013 => Dict{Any, Any}("26793"=>Dict{String, Any}("disappeared"=>0, "to_short…
  2007 => Dict{Any, Any}("32"=>Dict{String, Any}("disappeared"=>0, "to_short