# Data compiler

In [144]:
using JSON
using CSV
#using Downloads
using DataFrames
using HTTP

# Wikidata data

Inspired by this [post](https://github.com/datasets/country-codes/issues/53#issuecomment-338925597):

https://query.wikidata.org/#SELECT%20%2a%20%0AWHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP298%20%3Fcode%0A%7D%20ORDER%20BY%20%3Fcode

```sparql
SELECT * 
WHERE {
  ?item wdt:P298 ?code
} ORDER BY ?code
```

This returns every Wikidata item that has the property `P298`, which is the [ISO 3166-1 alpha-3 code](https://www.wikidata.org/wiki/Property:P298). (You can find the id of a property by looking at its URL, e.g. `https://www.wikidata.org/wiki/Property:P298`.)

We download the result as a CSV file. You can find this file in this directory as `wiki data urls.csv`.

In [170]:
# Load the exported query result; the loop below reads its `item` and `code` columns.
wikidata = CSV.read("wiki data urls.csv", DataFrame)
# Maps each value of the `code` column to the entity returned by `getWikiDataEntry`.
wikiIDS = Dict()

# Iterate over the table loaded above (`wikidata`); no variable named `urls` exists here.
for row in eachrow(wikidata)
    # Strip the entity URL prefix so that only the bare entity id remains.
    id = replace(row["item"], "http://www.wikidata.org/entity/" => "")
    wikiIDS[row["code"]] = getWikiDataEntry(id)
end

wikiIDS

Dict{Any, Any} with 269 entries:
  "FRA" => Dict{String, Any}("descriptions"=>Dict{String, Any}("scn"=>Dict{Stri…
  "MAF" => Dict{String, Any}("descriptions"=>Dict{String, Any}("eo"=>Dict{Strin…
  "MNG" => Dict{String, Any}("descriptions"=>Dict{String, Any}("cs"=>Dict{Strin…
  "TKL" => Dict{String, Any}("descriptions"=>Dict{String, Any}("eo"=>Dict{Strin…
  "MYT" => Dict{String, Any}("descriptions"=>Dict{String, Any}("cs"=>Dict{Strin…
  "VGB" => Dict{String, Any}("descriptions"=>Dict{String, Any}("eo"=>Dict{Strin…
  "ISR" => Dict{String, Any}("descriptions"=>Dict{String, Any}("cs"=>Dict{Strin…
  "SGP" => Dict{String, Any}("descriptions"=>Dict{String, Any}("cs"=>Dict{Strin…
  "SYC" => Dict{String, Any}("descriptions"=>Dict{String, Any}("cs"=>Dict{Strin…
  "HVO" => Dict{String, Any}("descriptions"=>Dict{String, Any}("eu"=>Dict{Strin…
  "KIR" => Dict{String, Any}("descriptions"=>Dict{String, Any}("cs"=>Dict{Strin…
  "BRA" => Dict{String, Any}("descriptions"=>Dict{String, Any}("cs"=>Dict{St

In [175]:
# Serialize the collected `wikiIDS` dictionary to `countries_wiki.json`.
open(io -> JSON.print(io, wikiIDS), "countries_wiki.json", "w")

In [184]:
# Collect every alias stored for the "FRA" entry, across all languages, into one flat list.
# The `aliases` object maps a language key to a list of alias objects; each alias object
# carries its text under the "value" key.
aliases = String[]
for (language_key, language) in wikiIDS["FRA"]["aliases"]
    println("language_key: ", language_key)
    for alias in language
        push!(aliases, alias["value"])
    end
end

language_key: na
France
language_key: scn
France
Franzia
Regnu di Francia
Franza
language_key: lzh
法蘭西
法蘭西共和國
language_key: am
ፍራንስ
language_key: cs
Francouzská republika
language_key: kab
Fṛansa
Franṣa
language_key: tl
Republikang Pranses
Pransya
Prancia
Pransés
France
Pransia
Republika ng Pransiya
Republika ng Pransya
language_key: bm
Fàransi
language_key: hy
France
Ֆրանսա
Ֆրանսիայի հանրապետություն
Ֆրանսիական
language_key: ilo
Francia
France
Fransia
language_key: yi
פֿֿראַנקרײַך
פֿראַנקרײַך
language_key: vec
Francia
França
Françia
fr
FRA
language_key: rup
Frãntsii
Frantsa
Frãntsia
Frânție
language_key: br
Bro-C'hall
language_key: nah
France
language_key: nan
France
Hoat-lân-se Kiōng-hô-kok
language_key: gl
República Francesa
language_key: mai
फ्रान्स
language_key: an
Franzia
Republica Francesa
Estato francés
Republica Franzesa
Franza
language_key: yo
France
Fransi
Faranse
Fúrànsì
Furansi
language_key: lv
Francijas Republika
Francijas Karaliste
République française
language_key: smn
R

In [169]:
# Example lookup: fetch the Wikidata entity with id "Q213" to inspect the shape of a result.
getWikiDataEntry("Q213")

Dict{String, Any} with 12 entries:
  "descriptions" => Dict{String, Any}("cs"=>Dict{String, Any}("language"=>"cs",…
  "aliases"      => Dict{String, Any}("cs"=>Any[Dict{String, Any}("language"=>"…
  "id"           => "Q213"
  "claims"       => Dict{String, Any}("P9037"=>Any[Dict{String, Any}("rank"=>"n…
  "lastrevid"    => 1615181970
  "sitelinks"    => Dict{String, Any}("ttwiki"=>Dict{String, Any}("title"=>"Чех…
  "modified"     => "2022-04-09T13:43:02Z"
  "ns"           => 0
  "labels"       => Dict{String, Any}("mg"=>Dict{String, Any}("language"=>"mg",…
  "pageid"       => 355
  "title"        => "Q213"
  "type"         => "item"

In [168]:
# Fetch one Wikidata entity.
#
# Requests `https://www.wikidata.org/wiki/Special:EntityData/<id>.json`, parses the
# response body as JSON and returns the value stored under `["entities"][id]`.
function getWikiDataEntry(id)
    url = string("https://www.wikidata.org/wiki/Special:EntityData/", id, ".json")
    response = HTTP.request("GET", url)
    payload = JSON.parse(String(response.body))
    return payload["entities"][id]
end

getWikiDataEntry (generic function with 1 method)

In [None]:
# NOTE(review): this cell looks unfinished. `map` receives only `wikiIDS` and no function
# to apply, so it cannot transform the entries as written. Presumably a per-entry
# transformation was intended; confirm before running.
wikiEntries = map(wikiIDS)

First, we request the current version of the list of countries from https://github.com/mledoze/countries.

In [173]:
# Download the raw `countries.json` from the mledoze/countries repository and parse it.
r = HTTP.request("GET", "https://raw.githubusercontent.com/mledoze/countries/master/countries.json")
countries_mledoze = r.body |> String |> JSON.parse

250-element Vector{Any}:
 Dict{String, Any}("latlng" => Any[12.5, -69.96666666], "languages" => Dict{String, Any}("nld" => "Dutch", "pap" => "Papiamento"), "name" => Dict{String, Any}("common" => "Aruba", "official" => "Aruba", "native" => Dict{String, Any}("nld" => Dict{String, Any}("common" => "Aruba", "official" => "Aruba"), "pap" => Dict{String, Any}("common" => "Aruba", "official" => "Aruba"))), "altSpellings" => Any["AW"], "subregion" => "Caribbean", "status" => "officially-assigned", "unMember" => false, "tld" => Any[".aw"], "independent" => false, "currencies" => Dict{String, Any}("AWG" => Dict{String, Any}("name" => "Aruban florin", "symbol" => "ƒ"))…)
 Dict{String, Any}("latlng" => Any[33, 65], "languages" => Dict{String, Any}("prs" => "Dari", "tuk" => "Turkmen", "pus" => "Pashto"), "name" => Dict{String, Any}("common" => "Afghanistan", "official" => "Islamic Republic of Afghanistan", "native" => Dict{String, Any}("prs" => Dict{String, Any}("common" => "افغانستان", "official"

In [126]:
# Language keys that `extractFromObject` skips when collecting name variations.
EXCLUDED_LANGUAGES = ["jpn", "zho"] # Japanese and Chinese
# Name fields that `getNames` reads from a name object and `processCountry` copies over.
NAME_TYPES = ["official", "common"]
# Code fields that `processCountry` copies unchanged from each country record.
CODES = ["cca2", "ccn3", "cca3", "cioc"]

4-element Vector{String}:
 "cca2"
 "ccn3"
 "cca3"
 "cioc"

In [108]:
# Concatenate all elements of `arr` into a single vector and drop duplicates,
# keeping the first occurrence of each value.
function cleanArray(arr)
    flattened = vcat(arr...)
    return unique(flattened)
end

cleanArray (generic function with 1 method)

In [127]:
# Return the values of `obj` stored under each key listed in `NAME_TYPES`, in that order.
function getNames(obj)
    return [obj[field] for field in NAME_TYPES]
end

getCodes (generic function with 1 method)

In [120]:
# Gather the names (see `getNames`) of every entry in `obj` whose key is not listed in
# `EXCLUDED_LANGUAGES`, then flatten and de-duplicate them with `cleanArray`.
function extractFromObject(obj)
    collected = []
    for (lang, entry) in obj
        # Skip excluded language keys.
        lang in EXCLUDED_LANGUAGES && continue
        push!(collected, getNames(entry))
    end
    return cleanArray(collected)
end

extractFromObject (generic function with 1 method)

In [174]:
# Build the condensed record for a single country entry.
#
# The result is a `Dict{String,Any}` holding:
#   - "variations": the de-duplicated union of native names, translated names,
#     the country's own names and its alternative spellings;
#   - one entry per key in `NAME_TYPES`, copied from `country["name"]`;
#   - one entry per key in `CODES`, copied from `country`.
function processCountry(country)
    names = country["name"]
    sources = [
        extractFromObject(names["native"]),
        extractFromObject(country["translations"]),
        getNames(names),
        country["altSpellings"]
    ]
    record = Dict{String,Any}("variations" => cleanArray(sources))
    for field in NAME_TYPES
        record[field] = names[field]
    end
    for field in CODES
        record[field] = country[field]
    end
    return record
end

# Condense every downloaded country record with `processCountry`.
countries = [processCountry(entry) for entry in countries_mledoze]

# Write the condensed records to `countries.json`.
open(io -> JSON.print(io, countries), "countries.json", "w")

Dict{Any, Any} with 269 entries:
  "FRA" => "Q142"
  "MAF" => "Q126125"
  "MNG" => "Q711"
  "TKL" => "Q36823"
  "MYT" => "Q17063"
  "VGB" => "Q25305"
  "ISR" => "Q801"
  "SGP" => "Q334"
  "SYC" => "Q1042"
  "HVO" => "Q797422"
  "KIR" => "Q710"
  "BRA" => "Q155"
  "SCG" => "Q37024"
  "VEN" => "Q717"
  "TLS" => "Q574"
  "CPV" => "Q1011"
  "BOL" => "Q750"
  "TWN" => "Q865"
  "GEO" => "Q230"
  "NLD" => "Q29999"
  "KAZ" => "Q232"
  "SDN" => "Q1049"
  "NGA" => "Q1033"
  "PAN" => "Q804"
  "JTN" => "Q131008"
  ⋮     => ⋮