# Data compiler

In [144]:
using JSON
using CSV
#using Downloads
using DataFrames
using HTTP

# Wikidata data

We request data from Wikidata to enrich our dataset.
Inspired by this [post](https://github.com/datasets/country-codes/issues/53#issuecomment-338925597):

https://query.wikidata.org/#SELECT%20%2a%20%0AWHERE%20%7B%0A%20%20%3Fitem%20wdt%3AP298%20%3Fcode%0A%7D%20ORDER%20BY%20%3Fcode

```sparql
SELECT * 
WHERE {
  ?item wdt:P298 ?code
} ORDER BY ?code
```

This returns every entity that has the property `P298`, the [ISO 3166-1 alpha-3 code](https://www.wikidata.org/wiki/Property:P298). (You can find the id of a property by looking at its URL, e.g. `https://www.wikidata.org/wiki/Property:P298`.) Entities that have such a code should only be countries.

We download the list as a CSV file. You can find this file in this directory as `wiki data urls.csv`.

Next, we read this file and request all the urls. The result of each item is then stored in a dictionary with the ISO codes as keys.

In [168]:
"""
    getWikiDataEntry(id)

Request the JSON document for the Wikidata entity `id` (e.g. `"Q213"`) from
`Special:EntityData`, parse it, and return the entry stored under
`["entities"][id]`.
"""
function getWikiDataEntry(id)
    response = HTTP.request("GET", "https://www.wikidata.org/wiki/Special:EntityData/$id.json")
    payload = JSON.parse(String(response.body))
    return payload["entities"][id]
end

getWikiDataEntry (generic function with 1 method)

In [169]:
# Fetch a single entity to inspect the structure of the returned dictionary.
getWikiDataEntry("Q213")

Dict{String, Any} with 12 entries:
  "descriptions" => Dict{String, Any}("cs"=>Dict{String, Any}("language"=>"cs",…
  "aliases"      => Dict{String, Any}("cs"=>Any[Dict{String, Any}("language"=>"…
  "id"           => "Q213"
  "claims"       => Dict{String, Any}("P9037"=>Any[Dict{String, Any}("rank"=>"n…
  "lastrevid"    => 1615181970
  "sitelinks"    => Dict{String, Any}("ttwiki"=>Dict{String, Any}("title"=>"Чех…
  "modified"     => "2022-04-09T13:43:02Z"
  "ns"           => 0
  "labels"       => Dict{String, Any}("mg"=>Dict{String, Any}("language"=>"mg",…
  "pageid"       => 355
  "title"        => "Q213"
  "type"         => "item"

In [170]:
# Load the exported query result. Each row is read through its `item` column
# (the entity URL) and its `code` column (the ISO 3166-1 alpha-3 code).
wikidata = CSV.read("wiki data urls.csv", DataFrame)
wikiIDS = Dict()

# Iterate over the table loaded above. The loop previously referenced `urls`,
# a name that is not defined anywhere in this notebook, so a fresh run failed
# with an UndefVarError.
for row in eachrow(wikidata)
    # Strip the URL prefix so only the entity id (e.g. "Q142") remains.
    id = replace(row["item"], "http://www.wikidata.org/entity/" => "")
    wikiIDS[row["code"]] = getWikiDataEntry(id)
end

wikiIDS

Dict{Any, Any} with 269 entries:
  "FRA" => Dict{String, Any}("descriptions"=>Dict{String, Any}("scn"=>Dict{Stri…
  "MAF" => Dict{String, Any}("descriptions"=>Dict{String, Any}("eo"=>Dict{Strin…
  "MNG" => Dict{String, Any}("descriptions"=>Dict{String, Any}("cs"=>Dict{Strin…
  "TKL" => Dict{String, Any}("descriptions"=>Dict{String, Any}("eo"=>Dict{Strin…
  "MYT" => Dict{String, Any}("descriptions"=>Dict{String, Any}("cs"=>Dict{Strin…
  "VGB" => Dict{String, Any}("descriptions"=>Dict{String, Any}("eo"=>Dict{Strin…
  "ISR" => Dict{String, Any}("descriptions"=>Dict{String, Any}("cs"=>Dict{Strin…
  "SGP" => Dict{String, Any}("descriptions"=>Dict{String, Any}("cs"=>Dict{Strin…
  "SYC" => Dict{String, Any}("descriptions"=>Dict{String, Any}("cs"=>Dict{Strin…
  "HVO" => Dict{String, Any}("descriptions"=>Dict{String, Any}("eu"=>Dict{Strin…
  "KIR" => Dict{String, Any}("descriptions"=>Dict{String, Any}("cs"=>Dict{Strin…
  "BRA" => Dict{String, Any}("descriptions"=>Dict{String, Any}("cs"=>Dict{St

We write this dictionary to a json file.

In [175]:
# Dump the collected Wikidata entries to disk as JSON.
open(io -> JSON.print(io, wikiIDS), "countries_wiki.json", "w")

We are interested in the **aliases** of each entry for some languages.

In [206]:
# Language keys whose Wikidata labels and aliases are kept by
# `getLabelList` and `getAliasList`; entries in other languages are ignored.
USED_LANGUAGES = ("en", "en-gb", "en-ca", "de", "de-at", "de-ch", "fr", "es", "it")

("en", "en-gb", "en-ca", "de", "de-at", "de-ch", "fr", "es", "it")

In [214]:
"""
    getAliasList(obj)

Collect the `"value"` of every alias in `obj` whose language key is listed in
`USED_LANGUAGES`, and return the result passed through `cleanArray`.

`obj` is iterated as `language key => collection of alias entries`.
"""
function getAliasList(obj)
    collected = []
    for (language_key, entries) in obj
        # Skip languages we are not interested in.
        language_key in USED_LANGUAGES || continue
        for entry in entries
            push!(collected, entry["value"])
        end
    end
    return cleanArray(collected)
end

getAliasList(wikiIDS["FRA"]["aliases"])

19-element Vector{String}:
 "FR"
 "République française"
 "FRA"
 "RF"
 "fr"
 "la République française"
 "Fr."
 "La France"
 "L'Hexagone"
 "Fille aînée de l'Église"
 "Pays de Molière"
 "República Francesa"
 "Republica Francesa"
 "Republic of France"
 "French Republic"
 "the Hexagon"
 "oltralpe"
 "Französische Republik"
 "Republik Frankreich"

We are also interested in the **labels** of each entry for some languages.

In [212]:
"""
    getLabelList(obj)

Collect the `"value"` of every label in `obj` whose language key is listed in
`USED_LANGUAGES`, and return the result passed through `cleanArray`.

`obj` is iterated as `language key => label entry`.
"""
function getLabelList(obj)
    labels = [
        entry["value"] for (language_key, entry) in obj if language_key in USED_LANGUAGES
    ]
    return cleanArray(labels)
end

getLabelList(wikiIDS["FRA"]["labels"])

3-element Vector{String}:
 "Frankreich"
 "Francia"
 "France"

In [None]:
# NOTE(review): this cell looks unfinished — `map` is given only the collection
# and no function to apply, and the cell has no execution count.
# TODO: supply the mapping function or remove the cell.
wikiEntries = map(wikiIDS)

First, we request the current version of the list of countries from https://github.com/mledoze/countries.

In [173]:
# Download the raw countries.json from the master branch of mledoze/countries ...
r = HTTP.request("GET", "https://raw.githubusercontent.com/mledoze/countries/master/countries.json")
# ... and parse the response body into Julia data structures.
countries_mledoze = JSON.parse(String(r.body))

250-element Vector{Any}:
 Dict{String, Any}("latlng" => Any[12.5, -69.96666666], "languages" => Dict{String, Any}("nld" => "Dutch", "pap" => "Papiamento"), "name" => Dict{String, Any}("common" => "Aruba", "official" => "Aruba", "native" => Dict{String, Any}("nld" => Dict{String, Any}("common" => "Aruba", "official" => "Aruba"), "pap" => Dict{String, Any}("common" => "Aruba", "official" => "Aruba"))), "altSpellings" => Any["AW"], "subregion" => "Caribbean", "status" => "officially-assigned", "unMember" => false, "tld" => Any[".aw"], "independent" => false, "currencies" => Dict{String, Any}("AWG" => Dict{String, Any}("name" => "Aruban florin", "symbol" => "ƒ"))…)
 Dict{String, Any}("latlng" => Any[33, 65], "languages" => Dict{String, Any}("prs" => "Dari", "tuk" => "Turkmen", "pus" => "Pashto"), "name" => Dict{String, Any}("common" => "Afghanistan", "official" => "Islamic Republic of Afghanistan", "native" => Dict{String, Any}("prs" => Dict{String, Any}("common" => "افغانستان", "official"

In [216]:
# Keys (presumably ISO 639 language codes) that `extractFromObject` skips.
# NOTE(review): the list covers more than Japanese and Chinese — going by the
# codes, also Korean, Persian, Russian, Urdu and Pashto; confirm the intent.
EXCLUDED_LANGUAGES = ["jpn", "zho", "kor", "per", "rus", "urd", "pus", ]
# Name variants read from each name object by `getNames`.
NAME_TYPES = ["official", "common"]
# Country-code fields copied into each output record by `processCountry` and
# removed from the variations list by `deleteCodes`.
CODES = ["cca2", "ccn3", "cca3", "cioc"]

4-element Vector{String}:
 "cca2"
 "ccn3"
 "cca3"
 "cioc"

In [247]:
"""
    cleanArray(arr)

Flatten `arr` one level by concatenating its elements with `vcat`, lowercase
every resulting entry, and drop duplicates while keeping first-occurrence order.
"""
function cleanArray(arr)
    flattened = vcat(arr...)
    lowered = map(lowercase, flattened)
    return unique(lowered)
end

cleanArray (generic function with 1 method)

In [127]:
"""
    getNames(obj)

Return the values of `obj` stored under each key in `NAME_TYPES`, in the order
of `NAME_TYPES`.
"""
function getNames(obj)
    return [obj[nameType] for nameType in NAME_TYPES]
end

getCodes (generic function with 1 method)

In [120]:
"""
    extractFromObject(obj)

For every `key => value` pair of `obj` whose key is not in `EXCLUDED_LANGUAGES`,
gather `getNames(value)`; return all gathered names passed through `cleanArray`.
"""
function extractFromObject(obj)
    nameGroups = [getNames(value) for (key, value) in obj if !(key in EXCLUDED_LANGUAGES)]
    return cleanArray(nameGroups)
end

extractFromObject (generic function with 1 method)

In [256]:
"""
    deleteCodes(variations, country)

For each field named in `CODES`, remove the first entry of `variations` that
equals the lowercased value of that field in `country`. `variations` is
modified in place and also returned.
"""
function deleteCodes(variations, country)
    for code in CODES
        position = findfirst(isequal(lowercase(country[code])), variations)
        # Nothing to remove when the code is not among the variations.
        position === nothing && continue
        deleteat!(variations, position)
    end
    return variations
end

deleteCodes (generic function with 2 methods)

In [265]:
"""
    processCountry((index, country))

Build the output record for one country.

The name variations are gathered from the country's native names, its
translations, its own names and its alternative spellings. When `wikiIDS` has
an entry for the country's `cca3` code, the Wikidata labels and aliases are
added as well. The list is then passed through `cleanArray` and the country's
own codes are removed with `deleteCodes`.

The returned `Dict` holds `"i"` (`index - 1`), `"variations"`, and one entry
per key in `NAME_TYPES` and `CODES`.
"""
function processCountry((index, country))
    names = country["name"]
    variations = cleanArray([
        extractFromObject(names["native"]),
        extractFromObject(country["translations"]),
        getNames(names),
        country["altSpellings"],
    ])

    cca3 = country["cca3"]
    if haskey(wikiIDS, cca3)
        wikiEntry = wikiIDS[cca3]
        append!(variations, getLabelList(wikiEntry["labels"]))
        append!(variations, getAliasList(wikiEntry["aliases"]))
    end

    # Clean again after appending the Wikidata names, then drop the country's own codes.
    variations = deleteCodes(cleanArray(variations), country)

    obj = Dict{String,Any}("i" => index - 1, "variations" => variations)
    for nameType in NAME_TYPES
        obj[nameType] = names[nameType]
    end
    for code in CODES
        obj[code] = country[code]
    end
    return obj
end

# Build one record per country; `enumerate` supplies the `(index, country)`
# pair that `processCountry` expects.
countries = [processCountry(entry) for entry in enumerate(countries_mledoze)]

# Write the records as JSON, passing 2 as the indent argument.
open(io -> JSON.print(io, countries, 2), "countries_with_wikidata.json", "w")

Dict{Any, Any} with 269 entries:
  "FRA" => "Q142"
  "MAF" => "Q126125"
  "MNG" => "Q711"
  "TKL" => "Q36823"
  "MYT" => "Q17063"
  "VGB" => "Q25305"
  "ISR" => "Q801"
  "SGP" => "Q334"
  "SYC" => "Q1042"
  "HVO" => "Q797422"
  "KIR" => "Q710"
  "BRA" => "Q155"
  "SCG" => "Q37024"
  "VEN" => "Q717"
  "TLS" => "Q574"
  "CPV" => "Q1011"
  "BOL" => "Q750"
  "TWN" => "Q865"
  "GEO" => "Q230"
  "NLD" => "Q29999"
  "KAZ" => "Q232"
  "SDN" => "Q1049"
  "NGA" => "Q1033"
  "PAN" => "Q804"
  "JTN" => "Q131008"
  ⋮     => ⋮