In [35]:
import json
import pandas
import requests
from tqdm.notebook import tqdm as tdqm_notebook

In [36]:
def to_json(json_list, file_path : str):
    """Serialize ``json_list`` as indented JSON and write it to ``file_path``.

    Args:
        json_list: Any object ``json.dumps`` can serialize; the cells in this
            notebook pass lists of record dicts.
        file_path: Destination path. An existing file is overwritten.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # Serialize before opening so a serialization failure cannot truncate
    # an existing file.
    json_string = json.dumps(json_list, indent=2)
    # The context manager closes the handle even when the write raises.
    with open(file_path, "w", encoding="utf-8") as json_file:
        json_file.write(json_string)

In [37]:
COUNTRIES_URL = "https://restcountries.com/v3/all"

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() reports HTTP errors here instead of letting an error
# payload reach the parsing loops below.
response = requests.get(COUNTRIES_URL, timeout=30)
response.raise_for_status()
countries = response.json()

# Merge every country's {code: name} mapping into a single dict. When the same
# code appears in several countries, the name seen last wins.
languages_dict = {}

for country in countries:
    languages = country.get("languages", {})
    # In-place update avoids rebuilding the whole dict on every iteration.
    languages_dict.update(languages)

# One {"code", "name"} record per distinct language code; later cells iterate this list.
languages_list = [
    {"code": code, "name": name}
    for code, name in languages_dict.items()
]

# Explicit columns keep sort_values from raising KeyError when no languages were found.
languages_data_frame = pandas.DataFrame(languages_list, columns=["code", "name"])
languages_data_frame = languages_data_frame.sort_values(by = ["code", "name"])
languages_data_frame.to_csv("./csv/languages.csv", index=False, header=True)
to_json(languages_data_frame.to_dict("records"), "./json/languages.json")
languages_data_frame

Unnamed: 0,code,name
58,afr,Afrikaans
53,amh,Amharic
11,ara,Arabic
140,arc,Aramaic
96,aym,Aymara
...,...,...
65,xho,Xhosa
79,zdj,Comorian
51,zho,Chinese
116,zib,Zimbabwean Sign Language


## Languages by Countries

In [38]:
language_countries = []

# Distinct lower-cased region names over all countries; each one becomes a
# pair of columns ("<region>", "<region>_population") in the summary table.
regions = sorted({item.get("region", "").lower() for item in countries})

# Total population of each region over ALL countries. The value does not depend
# on the language, so it is computed once here instead of once per language.
# A missing "region" falls back to "" exactly as it does when `regions` is built.
region_total_population = {region: 0 for region in regions}
for country in countries:
    region_total_population[country.get("region", "").lower()] += country.get("population", 0)

for language in tdqm_notebook(languages_list):
    code : str = language.get("code", "")
    name : str = language.get("name", "")
    # File-name slug: spaces become dashes, everything lower-cased.
    file_name : str = "-".join(name.split(" ")).lower()

    # Countries flagged `unMember` whose "languages" mapping contains this code.
    language_speaking_countries = [
        {
            "country": country.get("name", {}).get("common", ""),
            "region": country.get("region", ""),
            "subregion": country.get("subregion", ""),
            "population": country.get("population", 0)
        }
        for country in countries
        if country.get("unMember", False) and code in country.get("languages", {})
    ]

    language_item = {
        "code": code,
        "language": name,
        "countries": len(language_speaking_countries),
        # Sum of the full populations of the matching countries.
        "population": sum(item["population"] for item in language_speaking_countries)
    }
    for region in regions:
        # Number of matching countries located in this region.
        language_item[region] = sum(
            1 for item in language_speaking_countries if item["region"].lower() == region
        )
        # NOTE(review): this is the population of the whole region (every
        # country, regardless of language), so the column is identical on every
        # row. Confirm that is intended rather than the population of the
        # matching countries in the region.
        language_item[region + "_population"] = region_total_population[region]
    language_countries.append(language_item)

    # Per-language files are only written when two or more countries match.
    if len(language_speaking_countries) <= 1:
        continue
    try:
        language_speaking_countries_data_frame = pandas.DataFrame(language_speaking_countries)
        language_speaking_countries_data_frame = language_speaking_countries_data_frame.sort_values(by = ["region", "subregion", "country"])
        language_speaking_countries_data_frame.to_csv(f"./csv/countries/{file_name}-speaking.csv", index=False, header=True)
        to_json(language_speaking_countries_data_frame.to_dict("records"), f"./json/countries/{file_name}-speaking.json")
    except Exception as error:
        # Best effort: one failed export must not stop the remaining languages,
        # but the cause is reported. `Exception` (not a bare except) lets
        # KeyboardInterrupt still abort the loop.
        print(file_name, "error", repr(error))

language_countries_data_frame = pandas.DataFrame(language_countries)
language_countries_data_frame = language_countries_data_frame.sort_values(by = ["countries", "language"], ascending=[False, True])
language_countries_data_frame.to_csv("./csv/languages-countries.csv", index=False, header=True)
to_json(language_countries_data_frame.to_dict("records"), "./json/languages-countries.json")
language_countries_data_frame

  0%|          | 0/153 [00:00<?, ?it/s]

Unnamed: 0,code,language,countries,population,africa,africa_population,americas,americas_population,antarctic,antarctic_population,asia,asia_population,europe,europe_population,oceania,oceania_population
1,eng,English,59,2870905381,23,1427153322,14,1020976420,0,1430,5,4604594974,3,746934072,14,43119432
5,fra,French,31,573140579,22,1427153322,2,1020976420,0,1430,1,4604594974,5,746934072,1,43119432
11,ara,Arabic,24,463121175,12,1427153322,0,1020976420,0,1430,12,4604594974,0,746934072,0,43119432
0,spa,Spanish,21,466248015,1,1427153322,19,1020976420,0,1430,0,4604594974,1,746934072,0,43119432
18,por,Portuguese,8,290483252,5,1427153322,1,1020976420,0,1430,1,4604594974,1,746934072,0,43119432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39,pih,Norfuk,0,0,0,1427153322,0,1020976420,0,1430,0,4604594974,0,746934072,0,43119432
78,nor,Norwegian,0,0,0,1427153322,0,1020976420,0,1430,0,4604594974,0,746934072,0,43119432
88,pap,Papiamento,0,0,0,1427153322,0,1020976420,0,1430,0,4604594974,0,746934072,0,43119432
68,tkl,Tokelauan,0,0,0,1427153322,0,1020976420,0,1430,0,4604594974,0,746934072,0,43119432


## Countries by Languages

In [39]:
countries_languages = []

# One summary row per country flagged `unMember`; every other entry is skipped.
for entry in tdqm_notebook(countries):
    is_member = entry.get("unMember", False)
    spoken = entry.get("languages", {})
    display_name = entry.get("name", {}).get("common", "")
    if is_member:
        countries_languages.append(dict(
            country=display_name,
            # Language names joined in the order the mapping yields them.
            languages=" - ".join(spoken.values()),
            number_of_languages=len(spoken),
        ))

# Most languages first; ties are ordered alphabetically by country name.
countries_languages_data_frame = (
    pandas.DataFrame(countries_languages)
    .sort_values(by=["number_of_languages", "country"], ascending=[False, True])
)
countries_languages_data_frame.to_csv("./csv/countries-languages.csv", index=False, header=True)
to_json(
    countries_languages_data_frame.to_dict("records"),
    "./json/countries-languages.json",
)
countries_languages_data_frame

  0%|          | 0/250 [00:00<?, ?it/s]

Unnamed: 0,country,languages,number_of_languages
125,Zimbabwe,Chibarwe - English - Kalanga - Khoisan - Ndau ...,15
70,South Africa,Afrikaans - English - Southern Ndebele - North...,11
89,Namibia,Afrikaans - German - English - Herero - Khoekh...,9
133,DR Congo,French - Kikongo - Lingala - Tshiluba - Swahili,5
105,Bolivia,Aymara - Guaraní - Quechua - Spanish,4
...,...,...,...
57,Uruguay,Spanish,1
16,Venezuela,Spanish,1
30,Vietnam,Vietnamese,1
117,Yemen,Arabic,1
