In [23]:
# We are using black as a code formatter; nb_black is the IPython extension
# loaded here for that purpose.
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [24]:
import requests
import json
import datetime
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

<IPython.core.display.Javascript object>

In [25]:
# pd.set_option('display.max_rows', None)
# Lift the column-count, total-width and per-cell-width display limits
# (None means "no limit") so wide frames and long text values are not truncated.
for display_option in ("display.max_columns", "display.width", "display.max_colwidth"):
    pd.set_option(display_option, None)

<IPython.core.display.Javascript object>

In [26]:
# The UN's API includes many entities which are not sovereign states,
# yet are listed as "Country". This list is a manual workaround for when
# one is only interested in countries by the classic definition: it names
# the entries to leave out.
# NOTE(review): no cell visible in this notebook applies the list yet —
# presumably it is meant to filter locations by their name; confirm.
not_countries = [
    "American Samoa",
    "Bermuda",
    "British Virgin Islands",
    "Cayman Islands",
    "Mayotte",
    "Cook Islands",
    "Faroe Islands",
    "Falkland Islands (Malvinas)",
    "French Guiana",
    "French Polynesia",
    "Gibraltar",
    "Greenland",
    "Guadeloupe",
    "Guam",
    "China, Hong Kong SAR",
    "China, Macao SAR",
    "Martinique",
    "Montserrat",
    "Curaçao",
    "Aruba",
    "Sint Maarten (Dutch part)",
    "Bonaire, Sint Eustatius and Saba",
    "New Caledonia",
    "Niue",
    "Northern Mariana Islands",
    "Puerto Rico",
    "Réunion",
    "Saint Helena",
    "Anguilla",
    "Saint Pierre and Miquelon",
    "Tokelau",
    "Turks and Caicos Islands",
    "Isle of Man",
    "United States Virgin Islands",
    "Wallis and Futuna Islands",
]

<IPython.core.display.Javascript object>

#### The base path for accessing the API is:
base_path ["https://population.un.org/dataportalapi/api/v1"](https://population.un.org/dataportalapi/api/v1)

#### Most common reported status codes
* 200 : Successful request
* 400 : Bad request
* 404 : Input parameters not found
* 406 : Requested output format not allowed
* 500 : Server error

#### Structure of API response (json)

* pageNumber : the current page of the response, which may have multiple pages
* pageSize : the number of records returned on the current page (a maximum of 100 records will be returned)
* previousPage : the path to the previous page of the response when multiple pages are returned
* nextPage : the path to the next page of the response when multiple pages are returned
* pages : the total number of pages in the response
* total : the total number of records in the response
* data : the actual data returned in the response

### Topics

In [27]:
# Define target URL
base_url = "https://population.un.org/dataportalapi/api/v1/topics"

# Call the API and fail fast on an HTTP error status (4xx/5xx); without this
# check an error response only shows up later as a KeyError on response["data"].
http_response = requests.get(base_url)
http_response.raise_for_status()

# Decode the JSON body of the response
response = http_response.json()

# Convert the records stored under the "data" key to a data frame
df = pd.json_normalize(response["data"])

print("df.shape", df.shape, end="\n")

# Display only relevant data.
df[["name", "shortName"]]

df.shape (10, 4)


Unnamed: 0,name,shortName
0,Not applicable,
1,Population,Pop
2,Fertility,Fert
3,Mortality,Mort
4,International Migration,iMigration
5,Family Planning,FP
6,Marital Status,MarStat
7,All Components,All
8,Child Mortality,IGME
9,Maternal Mortality,MMEIG


<IPython.core.display.Javascript object>

### Indicators

In [28]:
# Define target URL.
base_url = "https://population.un.org/dataportalapi/api/v1/indicators"

# Call the API and fail fast on an HTTP error status (4xx/5xx); without this
# check an error response only shows up later as a KeyError on response["data"].
http_response = requests.get(base_url)
http_response.raise_for_status()

# Decode the JSON body of the response.
response = http_response.json()

# Convert the records stored under the "data" key to a data frame.
df = pd.json_normalize(response["data"])

print("df.shape", df.shape, end="\n")

# Display only relevant data.
df[["id", "name", "description", "shortName", "topicName", "topicShortName"]]

df.shape (60, 33)


Unnamed: 0,id,name,description,shortName,topicName,topicShortName
0,1,Contraceptive prevalence: Any method (Percent),Percentage of women of reproductive age (15-49 years) who are currently using any method of contraception,CPAnyP,Family Planning,FP
1,2,Contraceptive prevalence: Any modern method (Percent),Percentage of women of reproductive age (15-49 years) who are currently using any modern method of contraception,CPModP,Family Planning,FP
2,3,Contraceptive prevalence: Any traditional method (Percent),Percentage of women of reproductive age (15-49 years) who are currently using any traditional method of contraception,CPTrad,Family Planning,FP
3,4,Unmet need for family planning: Any method (Percent),Percentage of women of reproductive age (15-49 years) who want to stop or delay childbearing but are not using a method of contraception,UNMP,Family Planning,FP
4,5,Unmet need for family planning: Any modern method (Percent),Percentage of women of reproductive age (15-49 years) who want to stop or delay childbearing but are not using a modern method of contraception,UNMModP,Family Planning,FP
5,6,Total demand for family planning (Percent),Percentage of women of reproductive age (15-49 years) who are currently using any method of contraception or are having unmet need for family planning,DEMTot,Family Planning,FP
6,7,Demand for family planning satisfied by any method (Percent),Percentage of women of reproductive age (aged 15-49 years) who have their need for family planning satisfied with any methods,DEMAny,Family Planning,FP
7,8,Demand for family planning satisfied by any modern method (Percent),Percentage of women of reproductive age (aged 15-49 years) who have their need for family planning satisfied with modern methods,DEMMod,Family Planning,FP
8,9,Contraceptive prevalence: Any method (Number),Number of women of reproductive age (15-49 years) who are currently using any method of contraception,CPAnyN,Family Planning,FP
9,10,Contraceptive prevalence: Any modern method (Number),Number of women of reproductive age (15-49 years) who are currently using any modern method of contraception,CPModN,Family Planning,FP


<IPython.core.display.Javascript object>

### Geographical areas or locations

In [29]:
# Base url
base_url = "https://population.un.org/dataportalapi/api/v1"

# Creates the target URL: the locations endpoint, in this instance.
target = base_url + "/locations/"

# Get the response, which includes the first page of data as well as information on pagination and number of records.
response = requests.get(target)
response.raise_for_status()  # surface HTTP errors here rather than as a KeyError below

# Converts call into JSON.
j = response.json()

# pd.json_normalize flattens each page's records into a frame. The per-page
# frames are collected in a list and concatenated once after the loop, so the
# growing result is not re-copied on every page.
page_frames = [pd.json_normalize(j["data"])]

# Loop for as long as the response points at a further page.
while j["nextPage"] is not None:
    # Reset the target to the next page.
    target = j["nextPage"]

    # Call the API for the next page.
    response = requests.get(target)
    response.raise_for_status()

    # Convert response to JSON format and keep this page's records.
    j = response.json()
    page_frames.append(pd.json_normalize(j["data"]))

# Combine all pages into one data frame with a fresh 0..n-1 index.
df = pd.concat(page_frames, ignore_index=True)
df

Unnamed: 0,id,name,iso3,iso2,longitude,latitude
0,4,Afghanistan,AFG,AF,67.709953,33.939110
1,8,Albania,ALB,AL,20.168331,41.153332
2,12,Algeria,DZA,DZ,1.659626,28.033886
3,16,American Samoa,ASM,AS,-170.696182,-14.306021
4,20,Andorra,AND,AD,1.521801,42.506287
...,...,...,...,...,...,...
272,1833,Northern Africa and Western Asia,NAW,NW,,
273,1834,Australia/New Zealand,ANZ,ZL,,
274,1835,Oceania (excluding Australia and New Zealand),OCA,OZ,,
275,5500,Central Asia,CAS,CJ,,


<IPython.core.display.Javascript object>

### Geographical areas or locations With Aggregates

In [30]:
# Define target URL (first page).
base_url = "https://population.un.org/dataportalapi/api/v1/locationsWithAggregates?pageNumber=1"

# Number of pages to download. Only 3.
pages = 3

# Fetch every page in one loop (page 1 included) and keep one frame per page;
# the frames are concatenated once afterwards instead of on every iteration.
page_frames = []
for page in range(1, pages + 1):
    # Reset the target to the current page
    target = f"https://population.un.org/dataportalapi/api/v1/locationsWithAggregates?pageNumber={page}"

    # Call the API and fail fast on an HTTP error status (4xx/5xx).
    http_response = requests.get(target)
    http_response.raise_for_status()

    # Decode the JSON body; it is passed to pd.json_normalize as a whole.
    response = http_response.json()
    page_frames.append(pd.json_normalize(response))

# Combine all pages into one data frame with a fresh 0..n-1 index.
df = pd.concat(page_frames, ignore_index=True)

print("df.shape", df.shape, end="\n")

# Keep only the relevant columns and drop every row that has a missing value
# in any of them. dropna() already returns a new frame, so no explicit copy
# is needed.
df_copy = df[
    [
        "Id",
        "Name",
        "Iso2",
        "Iso3",
        "Longitude",
        "Latitude",
        "Region",
        "SubRegion",
        "WorldBankIncomeGroup",
        "UNDevelopmentGroup",
    ]
].dropna()
df_copy

df.shape (278, 12)


Unnamed: 0,Id,Name,Iso2,Iso3,Longitude,Latitude,Region,SubRegion,WorldBankIncomeGroup,UNDevelopmentGroup
0,4,Afghanistan,AF,AFG,67.709953,33.939110,Asia,Southern Asia,Low-income countries,Least developed countries
1,8,Albania,AL,ALB,20.168331,41.153332,Europe,Southern Europe,Upper-middle-income countries,Developed regions
2,12,Algeria,DZ,DZA,1.659626,28.033886,Africa,Northern Africa,Lower-middle-income countries,Other developing regions
3,16,American Samoa,AS,ASM,-170.696182,-14.306021,Oceania,Polynesia,Upper-middle-income countries,Other developing regions
4,20,Andorra,AD,AND,1.521801,42.506287,Europe,Southern Europe,High-income countries,Developed regions
...,...,...,...,...,...,...,...,...,...,...
232,862,Venezuela (Bolivarian Republic of),VE,VEN,-66.589729,6.423750,Latin America and the Caribbean,South America,No income group available,Other developing regions
233,876,Wallis and Futuna Islands,WF,WLF,-178.116501,-14.293800,Oceania,Polynesia,No income group available,Other developing regions
234,882,Samoa,WS,WSM,-172.104630,-13.759029,Oceania,Polynesia,Lower-middle-income countries,Other developing regions
235,887,Yemen,YE,YEM,48.516388,15.552727,Asia,Western Asia,Low-income countries,Least developed countries


<IPython.core.display.Javascript object>

In [31]:
# Collect the "Id" values of df_copy (the location ids) as strings
id_code = list(map(str, df_copy["Id"].values))

# Join them into one comma-separated string to be used in a later API call
id_code_string = ",".join(id_code)
id_code_string

'4,8,12,16,20,24,28,31,32,36,40,44,48,50,51,52,56,64,68,70,72,76,84,90,92,96,100,104,108,112,116,120,132,136,140,144,148,152,156,158,170,174,175,178,180,184,188,191,192,196,203,204,208,212,214,218,222,226,231,232,233,234,238,242,246,250,254,258,262,266,268,270,275,276,288,292,296,300,308,312,316,320,324,328,332,336,340,344,348,352,356,360,364,368,372,376,380,384,388,392,398,400,404,408,410,414,417,418,422,426,428,430,434,438,440,442,446,450,454,458,462,466,470,474,478,480,484,492,496,498,499,500,504,508,512,516,520,524,528,531,533,534,535,540,548,554,558,562,566,570,578,580,583,584,585,586,591,598,600,604,608,616,620,624,626,630,634,638,642,643,646,654,659,660,662,670,674,678,682,686,688,690,694,702,703,704,705,706,710,716,724,728,729,732,740,748,752,756,760,762,764,768,772,776,780,784,788,792,795,796,798,800,804,807,818,826,833,834,850,854,858,860,862,876,882,887,894'

<IPython.core.display.Javascript object>

Optionally, the locations can be filtered by `WorldBankIncomeGroup` (this step is not required).

In [32]:
# One frame per World Bank income group, selected by exact match on the
# group label stored in the "WorldBankIncomeGroup" column.
income_group = df_copy["WorldBankIncomeGroup"]

df_high_income = df_copy.loc[income_group.eq("High-income countries")]
df_low_income = df_copy.loc[income_group.eq("Low-income countries")]
df_upper_middle_income = df_copy.loc[income_group.eq("Upper-middle-income countries")]
df_lower_middle_income = df_copy.loc[income_group.eq("Lower-middle-income countries")]

<IPython.core.display.Javascript object>

* <mark>*Locations and Aggregate-locations differ in one country*</mark>
* <mark>*Aggregate-location has more features than Locations*</mark>

### Sources

In [33]:
# Define target URL.
base_url = "https://population.un.org/dataportalapi/api/v1/sources"

# Call the API and fail fast on an HTTP error status (4xx/5xx); without this
# check an error response only shows up later as a KeyError on response["data"].
http_response = requests.get(base_url)
http_response.raise_for_status()

# Decode the JSON body of the response.
response = http_response.json()

# Convert the records stored under the "data" key to a data frame.
df = pd.json_normalize(response["data"])

print("df.shape", df.shape, end="\n")

# Display only relevant data.
df[["name", "sourceYear", "startYear", "endYear", "url"]]

df.shape (26, 8)


Unnamed: 0,name,sourceYear,startYear,endYear,url
0,World Urbanization Prospects,2017,1950,2017,https://population.un.org/wup/
1,World Population Prospects,2018,1950,2018,https://population.un.org/wpp/
2,World Contraceptive Use,2017,1970,2017,http://www.un.org/en/development/desa/population/publications/dataset/contraception/wcu2017.shtml
3,Estimates and Projections of Family Planning Indicators,2017,1970,2030,http://www.un.org/en/development/desa/population/theme/family-planning/cp_model.shtml
4,World Fertility Data,2017,1950,2017,http://www.un.org/en/development/desa/population/publications/dataset/fertility/wfd2015.shtml
5,World Marriage Data,2017,1950,2017,http://www.un.org/en/development/desa/population/theme/marriage-unions/WMD2015.shtml
6,Estimates and Projections of the Number of Women Aged 15-49 Who Are Married or in a Union,2015,1970,2030,http://www.un.org/en/development/desa/population/theme/marriage-unions/marriage_estimates.shtml
7,Levels & Trends in Child Mortality Report 2015,2015,1990,2015,http://childmortality.org/
8,Trends in maternal mortality: 1990 to 2015,2015,1990,2015,http://www.who.int/reproductivehealth/publications/monitoring/maternal-mortality-2015/en/
9,Levels & Trends in Child Mortality Report 2018,2018,1990,2017,http://childmortality.org/


<IPython.core.display.Javascript object>

### Getting information

Let's create a helper function that takes a relative path as a parameter and returns a dataframe.

In [34]:
# Define a function that will take a relative path as an input, call the API, and return a dataframe
def callAPI(relative_path: str, topic_list: bool = False) -> pd.DataFrame:
    """Call the UN Data Portal API and return the response as a data frame.

    Parameters
    ----------
    relative_path : str
        Path appended to the API base URL, e.g. "/topics/Pop/indicators".
        Query string parameters may be included directly in this path.
    topic_list : bool, default False
        Only consulted when the response is not paginated: if True the JSON
        is flattened with ``pd.json_normalize``, otherwise it is passed to
        ``pd.DataFrame`` as is.

    Returns
    -------
    pd.DataFrame
        For a paginated response (a JSON object with "data" and "nextPage"
        keys), the records of all pages with a fresh 0..n-1 index. For any
        other response, the whole JSON converted as described above.

    Raises
    ------
    requests.HTTPError
        If any request comes back with a 4xx/5xx status code.

    Notes
    -----
    Errors raised while following "nextPage" links propagate to the caller.
    They are no longer swallowed by a bare ``except`` (which also caught
    ``KeyboardInterrupt``) and silently replaced by a frame built from the
    last JSON object.
    """
    base_url = "https://population.un.org/dataportalapi/api/v1"
    target = base_url + relative_path

    def _get_json(url):
        # Fetch one URL, fail fast on HTTP error statuses, decode the JSON body.
        response = requests.get(url)
        response.raise_for_status()
        return response.json()

    j = _get_json(target)

    # Non-paginated responses lack the "data"/"nextPage" envelope and are
    # converted as a whole.
    is_paginated = isinstance(j, dict) and "data" in j and "nextPage" in j
    if not is_paginated:
        if topic_list:
            return pd.json_normalize(j)
        return pd.DataFrame(j)

    # Collect one frame per page and concatenate once at the end, instead of
    # re-copying the growing frame on every page.
    page_frames = [pd.json_normalize(j["data"])]
    while j["nextPage"] is not None:
        j = _get_json(j["nextPage"])
        page_frames.append(pd.json_normalize(j["data"]))

    if len(page_frames) == 1:
        return page_frames[0]
    return pd.concat(page_frames, ignore_index=True)

<IPython.core.display.Javascript object>

### Let's get all the indicators from **Population**

In [35]:
# Uses the callAPI function to get the list of indicators under the "Pop" topic
df_pop_indicators = callAPI("/topics/Pop/indicators")

# Show only the id, description and source year range of each indicator
pop_indicator_columns = [
    "indicatorId",
    "indicatorDescription",
    "sourceStartYear",
    "sourceEndYear",
]
df_pop_indicators[pop_indicator_columns]

Unnamed: 0,indicatorId,indicatorDescription,sourceStartYear,sourceEndYear
0,53,The crude rate of natural change is the ratio of the natural change during the year (live births minus deaths) to the average population in that year. The value is expressed per 1 000 persons.,1950,2100
1,41,Female population of reproductive age (15-49 years),1950,2100
2,67,"Age that divides the population in two parts of equal size, that is, there are as many persons with ages above the median as there are with ages below the median. It is expressed as years.",1950,2100
3,52,"The difference between the number of live births and the number of deaths during the year. A positive natural change, also known as natural increase, occurs when live births outnumber deaths. A negative natural change, also named as natural decrease, occurs when live births are less numerous than deaths.",1950,2100
4,71,"Percentage of Total Population by various functional combination of age groups (0-14, 0-17, primary and secondary school ages, 15-24, 15-49, ..., 18+, 50+, etc.). De facto population as of 1 July of the year indicated. Figures are expressed per 100 population.",1950,2100
5,47,"Annual population by single age and by sex (interpolated data based on 5-year age groups and 5-year periods). De facto population as of 1 July of the year indicated classified by single age (0, 1, 2,.., 99, 100+).",1950,2100
6,46,"Annual population by five-year age groups and by sex (interpolated data based on 5-year periods). De facto population as of 1 July of the year indicated classified by five-year age groups (0-4, 5-9, 10-14,.., 95-99, 100+).",1950,2100
7,70,"De facto population as of 1 July of the year indicated classified by sex (male, female, both sexes combined) and by various functional combination of age groups (0-14, 0-17, primary and secondary school ages, 15-24, 15-49, ..., 18+, 50+, etc.). Data are presented in thousands.",1950,2100
8,50,Difference between the population sizes on 1 January of two consecutive years.,1950,2100
9,54,Number of persons per square Kilometer.,1950,2100


<IPython.core.display.Javascript object>

In [36]:
# Collect the "indicatorId" values of df_pop_indicators as strings
indicator_pop_codes = list(map(str, df_pop_indicators["indicatorId"].values))

# Join them into one comma-separated string to be used in a later API call
indicator_pop_string = ",".join(indicator_pop_codes)
indicator_pop_string

'53,41,67,52,71,47,46,70,50,54,51,72,49'

<IPython.core.display.Javascript object>

### Let's get all the indicators from **Mortality**

In [37]:
# Uses the callAPI function to get the list of indicators under the "Mort" topic
df_mort_indicators = callAPI("/topics/Mort/indicators")

# Show only the id, description and source year range of each indicator
mort_indicator_columns = [
    "indicatorId",
    "indicatorDescription",
    "sourceStartYear",
    "sourceEndYear",
]
df_mort_indicators[mort_indicator_columns]

Unnamed: 0,indicatorId,indicatorDescription,sourceStartYear,sourceEndYear
0,79,"Central death rate for the age interval (x, x+n) where x is the initial age and n is the length of the interval. It is obtained as the ratio of the number of deaths by the number of person-years of exposure of the same age group (for a specified time period).",1950,2100
1,80,Central death rate between ages x and x + 1. It is obtained as the ratio of the number of deaths by the number of person-years of exposure of the same age interval (for a specified time period).,1950,2100
2,59,Number of deaths over a given period divided by the person-years lived by the population over that period.,1950,2100
3,69,Number of deaths by single age and by sex over a given period.,1950,2100
4,64,Number of deaths by age groups and by sex over a given period.,1950,2100
5,61,The average number of years of life expected by a hypothetical cohort of individuals who would be subject throughout their lives to the age-specific mortality rates of a given period.,1950,2100
6,75,"Expectation of life at age x is the average number of years remaining to be lived by those surviving to that age, based on a given set of age-specific rates of dying. It is derived by dividing the total person-years that would be lived beyond age x by the number of persons who survived to that age interval (Tx / lx).",1950,2100
7,76,"Expectation of life at age x is the average number of years remaining to be lived by those surviving to that age, based on a given set of age-specific rates of dying. It is derived by dividing the total person-years that would be lived beyond age x by the number of persons who survived to that age interval (Tx / lx).",1950,2100
8,62,Probability of dying between the fifteen and fiftieth birthdays.,1950,2100
9,63,Probability of dying between the fifteen and sixtieth birthdays.,1950,2100


<IPython.core.display.Javascript object>

In [38]:
# Collect the "indicatorId" values of df_mort_indicators as strings
indicator_mort_codes = list(map(str, df_mort_indicators["indicatorId"].values))

# Join them into one comma-separated string to be used in a later API call
indicator_mort_string = ",".join(indicator_mort_codes)
indicator_mort_string

'79,80,59,69,64,61,75,76,62,63,81,82,77,78,60'

<IPython.core.display.Javascript object>

### Let's get the info based on the pop indicators and countries

In [41]:
base_url_UNPD = "https://population.un.org/dataportalapi/api/v1"
country = id_code_string  # set the country codes (comma-separated location ids)
indicator_code = 82  # set the indicator code <----- change this for each download
start_year = 1990  # set the start year
end_year = 2020  # set the end year
topick = (
    "mort"  # set the topic; only used in the parquet file name <----- change this for each download
)

# define the target URL
target = (
    base_url_UNPD
    + f"/data/indicators/{indicator_code}/locations/{country}/start/{start_year}/end/{end_year}"
)

response = requests.get(target)  # Call the API
response.raise_for_status()  # Fail fast on a 4xx/5xx status instead of a KeyError on j["data"]
j = response.json()  # Format response as JSON

# One frame per page; the pages are concatenated once after the loop so the
# growing result is not re-copied on every page of a long download.
page_frames = [pd.json_normalize(j["data"])]

# As long as the response contains information in the 'nextPage' field, the loop will continue to download data
while j["nextPage"] is not None:
    response = requests.get(j["nextPage"])
    response.raise_for_status()
    j = response.json()
    page_frames.append(pd.json_normalize(j["data"]))

df_UNPD = pd.concat(page_frames, ignore_index=True)

# Verifies that the number of records available from API call matches the length of the dataframe
# (left disabled, as in the original cell)
# assert (
#     len(df_UNPD) == j["total"]
# ), "DataFrame observations do not match total number of records in response"

# Make sure the output folder exists before writing, so a long download is
# not lost to a missing directory.
output_path = Path(f"../datasets/df_UNPD_{topick}_{indicator_code}.parquet")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_UNPD.to_parquet(output_path)

KeyboardInterrupt: 

<IPython.core.display.Javascript object>

To read the data inside the parquet file

In [140]:
# Load a previously saved download back into a data frame.
# NOTE(review): this file name has no topic segment, unlike the
# "df_UNPD_{topick}_{indicator_code}.parquet" pattern written by the download
# cell — confirm the file exists under this name.
read_data = pd.read_parquet("../datasets/df_UNPD_51.parquet")

Unnamed: 0,locationId,location,iso3,iso2,locationTypeId,indicatorId,indicator,indicatorDisplayName,sourceId,source,revision,variantId,variant,variantShortName,variantLabel,timeId,timeLabel,timeMid,categoryId,category,estimateTypeId,estimateType,estimateMethodId,estimateMethod,sexId,sex,ageId,ageLabel,ageStart,ageEnd,ageMid,value
0,4,Afghanistan,AFG,AF,4,51,Rate of population change,Crude rate of total population change,25,World Population Prospects,0,4,Median,Median,Median,41,1990,1990.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,3,Both sexes,188,Total,0,-1,0,-2.826
1,4,Afghanistan,AFG,AF,4,51,Rate of population change,Crude rate of total population change,25,World Population Prospects,0,4,Median,Median,Median,42,1991,1991.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,3,Both sexes,188,Total,0,-1,0,3.751
2,4,Afghanistan,AFG,AF,4,51,Rate of population change,Crude rate of total population change,25,World Population Prospects,0,4,Median,Median,Median,43,1992,1992.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,3,Both sexes,188,Total,0,-1,0,18.477
3,4,Afghanistan,AFG,AF,4,51,Rate of population change,Crude rate of total population change,25,World Population Prospects,0,4,Median,Median,Median,44,1993,1993.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,3,Both sexes,188,Total,0,-1,0,11.948
4,4,Afghanistan,AFG,AF,4,51,Rate of population change,Crude rate of total population change,25,World Population Prospects,0,4,Median,Median,Median,45,1994,1994.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,3,Both sexes,188,Total,0,-1,0,7.979
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7001,894,Zambia,ZMB,ZM,4,51,Rate of population change,Crude rate of total population change,25,World Population Prospects,0,4,Median,Median,Median,67,2016,2016.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,3,Both sexes,188,Total,0,-1,0,3.132
7002,894,Zambia,ZMB,ZM,4,51,Rate of population change,Crude rate of total population change,25,World Population Prospects,0,4,Median,Median,Median,68,2017,2017.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,3,Both sexes,188,Total,0,-1,0,3.095
7003,894,Zambia,ZMB,ZM,4,51,Rate of population change,Crude rate of total population change,25,World Population Prospects,0,4,Median,Median,Median,69,2018,2018.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,3,Both sexes,188,Total,0,-1,0,3.029
7004,894,Zambia,ZMB,ZM,4,51,Rate of population change,Crude rate of total population change,25,World Population Prospects,0,4,Median,Median,Median,70,2019,2019.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,3,Both sexes,188,Total,0,-1,0,2.986


<IPython.core.display.Javascript object>