In [1]:
# We are using black as a code formatter
%load_ext lab_black

In [2]:
# importng basic libraries
import requests
import json
import datetime
from pathlib import Path
import os

# importing installed libraries
import pandas as pd

In [3]:
# Lift pandas' display limits on column count, output width and cell text
# width by setting each option to None. The row limit ("display.max_rows")
# is deliberately left at its default.
for _display_option in (
    "display.max_columns",
    "display.width",
    "display.max_colwidth",
):
    pd.set_option(_display_option, None)

### API Naciones Unidas

#### The base path for accessing the API is:
base_path ["https://population.un.org/dataportalapi/api/v1"](https://population.un.org/dataportalapi/api/v1)

#### Most common reported status codes
* 200 : Successful request
* 400 : Bad request
* 404 : Input parameters not found
* 406 : Requested output format not allowed
* 500 : Server error

#### Structure of API response (json)

* pageNumber : the current page of the response, which may have multiple pages
* pageSize : the number of records returned on the current page (a maximum of 100 records will be returned)
* previousPage : the path to the previous page of the response when multiple pages are returned
* nextPage : the path to the next page of the response when multiple pages are returned
* pages : the total number of pages in the response
* total : the total number of records in the response
* data : the actual data returned in the response

### Las diferentes fuentes

In [4]:
# Endpoint that lists the data sources exposed by the API.
base_url = "https://population.un.org/dataportalapi/api/v1/sources"

# Query the endpoint and decode the JSON body of the response.
response = requests.get(base_url).json()

# Flatten the records stored under the "data" key into a data frame.
df = pd.json_normalize(response["data"])

print("df.shape", df.shape)

# Show the last three sources, restricted to the most informative columns.
source_columns = ["name", "sourceYear", "startYear", "endYear", "url"]
df.loc[:, source_columns].tail(3)

df.shape (26, 8)


Unnamed: 0,name,sourceYear,startYear,endYear,url
23,World Contraceptive Use 2022,2022,1950,2022,https://www.un.org/development/desa/pd/themes/family-planning
24,World Population Prospects,2022,1950,2100,https://population.un.org/wpp/
25,Estimates and Projections of Women of Reproductive Age Who Are Married or in a Union 2022,2022,1970,2030,https://www.un.org/development/desa/pd/content/fertility-and-marriage-0


### Los topicos

In [5]:
# Endpoint that lists the topics exposed by the API.
base_url = "https://population.un.org/dataportalapi/api/v1/topics"

# Query the endpoint and decode the JSON body of the response.
response = requests.get(base_url).json()

# Flatten the records stored under the "data" key into a data frame.
df = pd.json_normalize(response["data"])

print("df.shape", df.shape)

# Show just the full and the abbreviated topic names.
df.loc[:, ["name", "shortName"]]

df.shape (10, 4)


Unnamed: 0,name,shortName
0,Not applicable,
1,Population,Pop
2,Fertility,Fert
3,Mortality,Mort
4,International Migration,iMigration
5,Family Planning,FP
6,Marital Status,MarStat
7,All Components,All
8,Child Mortality,IGME
9,Maternal Mortality,MMEIG


### Los indicadores

In [6]:
# Endpoint that lists the indicators exposed by the API.
base_url = "https://population.un.org/dataportalapi/api/v1/indicators"

# Query the endpoint and decode the JSON body of the response.
# Only this first response is read here; "nextPage" is not followed.
response = requests.get(base_url).json()

# Flatten the records stored under the "data" key into a data frame.
df = pd.json_normalize(response["data"])

print("df.shape", df.shape)

# Show the identifying and descriptive columns of each indicator.
indicator_columns = [
    "id",
    "name",
    "description",
    "shortName",
    "topicName",
    "topicShortName",
]
df.loc[:, indicator_columns]

df.shape (60, 33)


Unnamed: 0,id,name,description,shortName,topicName,topicShortName
0,1,Contraceptive prevalence: Any method (Percent),Percentage of women of reproductive age (15-49 years) who are currently using any method of contraception,CPAnyP,Family Planning,FP
1,2,Contraceptive prevalence: Any modern method (Percent),Percentage of women of reproductive age (15-49 years) who are currently using any modern method of contraception,CPModP,Family Planning,FP
2,3,Contraceptive prevalence: Any traditional method (Percent),Percentage of women of reproductive age (15-49 years) who are currently using any traditional method of contraception,CPTrad,Family Planning,FP
3,4,Unmet need for family planning: Any method (Percent),Percentage of women of reproductive age (15-49 years) who want to stop or delay childbearing but are not using a method of contraception,UNMP,Family Planning,FP
4,5,Unmet need for family planning: Any modern method (Percent),Percentage of women of reproductive age (15-49 years) who want to stop or delay childbearing but are not using a modern method of contraception,UNMModP,Family Planning,FP
5,6,Total demand for family planning (Percent),Percentage of women of reproductive age (15-49 years) who are currently using any method of contraception or are having unmet need for family planning,DEMTot,Family Planning,FP
6,7,Demand for family planning satisfied by any method (Percent),Percentage of women of reproductive age (aged 15-49 years) who have their need for family planning satisfied with any methods,DEMAny,Family Planning,FP
7,8,Demand for family planning satisfied by any modern method (Percent),Percentage of women of reproductive age (aged 15-49 years) who have their need for family planning satisfied with modern methods,DEMMod,Family Planning,FP
8,9,Contraceptive prevalence: Any method (Number),Number of women of reproductive age (15-49 years) who are currently using any method of contraception,CPAnyN,Family Planning,FP
9,10,Contraceptive prevalence: Any modern method (Number),Number of women of reproductive age (15-49 years) who are currently using any modern method of contraception,CPModN,Family Planning,FP


### Areas geograficas

In [7]:
# Root of the API.
base_url = "https://population.un.org/dataportalapi/api/v1"

# Build the target URL: the locations endpoint.
target = base_url + "/locations/"

# Request the first page; the payload carries the records ("data") together
# with the pagination fields ("nextPage", ...).
response = requests.get(target)
# Fail loudly on an HTTP error instead of trying to parse an error body.
response.raise_for_status()
j = response.json()

# pd.json_normalize flattens nested structures inside the JSON records.
# Pages are collected in a list and concatenated once at the end, which avoids
# re-copying the growing frame on every iteration.
location_pages = [pd.json_normalize(j["data"])]

# Follow the "nextPage" links until the API reports that there are none left.
while j["nextPage"] is not None:
    # Point the target at the next page and download it.
    target = j["nextPage"]
    response = requests.get(target)
    response.raise_for_status()
    j = response.json()

    # Flatten this page and queue it for the final concatenation.
    df_temp = pd.json_normalize(j["data"])
    location_pages.append(df_temp)

# Stack every page into a single frame with a fresh 0..n-1 index.
df_locations = pd.concat(location_pages, ignore_index=True)

print("df.shape", df_locations.shape, end="\n")
df_locations.sample(7)

df.shape (284, 6)


Unnamed: 0,id,name,iso3,iso2,longitude,latitude
270,1500,Low-income countries,LIC,XM,,
86,328,Guyana,GUY,GY,-58.93018,4.860416
1,8,Albania,ALB,AL,20.168331,41.153332
230,858,Uruguay,URY,UY,-55.765835,-32.522778
0,4,Afghanistan,AFG,AF,67.709953,33.93911
143,531,Curaçao,CUW,CW,-68.990021,12.16957
142,528,Netherlands,NLD,NL,5.291266,52.132633


### Areas geograficas con información del Banco Mundial

In [10]:
# First page of the locations-with-aggregates endpoint.
base_url = "https://population.un.org/dataportalapi/api/v1/locationsWithAggregates?pageNumber=1"

# Download page 1; its JSON payload is flattened directly into the starting frame.
response = requests.get(base_url).json()
df = pd.json_normalize(response)

# Number of pages read from the endpoint (hard-coded: only 3).
pages = 3

# Fetch the remaining pages (2..pages) and stack each one under the rows read so far.
for page in range(2, pages + 1):
    # URL of the page handled in this iteration.
    target = f"https://population.un.org/dataportalapi/api/v1/locationsWithAggregates?pageNumber={page}"

    # Download and decode the page, then flatten it.
    response = requests.get(target).json()
    df_temp = pd.json_normalize(response)

    # Append the page to the accumulated frame, renumbering the index.
    df = pd.concat([df, df_temp], ignore_index=True)

print("df.shape", df.shape)

# Keep only the descriptive columns and discard every row that has a missing
# value in any of them.
location_columns = [
    "Id",
    "Name",
    "Iso2",
    "Iso3",
    "Longitude",
    "Latitude",
    "Region",
    "SubRegion",
    "WorldBankIncomeGroup",
    "UNDevelopmentGroup",
]
df_copy = df.loc[:, location_columns].copy().dropna()
# df_copy.to_parquet(f"../datasets/df_locationsWithAggregates.parquet")
df_copy.head(5)

df.shape (278, 12)


Unnamed: 0,Id,Name,Iso2,Iso3,Longitude,Latitude,Region,SubRegion,WorldBankIncomeGroup,UNDevelopmentGroup
0,4,Afghanistan,AF,AFG,67.709953,33.93911,Asia,Southern Asia,Low-income countries,Least developed countries
1,8,Albania,AL,ALB,20.168331,41.153332,Europe,Southern Europe,Upper-middle-income countries,Developed regions
2,12,Algeria,DZ,DZA,1.659626,28.033886,Africa,Northern Africa,Lower-middle-income countries,Other developing regions
3,16,American Samoa,AS,ASM,-170.696182,-14.306021,Oceania,Polynesia,Upper-middle-income countries,Other developing regions
4,20,Andorra,AD,AND,1.521801,42.506287,Europe,Southern Europe,High-income countries,Developed regions


### Convertimos los países escogidos en una lista de strings

In [72]:
# Collect the location ids (the "Id" column of df_copy) as strings.
id_code = list(map(str, df_copy["Id"].values))

# Join the ids with commas so they can be embedded in a later API call.
id_code_string = ",".join(id_code)
print(len(id_code_string))
id_code_string

879


'4,8,12,16,20,24,28,31,32,36,40,44,48,50,51,52,56,64,68,70,72,76,84,90,92,96,100,104,108,112,116,120,132,136,140,144,148,152,156,158,170,174,175,178,180,184,188,191,192,196,203,204,208,212,214,218,222,226,231,232,233,234,238,242,246,250,254,258,262,266,268,270,275,276,288,292,296,300,308,312,316,320,324,328,332,336,340,344,348,352,356,360,364,368,372,376,380,384,388,392,398,400,404,408,410,414,417,418,422,426,428,430,434,438,440,442,446,450,454,458,462,466,470,474,478,480,484,492,496,498,499,500,504,508,512,516,520,524,528,531,533,534,535,540,548,554,558,562,566,570,578,580,583,584,585,586,591,598,600,604,608,616,620,624,626,630,634,638,642,643,646,654,659,660,662,670,674,678,682,686,688,690,694,702,703,704,705,706,710,716,724,728,729,732,740,748,752,756,760,762,764,768,772,776,780,784,788,792,795,796,798,800,804,807,818,826,833,834,850,854,858,860,862,876,882,887,894'

* <mark>*Locations and Aggregate-locations differs in one country*</mark>
* <mark>*Aggregate-location has more features than Locations*</mark>

In [13]:
# The UN's API includes many entities which are not sovereign states,
# yet are listed as "Country". This list is a manual workaround for when
# one is only interested in countries by the classic definition: it holds
# the location names to be treated as "not a country".
not_countries = [
    "American Samoa",
    "Bermuda",
    "British Virgin Islands",
    "Cayman Islands",
    "Mayotte",
    "Cook Islands",
    "Faroe Islands",
    "Falkland Islands (Malvinas)",
    "French Guiana",
    "French Polynesia",
    "Gibraltar",
    "Greenland",
    "Guadeloupe",
    "Guam",
    "China, Hong Kong SAR",
    "China, Macao SAR",
    "Martinique",
    "Montserrat",
    "Curaçao",
    "Aruba",
    "Sint Maarten (Dutch part)",
    "Bonaire, Sint Eustatius and Saba",
    "New Caledonia",
    "Niue",
    "Northern Mariana Islands",
    "Puerto Rico",
    "Réunion",
    "Saint Helena",
    "Anguilla",
    "Saint Pierre and Miquelon",
    "Tokelau",
    "Turks and Caicos Islands",
    "Isle of Man",
    "United States Virgin Islands",
    "Wallis and Futuna Islands",
]

La documentación nos brinda una función de ayuda para extraer información.

In [14]:
# Define a function that will take a relative path as an input, call the API, and return a dataframe
def callAPI(
    relative_path: str, topic_list: bool = False, timeout: float = 60.0
) -> pd.DataFrame:
    """Call the UN Data Portal API and return the response as a data frame.

    Paginated responses (JSON objects that carry a "data" key) are followed
    through their "nextPage" links and all pages are stacked into one frame.
    Any other payload is converted directly, without pagination.

    Parameters
    ----------
    relative_path : str
        Path appended to the API base URL. Query string parameters may be
        included directly in this path.
    topic_list : bool, default False
        Only used for non-paginated payloads: when True the payload is
        flattened with ``pd.json_normalize``; otherwise it is passed to
        ``pd.DataFrame`` as is.
    timeout : float, default 60.0
        Timeout in seconds handed to ``requests.get`` for every request, so a
        silent server cannot block the notebook forever.

    Returns
    -------
    pd.DataFrame
        One row per record returned by the API.

    Raises
    ------
    requests.HTTPError
        If any request comes back with an error status code.
    """
    base_url = "https://population.un.org/dataportalapi/api/v1"
    target = base_url + relative_path

    response = requests.get(target, timeout=timeout)
    response.raise_for_status()
    j = response.json()

    # Non-paginated payloads (e.g. a bare JSON list) have no "data" envelope.
    # This is checked explicitly rather than with a blanket ``except`` so that
    # network or parsing failures are never mistaken for "not paginated".
    if not isinstance(j, dict) or "data" not in j:
        if topic_list:
            return pd.json_normalize(j)
        return pd.DataFrame(j)

    # Collect every page and concatenate once at the end; this avoids copying
    # the accumulated frame again for each new page.
    pages = [pd.json_normalize(j["data"])]
    while j.get("nextPage") is not None:
        response = requests.get(j["nextPage"], timeout=timeout)
        response.raise_for_status()
        j = response.json()
        pages.append(pd.json_normalize(j["data"]))

    return pd.concat(pages, ignore_index=True)

### Vamos a usar la función para encontrar todos los indicadores de Población

In [15]:
# Ask the callAPI helper for the indicators listed under the "Pop" (Population) topic.
df_pop_indicators = callAPI("/topics/Pop/indicators", topic_list=False)

# Show only the id, description and year coverage of each indicator.
df_pop_indicators.loc[
    :, ["indicatorId", "indicatorDescription", "sourceStartYear", "sourceEndYear"]
]

Unnamed: 0,indicatorId,indicatorDescription,sourceStartYear,sourceEndYear
0,53,The crude rate of natural change is the ratio of the natural change during the year (live births minus deaths) to the average population in that year. The value is expressed per 1 000 persons.,1950,2100
1,41,Female population of reproductive age (15-49 years),1950,2100
2,67,"Age that divides the population in two parts of equal size, that is, there are as many persons with ages above the median as there are with ages below the median. It is expressed as years.",1950,2100
3,52,"The difference between the number of live births and the number of deaths during the year. A positive natural change, also known as natural increase, occurs when live births outnumber deaths. A negative natural change, also named as natural decrease, occurs when live births are less numerous than deaths.",1950,2100
4,71,"Percentage of Total Population by various functional combination of age groups (0-14, 0-17, primary and secondary school ages, 15-24, 15-49, ..., 18+, 50+, etc.). De facto population as of 1 July of the year indicated. Figures are expressed per 100 population.",1950,2100
5,47,"Annual population by single age and by sex (interpolated data based on 5-year age groups and 5-year periods). De facto population as of 1 July of the year indicated classified by single age (0, 1, 2,.., 99, 100+).",1950,2100
6,46,"Annual population by five-year age groups and by sex (interpolated data based on 5-year periods). De facto population as of 1 July of the year indicated classified by five-year age groups (0-4, 5-9, 10-14,.., 95-99, 100+).",1950,2100
7,70,"De facto population as of 1 July of the year indicated classified by sex (male, female, both sexes combined) and by various functional combination of age groups (0-14, 0-17, primary and secondary school ages, 15-24, 15-49, ..., 18+, 50+, etc.). Data are presented in thousands.",1950,2100
8,50,Difference between the population sizes on 1 January of two consecutive years.,1950,2100
9,54,Number of persons per square Kilometer.,1950,2100


In [16]:
# Turn every Population indicator id into its string form.
indicator_pop_codes = list(map(str, df_pop_indicators["indicatorId"].values))

# Comma-separated string of ids, ready to be placed in a later API call.
indicator_pop_string = ",".join(indicator_pop_codes)
indicator_pop_string

'53,41,67,52,71,47,46,70,50,54,51,72,49'

### Vamos a usar la función para encontrar todos los indicadores de Mortalidad

In [18]:
# Ask the callAPI helper for the indicators listed under the "Mort" (Mortality) topic.
df_mort_indicators = callAPI("/topics/Mort/indicators", topic_list=False)

# Show only the id, description and year coverage of each indicator.
df_mort_indicators.loc[
    :, ["indicatorId", "indicatorDescription", "sourceStartYear", "sourceEndYear"]
]

Unnamed: 0,indicatorId,indicatorDescription,sourceStartYear,sourceEndYear
0,79,"Central death rate for the age interval (x, x+n) where x is the initial age and n is the length of the interval. It is obtained as the ratio of the number of deaths by the number of person-years of exposure of the same age group (for a specified time period).",1950,2100
1,80,Central death rate between ages x and x + 1. It is obtained as the ratio of the number of deaths by the number of person-years of exposure of the same age interval (for a specified time period).,1950,2100
2,59,Number of deaths over a given period divided by the person-years lived by the population over that period.,1950,2100
3,69,Number of deaths by single age and by sex over a given period.,1950,2100
4,64,Number of deaths by age groups and by sex over a given period.,1950,2100
5,61,The average number of years of life expected by a hypothetical cohort of individuals who would be subject throughout their lives to the age-specific mortality rates of a given period.,1950,2100
6,75,"Expectation of life at age x is the average number of years remaining to be lived by those surviving to that age, based on a given set of age-specific rates of dying. It is derived by dividing the total person-years that would be lived beyond age x by the number of persons who survived to that age interval (Tx / lx).",1950,2100
7,76,"Expectation of life at age x is the average number of years remaining to be lived by those surviving to that age, based on a given set of age-specific rates of dying. It is derived by dividing the total person-years that would be lived beyond age x by the number of persons who survived to that age interval (Tx / lx).",1950,2100
8,62,Probability of dying between the fifteen and fiftieth birthdays.,1950,2100
9,63,Probability of dying between the fifteen and sixtieth birthdays.,1950,2100


In [107]:
# Turn every Mortality indicator id into its string form.
indicator_mort_codes = list(map(str, df_mort_indicators["indicatorId"].values))

# Comma-separated string of ids, ready to be placed in a later API call.
indicator_mort_string = ",".join(indicator_mort_codes)
indicator_mort_string

'79,80,59,69,64,61,75,76,62,63,81,82,77,78,60'

### Así tendríamos una función para extraer los datos

In [20]:
# Parameters of the download: API root, locations, indicator and year range.
base_url_UNPD = "https://population.un.org/dataportalapi/api/v1"
country = "4,8"  # set the country code(s), comma-separated
indicator_code = 60  # set the indicator code <----- CHANGE THIS PER RUN
start_year = 1990  # set the start year
end_year = 2020  # set the end year
# Topic tag used only in the parquet file name <----- CHANGE THIS PER RUN
topick = "mort"

# define the target URL
target = (
    base_url_UNPD
    + f"/data/indicators/{indicator_code}/locations/{country}/start/{start_year}/end/{end_year}"
)

response = requests.get(target)  # Call the API
# Fail loudly on an HTTP error instead of hitting a KeyError on "data" below.
response.raise_for_status()
j = response.json()  # Format response as JSON

# Pages are collected in a list and concatenated once at the end, which avoids
# re-copying the growing frame on every iteration.
unpd_pages = [pd.json_normalize(j["data"])]

# As long as the response contains information in the 'nextPage' field, the loop will continue to download pages
while j["nextPage"] is not None:
    response = requests.get(j["nextPage"])
    response.raise_for_status()
    j = response.json()
    df_temp = pd.json_normalize(j["data"])
    unpd_pages.append(df_temp)

# Stack every page into a single frame with a fresh 0..n-1 index.
df_UNPD = pd.concat(unpd_pages, ignore_index=True)

df_UNPD

# Verifies that the number of records available from API call matches the length of the dataframe
# assert (
#     len(df_UNPD) == j["total"]
# ), "DataFrame observations do not match total number of records in response"

# df_UNPD.to_parquet(f"../datasets/df_UNPD_{topick}_{indicator_code}.parquet")

Unnamed: 0,locationId,location,iso3,iso2,locationTypeId,indicatorId,indicator,indicatorDisplayName,sourceId,source,revision,variantId,variant,variantShortName,variantLabel,timeId,timeLabel,timeMid,categoryId,category,estimateTypeId,estimateType,estimateMethodId,estimateMethod,sexId,sex,ageId,ageLabel,ageStart,ageEnd,ageMid,value
0,4,Afghanistan,AFG,AF,4,60,Total deaths by sex,Total deaths by sex,25,World Population Prospects,0,4,Median,Median,Median,41,1990,1990.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,1,Male,188,Total,0,-1,0,110102
1,4,Afghanistan,AFG,AF,4,60,Total deaths by sex,Total deaths by sex,25,World Population Prospects,0,4,Median,Median,Median,41,1990,1990.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,2,Female,188,Total,0,-1,0,93412
2,4,Afghanistan,AFG,AF,4,60,Total deaths by sex,Total deaths by sex,25,World Population Prospects,0,4,Median,Median,Median,41,1990,1990.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,3,Both sexes,188,Total,0,-1,0,203514
3,4,Afghanistan,AFG,AF,4,60,Total deaths by sex,Total deaths by sex,25,World Population Prospects,0,4,Median,Median,Median,42,1991,1991.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,1,Male,188,Total,0,-1,0,104274
4,4,Afghanistan,AFG,AF,4,60,Total deaths by sex,Total deaths by sex,25,World Population Prospects,0,4,Median,Median,Median,42,1991,1991.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,2,Female,188,Total,0,-1,0,88257
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181,8,Albania,ALB,AL,4,60,Total deaths by sex,Total deaths by sex,25,World Population Prospects,0,4,Median,Median,Median,70,2019,2019.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,2,Female,188,Total,0,-1,0,10232
182,8,Albania,ALB,AL,4,60,Total deaths by sex,Total deaths by sex,25,World Population Prospects,0,4,Median,Median,Median,70,2019,2019.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,3,Both sexes,188,Total,0,-1,0,24410
183,8,Albania,ALB,AL,4,60,Total deaths by sex,Total deaths by sex,25,World Population Prospects,0,4,Median,Median,Median,71,2020,2020.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,1,Male,188,Total,0,-1,0,18095
184,8,Albania,ALB,AL,4,60,Total deaths by sex,Total deaths by sex,25,World Population Prospects,0,4,Median,Median,Median,71,2020,2020.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,2,Female,188,Total,0,-1,0,12876


Lectura de un archivo parquet

In [21]:
# Load one raw parquet file from the local data folder and display it.
raw_parquet_path = "../../data/datos_brutos/df_TWB_AG.LND.FRST.ZS.parquet"
read_data = pd.read_parquet(raw_parquet_path)
read_data

Unnamed: 0,countryiso3code,date,value,unit,obs_status,decimal,indicator.id,indicator.value,country.id,country.value
0,AFE,2020,30.174186,,,1,AG.LND.FRST.ZS,Área selvática (% del área de tierra),ZH,
1,AFE,2019,30.391558,,,1,AG.LND.FRST.ZS,Área selvática (% del área de tierra),ZH,
2,AFE,2018,30.611444,,,1,AG.LND.FRST.ZS,Área selvática (% del área de tierra),ZH,
3,AFE,2017,30.824248,,,1,AG.LND.FRST.ZS,Área selvática (% del área de tierra),ZH,
4,AFE,2016,31.039613,,,1,AG.LND.FRST.ZS,Área selvática (% del área de tierra),ZH,
...,...,...,...,...,...,...,...,...,...,...
8241,ZWE,1994,48.190255,,,1,AG.LND.FRST.ZS,Área selvática (% del área de tierra),ZW,Zimbabwe
8242,ZWE,1993,48.309345,,,1,AG.LND.FRST.ZS,Área selvática (% del área de tierra),ZW,Zimbabwe
8243,ZWE,1992,48.428435,,,1,AG.LND.FRST.ZS,Área selvática (% del área de tierra),ZW,Zimbabwe
8244,ZWE,1991,48.547525,,,1,AG.LND.FRST.ZS,Área selvática (% del área de tierra),ZW,Zimbabwe


Revisamos una información básica de los dataframes

In [22]:
directory = "../../data/datos_brutos/"
# Iterate over each parquet file in the raw-data folder and print a banner with
# its name; the detailed diagnostics below can be re-enabled when needed.
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    # Only regular files with a .parquet extension are inspected.
    if os.path.isfile(f) and f.endswith(".parquet"):
        # Path.stem removes exactly the final ".parquet" suffix.
        # str.rstrip(".parquet") would instead strip any trailing run of the
        # characters ". p a r q u e t", truncating names such as
        # "df_OMS_M_Est_cig_curr" to "df_OMS_M_Est_cig_c".
        print("========" + Path(filename).stem.upper() + "========", end="\n")
        df = pd.read_parquet(f)
        # Treat empty strings as missing values.
        df.replace("", float("NaN"), inplace=True)
        # print("====Basic info about the dataframe====", end="\n")
        # print(df.info(), end="\n")
        # print("====Sum the null in each column====", end="\n")
        # print(df.isnull().sum(), end="\n")
        # print("====Describe numeric columns====", end="\n")
        # print(df.describe(), end="\n")
        # print("====Get a sample of 5 rows====", end="\n")
        # print(df.sample(5), end="\n")
        # print(end="\n")



In [23]:
# Split the locations by World Bank income group, one frame per group.
income_group = df_copy["WorldBankIncomeGroup"]
df_high_income = df_copy[income_group == "High-income countries"]
df_low_income = df_copy[income_group == "Low-income countries"]
df_upper_middle_income = df_copy[income_group == "Upper-middle-income countries"]
df_lower_middle_income = df_copy[income_group == "Lower-middle-income countries"]

Revisamos una información básica de los dataframes

In [24]:
# This cell repeats the raw-data inspection loop that appears earlier in the notebook.
directory = "../../data/datos_brutos/"
# Iterate over each parquet file in the raw-data folder and print a banner with
# its name; the detailed diagnostics below can be re-enabled when needed.
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    # Only regular files with a .parquet extension are inspected.
    if os.path.isfile(f) and f.endswith(".parquet"):
        # Path.stem removes exactly the final ".parquet" suffix.
        # str.rstrip(".parquet") would instead strip any trailing run of the
        # characters ". p a r q u e t", truncating names such as
        # "df_OMS_M_Est_cig_curr" to "df_OMS_M_Est_cig_c".
        print("========" + Path(filename).stem.upper() + "========", end="\n")
        df = pd.read_parquet(f)
        # Treat empty strings as missing values.
        df.replace("", float("NaN"), inplace=True)
        # print("====Basic info about the dataframe====", end="\n")
        # print(df.info(), end="\n")
        # print("====Sum the null in each column====", end="\n")
        # print(df.isnull().sum(), end="\n")
        # print("====Describe numeric columns====", end="\n")
        # print(df.describe(), end="\n")
        # print("====Get a sample of 5 rows====", end="\n")
        # print(df.sample(5), end="\n")
        # print(end="\n")



In [25]:
# NOTE(review): this whole cell is disabled scratch code. If it is revived, be
# aware that str.rstrip(".parquet") and str.lstrip("df_TWB_") strip *sets of
# characters*, not a suffix/prefix, so they can remove extra characters from
# the file names.
# directory = "../../data/datos_brutos/"
# files = [f for f in os.listdir(directory) if f.endswith(".parquet")]

# dfs = {file: pd.read_parquet(directory + file) for file in files}

# count = []

# for key, value in dfs.items():
#     if key.startswith("df_TWB"):
#         count.append(str(key.rstrip(".parquet").lstrip("df_TWB_")))
#         print(value.head())

# print(count)
# dfs["df_UNPD_mort_24.parquet"].loc[
#     :, dfs["df_UNPD_mort_24.parquet"].columns.str.fullmatch("indicator")
# ].mode()

In [26]:
# Maps each UNPD dataset name (the "df_UNPD_<topic>_<indicator id>" stem used
# for the parquet files) to a descriptive Spanish label for that indicator.
# NOTE(review): the label for "df_UNPD_pop_54" ends with a stray ")" — confirm
# whether downstream code relies on it before fixing.
# NOTE(review): the variable is spelled "UDPD" while the keys say "UNPD" —
# presumably a typo; verify usages before renaming.
UDPD = {
    "df_UNPD_mort_22": "tasa_mortalidad_infantil",
    "df_UNPD_pop_54": "densidad_población_por_kilómetro_cuadrado)",
    "df_UNPD_imigrt_65": "migración_neta_total",
    "df_UNPD_pop_49": "población_total_por_sexo",
    "df_UNPD_mort_60": "total_muertes_por_sexo",
    "df_UNPD_pop_53": "tasa_bruta_cambio_natural_población",
    "df_UNPD_imigrt_66": "tasa_bruta_migración_neta",
    "df_UNPD_pop_72": "proporción_sexos_población_total",
    "df_UNPD_fam_1": "prevalencia_anticonceptivos_porcentaje",
    "df_UNPD_pop_67": "mediana_edad_población",
    "df_UNPD_mort_59": "tasa_bruta_mortalidad_por_1000_habitantes",
    "df_UNPD_pop_51": "tasa_bruta_variación_total_población",
    "df_UNPD_pop_50": "cambio_de_la_población",
    "df_UNPD_pop_41": "población_femenina_edad_reproductiva_(15-49 años)",
    "df_UNPD_mort_24": "tasa_mortalidad_menores_cinco_años",
    "df_UNPD_pop_52": "cambio_natural_población",
    "df_UNPD_fert_19": "tasa_fertilidad",
    "df_UNPD_marstat_42": "estado_civil_casado_porcentaje",
}

In [27]:
# Maps World Bank indicator codes (e.g. "AG.LND.FRST.ZS") to a short
# descriptive Spanish label for that indicator.
# NOTE(review): presumably used to rename columns/files downstream — the
# consumer is not visible here; verify before changing any label.
WB = {
    "SP.DYN.LE00.IN": "esperanza_vida_total",
    "SP.DYN.LE00.FE.IN": "esperanza_vida_mujeres",
    "SP.DYN.LE00.MA.IN": "esperanza_vida_varones",
    "SI.POV.GINI": "índice_gini",
    "SE.XPD.TOTL.GD.ZS": "gasto_púb_educacion_pje",
    "SE.COM.DURS": "duración_educ_obligatoria",
    "NY.GDP.PCAP.CD": "pib_pc_usd_actuales",
    "NY.GDP.MKTP.PP.CD": "pib_ppa_prec_inter",
    "IQ.SCI.OVRL": "capacidad_estadística",
    "SP.POP.TOTL.FE.ZS": "población_mujeres_pje",
    "SP.POP.TOTL.MA.ZS": "población_hombres_pje",
    "NY.GDP.PCAP.PP.CD": "pib_pc_prec_inter",
    "AG.LND.FRST.ZS": "porcentaje_de_bosque",
    "EN.ATM.CO2E.PC": "emisiones_co2",
    "SH.XPD.CHEX.PC.CD": "inversion_salud_percapita",
    "SH.MED.BEDS.ZS": "camas_hospitales_c/1000personas",
    "SP.DYN.IMRT.IN": "mortalidad_infantil_c/1000nacimientos",
    "SH.H2O.BASW.ZS": "acceso_agua_potable(%)",
    "SH.STA.BASS.ZS": "acceso_servicios_sanitarios(%)",
    "SH.STA.SUIC.P5": "tasa_mortalidad_suicidio_c/100.000",
    "SL.UEM.TOTL.ZS": "tasa_desempleo",
    "SP.URB.TOTL.IN.ZS": "tasa_poblacion_urbana",
    "NY.GNP.PCAP.CD": "INB_percapita",
}

In [28]:
# Maps each OMS dataset name ("df_OMS_<indicator code>") to a short descriptive
# Spanish label for that indicator.
# NOTE(review): presumably the keys match parquet file stems, as with the UNPD
# files — confirm against the data folder.
OMS = {
    "df_OMS_NUTRITION_ANAEMIA_CHILDREN_PREV": "tasa_anemia_niños(%)",
    "df_OMS_NUTRITION_ANAEMIA_REPRODUCTIVEAGE_PREV": "tasa_anemia_mujeres(%)",
    "df_OMS_M_Est_cig_curr": "tasa_consumo_cigarro(%)",
    "df_OMS_SA_0000001688": "tasa_consumo_alcohol(L)",
    "df_OMS_NCD_BMI_30A": "tasa_obesidad_pob(%)",
}

## Data Lake

![Alt text](https://linuxaria.com/wp-content/uploads/2016/07/gitlfs-768x403.png "a title")

Las APIs usadas para la extracción son las de las Naciones Unidas, el Banco Mundial y la Organización Mundial de la Salud (OMS); cada una de ellas tiene una documentación específica para cada endpoint.

Todas las API tienen como puntos principales la fuente de los datos, los tópicos (como Población, Fertilidad y Mortalidad, entre otros) y una gran cantidad de indicadores, los cuales nos ayudan a extraer la información de manera rápida y sencilla dando como parámetros los países y el rango de años.

Algunos problemas que surgieron en la extracción de los datos fueron que la documentación de algunas API está desactualizada y que algunas veces el servidor no daba respuesta.
 
Por tal motivo decidimos usar git-lfs como un data lake para no depender específicamente de la API, y guardar los países preseleccionados, dentro del periodo de 1990 a 2020, para cada uno de los indicadores en archivos en formato parquet, un formato binario diseñado para admitir esquemas de compresión y codificación muy eficientes.

De esta manera podemos acceder a los datos de manera local y de manera remota.



In [37]:
# Load the pre-processed table of countries and expose its "id" column under
# the name "iso3".
pdm = pd.read_parquet(
    "../../data/datos_pre_procesados/paises_del_mundo.parquet"
).rename(columns={"id": "iso3"})

In [38]:
# Display the country table after the "id" -> "iso3" rename.
pdm

Unnamed: 0,iso3,iso2Code,name,longitude,latitude,region.id,region.value
0,ABW,AW,Aruba,-70.0167,12.5167,LCN,América Latina y el Caribe
1,AFG,AF,Afganistán,69.1761,34.5228,SAS,Asia meridional
2,AGO,AO,Angola,13.242,-8.81155,SSF,África al sur del Sahara
3,ALB,AL,Albania,19.8172,41.3317,ECS,Europa y Asia central
4,AND,AD,Andorra,1.5218,42.5075,ECS,Europa y Asia central
...,...,...,...,...,...,...,...
170,USA,US,Estados Unidos,-95.712891,37.0902,NAC,América del Norte
171,UZB,UZ,Uzbekistán,69.269,41.3052,ECS,Europa y Asia central
172,VEN,VE,Venezuela,-69.8371,9.08165,LCN,América Latina y el Caribe
173,YEM,YE,"Yemen, Rep. del",44.2075,15.352,MEA,Oriente Medio y Norte de África


In [39]:
# Display the UN locations table downloaded earlier, for comparison with pdm.
df_locations

Unnamed: 0,id,name,iso3,iso2,longitude,latitude
0,4,Afghanistan,AFG,AF,67.709953,33.939110
1,8,Albania,ALB,AL,20.168331,41.153332
2,12,Algeria,DZA,DZ,1.659626,28.033886
3,16,American Samoa,ASM,AS,-170.696182,-14.306021
4,20,Andorra,AND,AD,1.521801,42.506287
...,...,...,...,...,...,...
279,1833,Northern Africa and Western Asia,NAW,NW,,
280,1834,Australia/New Zealand,ANZ,ZL,,
281,1835,Oceania (excluding Australia and New Zealand),OCA,OZ,,
282,5500,Central Asia,CAS,CJ,,


In [96]:
# Left-join the UN locations onto the country table by ISO3 code: every row of
# pdm is kept, and columns present in both frames get the _x / _y suffixes.
numbers_of_countries = pd.merge(pdm, df_locations, how="left", on="iso3")
numbers_of_countries

Unnamed: 0,iso3,iso2Code,name_x,longitude_x,latitude_x,region.id,region.value,id,name_y,iso2,longitude_y,latitude_y
0,ABW,AW,Aruba,-70.0167,12.5167,LCN,América Latina y el Caribe,533,Aruba,AW,-69.968338,12.521110
1,AFG,AF,Afganistán,69.1761,34.5228,SAS,Asia meridional,4,Afghanistan,AF,67.709953,33.939110
2,AGO,AO,Angola,13.242,-8.81155,SSF,África al sur del Sahara,24,Angola,AO,17.873886,-11.202692
3,ALB,AL,Albania,19.8172,41.3317,ECS,Europa y Asia central,8,Albania,AL,20.168331,41.153332
4,AND,AD,Andorra,1.5218,42.5075,ECS,Europa y Asia central,20,Andorra,AD,1.521801,42.506287
...,...,...,...,...,...,...,...,...,...,...,...,...
170,USA,US,Estados Unidos,-95.712891,37.0902,NAC,América del Norte,840,United States of America,US,-95.712891,37.090240
171,UZB,UZ,Uzbekistán,69.269,41.3052,ECS,Europa y Asia central,860,Uzbekistan,UZ,64.585258,41.377491
172,VEN,VE,Venezuela,-69.8371,9.08165,LCN,América Latina y el Caribe,862,Venezuela (Bolivarian Republic of),VE,-66.589729,6.423750
173,YEM,YE,"Yemen, Rep. del",44.2075,15.352,MEA,Oriente Medio y Norte de África,887,Yemen,YE,48.516388,15.552727


In [114]:
# Collect the location ids (the "id" column) of the merged countries as strings.
# numbers_of_countries comes from a left merge, so any row without a match in
# df_locations carries NaN in "id" and the column is upcast to float. Drop those
# rows and cast back to int so the ids render as "4" rather than "4.0" / "nan",
# which would corrupt the API URL built from this string.
numbers_of_countries_list = (
    numbers_of_countries["id"].dropna().astype(int).astype(str).tolist()
)

# Join the location ids into one comma-separated string for the later API call
numbers_of_countries_string = ",".join(numbers_of_countries_list)
numbers_of_countries_string

'533,4,24,8,20,784,32,51,16,28,36,40,31,108,56,204,854,50,100,48,44,70,112,84,60,68,76,52,96,64,72,140,124,756,152,156,384,120,180,178,170,174,132,188,192,136,196,203,276,262,212,208,214,12,218,818,232,724,233,231,246,242,250,234,583,266,826,268,288,324,270,624,226,300,308,304,320,316,328,344,340,191,332,348,360,833,356,372,364,368,352,376,380,388,400,392,398,404,417,116,296,659,410,414,418,422,430,434,662,438,144,426,440,442,428,446,504,498,450,462,484,584,807,466,104,499,496,508,478,480,454,458,516,562,566,558,524,586,604,608,585,598,408,600,643,646,729,686,90,694,222,706,688,728,678,740,748,760,148,768,764,762,795,626,776,788,792,834,800,804,840,860,862,887,894'

In [115]:
# Download one indicator for all selected locations from the UN Population
# Data Portal API, following the paginated response until it is exhausted.
base_url_UNPD = "https://population.un.org/dataportalapi/api/v1"
country = numbers_of_countries_string  # comma-separated location ids to query
indicator_code = 60  # set the indicator code <----- CHANGE THIS PER RUN
start_year = 1990  # set the start year
end_year = 2020  # set the end year
topick = (
    "mort"  # set the topic used in the parquet file name <----- CHANGE THIS PER RUN
)

# define the target URL
target = (
    base_url_UNPD
    + f"/data/indicators/{indicator_code}/locations/{country}/start/{start_year}/end/{end_year}"
)

response = requests.get(target)  # Call the API
response.raise_for_status()  # Surface HTTP errors instead of failing later on JSON parsing
j = response.json()  # Format response as JSON

# Accumulate every page in a list and concatenate once at the end; growing the
# DataFrame with pd.concat inside the loop re-copies all previous rows each time.
pages = [pd.json_normalize(j["data"])]

# As long as the response contains information in the 'nextPage' field, keep downloading pages
while j["nextPage"] is not None:
    response = requests.get(j["nextPage"])
    response.raise_for_status()
    j = response.json()
    pages.append(pd.json_normalize(j["data"]))

df_UNPD = pd.concat(pages, ignore_index=True)

df_UNPD

# Verifies that the number of records available from API call matches the length of the dataframe
# assert (
#     len(df_UNPD) == j["total"]
# ), "DataFrame observations do not match total number of records in response"

# df_UNPD.to_parquet(f"../datasets/df_UNPD_{topick}_{indicator_code}.parquet")

Unnamed: 0,locationId,location,iso3,iso2,locationTypeId,indicatorId,indicator,indicatorDisplayName,sourceId,source,revision,variantId,variant,variantShortName,variantLabel,timeId,timeLabel,timeMid,categoryId,category,estimateTypeId,estimateType,estimateMethodId,estimateMethod,sexId,sex,ageId,ageLabel,ageStart,ageEnd,ageMid,value
0,4,Afghanistan,AFG,AF,4,60,Total deaths by sex,Total deaths by sex,25,World Population Prospects,0,4,Median,Median,Median,41,1990,1990.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,1,Male,188,Total,0,-1,0,110102
1,4,Afghanistan,AFG,AF,4,60,Total deaths by sex,Total deaths by sex,25,World Population Prospects,0,4,Median,Median,Median,41,1990,1990.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,2,Female,188,Total,0,-1,0,93412
2,4,Afghanistan,AFG,AF,4,60,Total deaths by sex,Total deaths by sex,25,World Population Prospects,0,4,Median,Median,Median,41,1990,1990.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,3,Both sexes,188,Total,0,-1,0,203514
3,4,Afghanistan,AFG,AF,4,60,Total deaths by sex,Total deaths by sex,25,World Population Prospects,0,4,Median,Median,Median,42,1991,1991.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,1,Male,188,Total,0,-1,0,104274
4,4,Afghanistan,AFG,AF,4,60,Total deaths by sex,Total deaths by sex,25,World Population Prospects,0,4,Median,Median,Median,42,1991,1991.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,2,Female,188,Total,0,-1,0,88257
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16270,894,Zambia,ZMB,ZM,4,60,Total deaths by sex,Total deaths by sex,25,World Population Prospects,0,4,Median,Median,Median,70,2019,2019.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,2,Female,188,Total,0,-1,0,55091
16271,894,Zambia,ZMB,ZM,4,60,Total deaths by sex,Total deaths by sex,25,World Population Prospects,0,4,Median,Median,Median,70,2019,2019.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,3,Both sexes,188,Total,0,-1,0,120730
16272,894,Zambia,ZMB,ZM,4,60,Total deaths by sex,Total deaths by sex,25,World Population Prospects,0,4,Median,Median,Median,71,2020,2020.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,1,Male,188,Total,0,-1,0,67851
16273,894,Zambia,ZMB,ZM,4,60,Total deaths by sex,Total deaths by sex,25,World Population Prospects,0,4,Median,Median,Median,71,2020,2020.5,0,Not applicable,1,Model-based Estimates,2,Interpolation,2,Female,188,Total,0,-1,0,57092
