# High Resolution Population Density Maps

**Objective:** <br>
This notebook walks step by step through querying and adjusting the population layers from Meta and CIESIN published on the [HDX](https://data.humdata.org/organization/facebook?q=high%20resolution%20population%20density&ext_page_size=100) data-for-good portal. <br><br>

Author: Laura Goyeneche, Consultant SPH, lauragoy@iadb.org <br>
Created: March 20, 2023

## 1. Basics

In [1]:
%%capture
# Libraries
import os 
import re
import time
import dotenv
import requests
import numpy as np
import pandas as pd
import geopandas as gpd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from geopandas.tools import sjoin

In [2]:
# Working environments
# Load the variables declared in the local .env file, then read the two settings
# used by this notebook. `scldatalake` is the prefix prepended to every data lake
# path below; either value is None when the variable is not defined.
dotenv.load_dotenv()
sclbucket, scldatalake = (os.getenv(name) for name in ("sclbucket", "scldatalake"))

## 2. Inputs

### 2.1. Country names

In [44]:
# Import country names
# Requires `scldatalake` (read from the environment) as the path prefix.
file  = "Manuals and Standards/IADB country and area codes for statistical use/IADB_country_codes_admin_0.xlsx"
path  = scldatalake + file
data  = pd.read_excel(path, engine='openpyxl')

# Select rows/columns of interest: keep only the rows that have an IADB region code.
# `.copy()` makes `data` an independent frame, so the column assignments below
# do not raise pandas' SettingWithCopyWarning.
data = data.loc[~data.iadb_region_code.isna(), ['isoalpha3','country_name_es']].copy()

# Replace values
# Build an accent-free version of the Spanish name: NFKD splits accented letters
# into base letter + combining mark, and the ASCII round trip drops the marks.
data['country_name_en'] = (
    data['country_name_es']
    .str.normalize('NFKD')
    .str.encode('ascii', errors = 'ignore')
    .str.decode('utf-8')
)

# Replace country names
# Keys are the accent-stripped Spanish names produced above; values are the
# English names. Names not listed here are left unchanged.
country_ = {
    "Belice": "Belize",
    "Bolivia (Estado Plurinacional de)": "Bolivia",
    "Brasil": "Brazil",
    "Venezuela (Republica Bolivariana de)": "Venezuela",
    "Republica Dominicana": "Dominican Republic",
    "Trinidad y Tabago": "Trinidad and Tobago",
}
data['country_name_en'] = data['country_name_en'].replace(country_)

# Sort dataset
data  = data.sort_values(by = "isoalpha3")
codes = data.reset_index(drop = True)

### 2.2. Population

To query a country dataset, select its [iso-alpha3 country code](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3) and the population group of interest: `total_population`, `women`, `men`, `children_under_five`, `youth_15_24`, `elderly_60_plus`, or `women_of_reproductive_age_15_49`. 

In [68]:
# Get latest population density maps name files 
# Request HTML content
# The dataset page URL is built from the lower-cased, hyphenated English country
# name that matches `code` in the `codes` table.
code     = "COL"
geo      = codes[codes.isoalpha3 == code].country_name_en.values[0].lower().replace(" ","-")
url      = f"https://data.humdata.org/dataset/{geo}-high-resolution-population-density-maps-demographic-estimates"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error (e.g. a 404 for a wrong country
# slug) instead of failing later with an opaque IndexError while parsing.
response = requests.get(url, timeout = 60)
response.raise_for_status()
html     = response.content
soup     = BeautifulSoup(html, "html5lib")

# Find file names
# Walk the page: first resource list -> its resource items -> the href of the
# first link inside each item's button group. After this cell `url` is a list
# of hrefs (one per resource), no longer the page URL.
soup     = soup.find_all('ul', attrs = {"class":"hdx-bs3 resource-list"})
soup     = soup[0].find_all('li', attrs = {"class":"resource-item"})
url      = [
    item.find_all('div', attrs = {"class":"hdx-btn-group hdx-btn-group-fixed"})[0].find('a')['href']
    for item in soup
]

In [5]:
# Processing URL
# Keep only the links that mention "csv" and prefix each one with the HDX host.
url = [f"https://data.humdata.org{href}" for href in url if "csv" in href]

In [6]:
# Processing file names
def _standardize_file_name(link, code):
    """Derive a standardized file name from a resource download link.

    The steps run in a fixed order, each on the result of the previous one:
    keep the last path segment, normalize the extension (`_csv` -> `.csv`,
    `.zip` -> `.gz`), drop every `_<digits>` / `-<digits>` run, map the
    `population` / `general` labels to `total_population`, prefix the name
    with `code`, remove the lower-case country code embedded in the original
    name, and restore the age ranges removed with the digits.

    Example: a link ending in `col_general_2020_csv.zip` with code `"COL"`
    yields `COL_total_population.csv.gz`.

    Parameters
    ----------
    link : str
        Download URL of one resource.
    code : str
        Country code used as prefix; its lower-case form is stripped from the name.

    Returns
    -------
    str
        The standardized file name.
    """
    name = link.split("/")[-1]
    name = name.replace("_csv",".csv")
    name = name.replace(".zip",".gz")
    # Raw string: "\d" in a regular string literal is an invalid escape sequence.
    name = re.sub(r"(_|-)\d+", "", name)
    name = name.replace("population","total_population")
    name = name.replace("general","total_population")
    name = f"{code}_{name}"
    name = name.replace(f"{code.lower()}_","")
    name = name.replace(f"_{code.lower()}","")
    # The digit removal above also stripped the age ranges; put them back.
    name = name.replace("elderly_plus","elderly_60_plus")
    name = name.replace("youth","youth_15_24")
    name = name.replace("women_of_reproductive_age","women_of_reproductive_age_15_49")
    return name

# `files` is aligned with `url`: files[i] is the standardized name of url[i].
files = [_standardize_file_name(item, code) for item in url]

In [7]:
# Import data
# Select group of interest
group = "total_population"

# Plain substring matching is ambiguous: "men" is contained in "women", and
# "women" in "women_of_reproductive_age_15_49". Prefer the file whose name
# (without extensions) is exactly "<code>_<group>", and only fall back to the
# first substring match when no exact name exists.
matches = [name for name in files if group in name]
target  = f"{code}_{group}".lower()
exact   = [name for name in matches if name.split(".")[0].lower() == target]
item    = (exact or matches)[0]

# `files` and `url` are aligned, so the position of the name gives its download link.
path    = url[files.index(item)]
pop     = pd.read_csv(path)

In [8]:
# Keep variables of interest
# Keep most recent population estimation 
# Candidate value columns are the ones that are not coordinates. The test is on
# the start of the name: a substring test for "lat" would also reject every
# column containing "population" (popu-LAT-ion) and leave no candidate.
temp = [name for name in pop.columns if not name.startswith(("lat", "lon"))]
if not temp:
    raise ValueError("No population column found besides the coordinate columns")

# Between the last two candidates keep the one whose name sorts last
# (presumably names end with the estimation year — TODO confirm);
# a single candidate is kept as is.
var_ = max(temp[-2:])

# Select variables of interest
vars_ = ["latitude","longitude",var_]
pop   = pop[vars_]
pop   = pop.rename(columns = {var_:"population"})

# Rename variables
# Strip any "_<digits>" suffix from the column names (raw string avoids the
# invalid "\d" escape). With the three columns kept above this leaves the
# names unchanged.
pop.columns = [re.sub(r"_\d+", "", name) for name in pop.columns]

In [9]:
# Convert population .csv to .gpd
# Build one point per row (x = longitude, y = latitude) and attach the points
# to a copy of `pop`, declaring the coordinates as EPSG:4326.
geometry = gpd.points_from_xy(x = pop["longitude"], y = pop["latitude"])
pop_geo  = gpd.GeoDataFrame(data = pop.copy(), crs = 4326, geometry = geometry)

### 2.3. Country shapefile

In [10]:
# Import data
# From this cell on `code` holds the lower-case country code, which is the form
# used in the level-0 boundary file name.
code = code.lower()
file = f"Geospatial Basemaps/Cartographic Boundary Files/LAC-26/level-0/{code}-level-0.shp"
path = "".join([scldatalake, file])
shp_ = gpd.read_file(path)

## 3. Map adjustment

In [11]:
# Keep points inside country/region of interest
# Clip the population points to the country boundary read from the shapefile.
# NOTE(review): assumes `shp_` uses the same CRS as `pop_geo` (EPSG:4326) — confirm.
pop_geo_adj = gpd.clip(gdf = pop_geo, mask = shp_)

In [12]:
%%capture
# Export to Data Lake as .csv.gz 
file = pop_geo_adj.copy()
file = file.drop(columns = "geometry")
path = "De3velopment Data Partnership/Facebook - High resolution population density map/public-fb-data/csv/"
path = scldatalake + f"{path}/{code.upper()}/{item}"
file.to_csv(path, compression = 'gzip')

In [13]:
# New data 
# Shape of the clipped dataset and its total population, one per line.
print(pop_geo_adj.shape, pop_geo_adj["population"].sum(), sep = "\n")

(6977167, 4)
50830740.49238202


In [118]:
# Test update dataset
# Before: 3.5M x 3, 48M
# Read the Colombia total-population file stored in the data lake
# (tab-separated) and print its shape and total population for comparison
# with the clipped dataset.
stored_file = "Development Data Partnership/Facebook - High resolution population density map/public-fb-data/csv/COL/COL_total_population.csv.gz"
temp = pd.read_csv(scldatalake + stored_file, sep = "\t")

print(temp.shape, temp["population"].sum(), sep = "\n")

(3523846, 3)
48228262.601337105
