In [1]:
import numpy as np
import xarray as xr
import pandas as pd
import os

## Geospatial data for population growth in African countries
This is a short script to process the Gridded Population of the World (GPW) Population Count, v4.11 (2000, 2005, 2010, 2015, 2020) dataset, which is gridded at 2.5 arc-minutes (roughly 5 km at the equator). You can download the dataset [here](https://sedac.ciesin.columbia.edu/data/set/gpw-v4-population-count-rev11/data-download). It is also part of the Earth Engine data catalog [here](https://developers.google.com/earth-engine/datasets/catalog/CIESIN_GPWv411_GPW_Population_Count?hl=en#citations), and it could also be accessed through an API.

### References

Center for International Earth Science Information Network - CIESIN - Columbia University. 2018. Gridded Population of the World, Version 4 (GPWv4): Population Count, Revision 11. Palisades, NY: NASA Socioeconomic Data and Applications Center (SEDAC). https://doi.org/10.7927/H4JW8BX5. Accessed 24 Mar 2023.

In [58]:
# Locations of the population NetCDF file and of the national-identifier lookup table
data_pop = 'gpw-v4-population-count-rev11_totpop_2pt5_min_nc/gpw_v4_population_count_rev11_2pt5_min.nc'
countries_lookup_path = 'gpw-v4-population-count-rev11_totpop_2pt5_min_nc/gpw_v4_national_identifier_grid_rev11_lookup.txt'

# The country reference table is a tab-separated text file
countries_df = pd.read_csv(countries_lookup_path, sep='\t')

# The gridded dataset has three dimensions: longitude x latitude x raster
ds = xr.open_dataset(data_pop)

# Human-readable labels of the raster layers, listed in raster order.
# The raster index is 1-based, hence the enumerate(..., start=1) below.
raster_labels = [
    'Adjusted-Population-Count_2000',
    'Adjusted-Population-Count_2005',
    'Adjusted-Population-Count_2010',
    'Adjusted-Population-Count_2015',
    'Adjusted-Population-Count_2020',
    'Data-Quality_Data-Context_2010',
    'Data-Quality_Mean-Administrative-Unit-Area_2010',
    'Data-Quality_Water-Mask_2010',
    'Land-Area_2010',
    'Water-Area_2010',
    'National-Identifier-Grid_2010',
    'Data-Code_2010',
    'Input-Data-Year_2010',
    'Input-Data-Level_2010',
    'Input-Data-Sex_2010',
    'Input-Data-Age_2010',
    'Growth-Rate-Start-Year_2010',
    'Growth-Rate-End-Year_2010',
    'Growth_Rate-Admin-Level_2010',
    'Year-most-recent-Census_2010',
]

# Mapping from raster index (1..20) to its label
raster_name = dict(enumerate(raster_labels, start=1))

In [137]:
# Country level is the finest identifier available in the lookup table, so the data is
# pre-filtered by country here; the countries are those of the chosen top 20 cities.
countries_to_filter = [
    'AGO', 'CMR', 'CIV', 'EGY', 'ETH',
    'MAR', 'NGA', 'ZAF', 'SDN', 'TZA',
    'COD', 'MDG', 'KEN',
]

# Boolean mask of the lookup rows whose ISO code is one of the selected countries
is_selected_country = countries_df['ISOCODE'].isin(countries_to_filter)
filtered_countries_df = countries_df.loc[is_selected_country]

# Numeric identifiers ("Value" column) of the selected countries
filtered_countries_list = filtered_countries_df['Value'].tolist()

In [138]:
# Display the lookup rows of the selected countries, to check which of the requested
# ISO codes were matched and which numeric identifiers ("Value") they correspond to
filtered_countries_df

Unnamed: 0,Value,ISOCODE,UNSDCODE,NAME0,CIESINCODE,DATATYPE,DATACODE,DATAYEAR,DATALEVEL,SEXLEVEL,AGELEVEL,GRSTART,GREND,GRLEVEL,LASTCENSUS,MEANUNITKM
5,24,AGO,24,Angola,24,Preliminary/provisional census,4,2014,2,2,0,2006,2014,1,2014,17505.19519
32,120,CMR,120,Cameroon,120,Final census,1,2005,2,2,2,1987,2005,2,2005,17306.1465
46,180,COD,180,Democratic Republic of the Congo,180,Population estimate/projection,6,2008,3,0,0,1998,2008,1,1984,21383.7208
60,231,ETH,231,Ethiopia,231,Final census,1,2007,3,3,3,1994,2007,1,2007,5119.696574
101,384,CIV,384,Cote d'Ivoire,384,Final census,1,2014,4,4,0,1998,2014,3,2014,1226.860496
106,404,KEN,404,Kenya,404,Final census,1,2009,5,5,2,1999,2009,1,2009,976.187782
121,450,MDG,450,Madagascar,450,Population estimate/projection,6,2010,4,0,0,2003,2010,3,1993,1118.075327
136,504,MAR,504,Morocco,504,Final census,1,2014,3,3,3,2004,2014,2,2014,1536.774064
152,566,NGA,566,Nigeria,566,Final census,1,2006,2,2,2,1991,2006,1,2006,2898.284848
197,710,ZAF,710,South Africa,710,Final census,1,2011,6,6,6,2001,2011,3,2011,1588.211535


In [139]:
# Raster 11 is the national-identifier grid (see raster_name), so selecting it and testing
# membership against the selected country identifiers gives a boolean mask of the grid cells
# that belong to those countries.
# This takes a minute or two to run.
filtered_ds = ds.sel(raster=11).isin(filtered_countries_list)
# Broadcast the single-raster mask against the full dataset so that it can be applied to
# every raster, not only to raster 11
filtered2, _ = xr.broadcast(filtered_ds, ds)
# Keep the values where the mask is True (the rest become NaN); drop=True is expected to
# trim coordinate labels that are masked out everywhere -- NOTE(review): confirm against
# the xarray `where` documentation for the installed version
ds_africa = ds.where(filtered2, drop=True)

In [140]:
# Flatten the filtered dataset into a long-format DataFrame: drop the rows containing NaN,
# turn the index levels back into ordinary columns and give the data variable a short name.
population_variable = 'Population Count, v4.11 (2000, 2005, 2010, 2015, 2020): 2.5 arc-minutes'
Top_20_Africa_df = (
    ds_africa
    .to_dataframe()
    .dropna()
    .reset_index()
    .rename(columns={population_variable: 'value'})
)


In [141]:
# Label every row with the human-readable name of its raster layer.
# Series.map with a dict performs the lookup without calling a Python lambda for every row.
# Unlike indexing the dict directly it yields NaN for unknown keys, so fail loudly instead.
Top_20_Africa_df['raster_name'] = Top_20_Africa_df['raster'].map(raster_name)
if Top_20_Africa_df['raster_name'].isna().any():
    raise KeyError('Found a raster index that has no entry in raster_name')

# Reshape from long to wide: one row per (latitude, longitude) cell, one column per raster.
# Passing a scalar `values` keeps the columns flat, so there is no extra column level to
# collapse afterwards.
Top_20_Africa_df2 = Top_20_Africa_df.drop(columns=['raster'])\
                                    .pivot(index=['latitude', 'longitude'],
                                           columns='raster_name',
                                           values='value')
# Drop the column-axis name left behind by pivot so that the header stays unnamed
Top_20_Africa_df2.columns.name = None

# Lookup from national-identifier code to country name, built from the filtered lookup
# table; a plain dict lookup is used here instead of joining the two tables.
country_values = filtered_countries_df['Value'].tolist()
country_names = filtered_countries_df['NAME0'].tolist()
country_dic = dict(zip(country_values, country_names))

# Attach the country name of every grid cell; as above, an identifier that is missing from
# the lookup is treated as an error rather than being silently left as NaN.
Top_20_Africa_df2['Country_Name'] = Top_20_Africa_df2['National-Identifier-Grid_2010'].map(country_dic)
if Top_20_Africa_df2['Country_Name'].isna().any():
    raise KeyError('Found a national identifier that has no entry in country_dic')

In [144]:
# Write the full wide table to a CSV file (the latitude/longitude index is written as the
# leading columns), then preview the first rows
Top_20_Africa_df2.to_csv('Top-20_population-count_2000-2020.csv')
Top_20_Africa_df2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Adjusted-Population-Count_2000,Adjusted-Population-Count_2005,Adjusted-Population-Count_2010,Adjusted-Population-Count_2015,Adjusted-Population-Count_2020,Data-Code_2010,Data-Quality_Data-Context_2010,Data-Quality_Mean-Administrative-Unit-Area_2010,Data-Quality_Water-Mask_2010,Growth-Rate-End-Year_2010,...,Growth_Rate-Admin-Level_2010,Input-Data-Age_2010,Input-Data-Level_2010,Input-Data-Sex_2010,Input-Data-Year_2010,Land-Area_2010,National-Identifier-Grid_2010,Water-Area_2010,Year-most-recent-Census_2010,Country_Name
latitude,longitude,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
-46.979167,37.604167,,,,,,1.0,,0.0,1.0,2011.0,...,3.0,6.0,6.0,6.0,2011.0,1.405585,710.0,1.530877,2011.0,South Africa
-46.979167,37.645833,,,,,,1.0,,0.0,1.0,2011.0,...,3.0,6.0,6.0,6.0,2011.0,4.58009,710.0,1.292559,2011.0,South Africa
-46.979167,37.6875,,,,,,1.0,,0.0,1.0,2011.0,...,3.0,6.0,6.0,6.0,2011.0,3.415395,710.0,1.856529,2011.0,South Africa
-46.979167,37.729167,,,,,,1.0,,0.0,1.0,2011.0,...,3.0,6.0,6.0,6.0,2011.0,4.529518,710.0,1.343131,2011.0,South Africa
-46.979167,37.770833,,,,,,1.0,,0.0,1.0,2011.0,...,3.0,6.0,6.0,6.0,2011.0,1.793158,710.0,1.143395,2011.0,South Africa


In [145]:
# Sanity check: total 2020 population per country, summed over all of its grid cells
Top_20_Africa_df2.groupby('Country_Name')[['Adjusted-Population-Count_2020']].sum()

Unnamed: 0_level_0,Adjusted-Population-Count_2020
Country_Name,Unnamed: 1_level_1
Angola,36502012.0
Cameroon,28279994.0
Cote d'Ivoire,26556614.0
Democratic Republic of the Congo,114290872.0
Egypt,96195904.0
Ethiopia,103989768.0
Kenya,55127572.0
Madagascar,27900930.0
Morocco,35810256.0
Nigeria,216673952.0


In [152]:
# Row position that splits the table into two (almost) equal halves
half_point = len(Top_20_Africa_df2) // 2

In [154]:
# Export the table as two CSV files, split at half_point
first_half = Top_20_Africa_df2.iloc[:half_point]
second_half = Top_20_Africa_df2.iloc[half_point:]
first_half.to_csv('Top-20_population-count_2000-2020_part1.csv')
second_half.to_csv('Top-20_population-count_2000-2020_part2.csv')