## Creating an additional population table

This notebook imports population data from the FAO population table (in case this data is required for the analysis), cleans the data, and exports it as a SQL table.

In [2]:
# Import packages
import pandas as pd
import numpy as np 


import _functions_sql as fs
import _functions_data_files as fdf

# Location of the raw FAO export. Both names are handed to fdf.get_path
# when the CSV is loaded, so they must be defined before that cell runs.
source_file = 'Population_E_All_Data_(Normalized).csv'
source_dir = 'fao_population'

In [3]:
# Load the raw CSV into a pandas dataframe.
# The file is decoded as latin-1, and the 'Note' column is parsed through a
# str converter, which handles the DtypeWarning without 'low_memory=False'.
population = pd.read_csv(
    fdf.get_path(source_file, source_dir),
    encoding='latin-1',
    converters={'Note': str},
)

In [4]:
# Count the rows per year to see which years the table covers.
# The result is ordered by frequency (most rows first), not by year.
population['Year'].value_counts(sort=True, ascending=False)

Year
2025    1343
2032    1343
2030    1343
2029    1343
2028    1343
        ... 
2070     807
2071     807
2072     807
2073     807
2100     807
Name: count, Length: 151, dtype: int64

In [5]:
# Count rows that repeat an earlier row across all columns
# (only 'False' in the result means there are no full duplicates)
population.duplicated(keep='first').value_counts()

False    169142
Name: count, dtype: int64

In [6]:
# List the distinct values of the unit column
pd.unique(population['Unit'])

array(['1000 No'], dtype=object)

In [7]:
# List the distinct values of the element column
pd.unique(population['Element'])

array(['Total Population - Both sexes', 'Total Population - Male',
       'Total Population - Female', 'Rural population',
       'Urban population'], dtype=object)

In [8]:
# Discard the code, item, unit, flag and note columns that are not used below
population = population.drop(
    columns=[
        'Year Code',
        'Area Code (M49)',
        'Area Code',
        'Item Code',
        'Item',
        'Element Code',
        'Unit',
        'Flag',
        'Note',
    ]
)

In [9]:
# Reshape to one row per area and year, with one column per population element.
# Values of rows that share the same area, year and element are summed.
population = (
    population
    .pivot_table(index=['Area', 'Year'], columns='Element', values='Value', aggfunc='sum')
    .reset_index()
)

In [10]:
# Preview the first five rows of the pivoted table
population.head(n=5)

Element,Area,Year,Rural population,Total Population - Both sexes,Total Population - Female,Total Population - Male,Urban population
0,Afghanistan,1950,7286.991,7480.461,3518.202,3962.26,465.127
1,Afghanistan,1951,7352.856,7571.537,3569.936,4001.601,486.654
2,Afghanistan,1952,7425.363,7667.533,3624.224,4043.308,509.617
3,Afghanistan,1953,7504.561,7764.546,3679.045,4085.501,534.035
4,Afghanistan,1954,7590.37,7864.285,3735.084,4129.202,560.077


In [11]:
# The female/male split is not required, so only the combined total is kept
population = population.drop(
    columns=[
        'Total Population - Female',
        'Total Population - Male',
    ]
)

In [12]:
# Switch to short snake_case column names
population = population.rename(
    columns={
        'Area': 'area',
        'Year': 'year',
        'Rural population': 'rural_population',
        'Total Population - Both sexes': 'total_population',
        'Urban population': 'urban_population',
    }
)

In [13]:
# Clear the 'Element' label that the pivot left as the name of the column axis
# (it shows up as the header above the row index when the frame is displayed)
population = population.rename_axis(mapper=None, axis='columns')

In [14]:
# Verify that each (area, year) pair occurs only once
population.duplicated(subset=['area', 'year']).value_counts()

False    39301
Name: count, dtype: int64

In [15]:
# The source values are given in thousands (unit '1000 No'), so scale them up by 1000.
# Note: running this cell more than once multiplies the values again.
population[['rural_population', 'urban_population', 'total_population']] = (
    population[['rural_population', 'urban_population', 'total_population']].mul(1000)
)

In [16]:
population = population.reindex(columns=['area', 'year', 'rural_population', 'urban_population', 'total_population'])

In [17]:
population[['rural_population', 'urban_population', 'total_population']] = population[['rural_population', 'urban_population', 'total_population']].round().astype('Int64')


In [18]:
# Export the cleaned dataframe under the name 'fao_population'.
# NOTE(review): fs.write_dataframe is a project helper that is not shown here; it
# presumably writes the frame to the SQL database as a table. Confirm the target
# and what happens when a table with this name already exists.
fs.write_dataframe(population, 'fao_population')

+ table written: fao_population
