### Creating an additional population table

Extracting population data from the FAO population dataset (`Population_E_All_Data_(Normalized).csv`) in case this data is required for the analysis.

In [195]:
# Import packages
import pandas as pd
import numpy as np 
import requests 
import zipfile
import psycopg2
import sqlalchemy
import _functions_sql as fs
import _functions_data_files as fdf

In [196]:
# Locate and load the FAO population CSV
source_dir = 'fao_population'
file_name = 'Population_E_All_Data_(Normalized).csv'

# 'Note' is parsed through a str converter (instead of passing 'low_memory=False')
# to avoid the DtypeWarning; as a consequence missing notes arrive as empty
# strings ('') rather than NaN.
conv = {'Note': str}

population = pd.read_csv(
    fdf.get_file_path(file_name, source_dir),
    encoding='latin-1',
    converters=conv,
)

In [197]:
# Preview the first five rows of the raw table
population.head(n=5)

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note
0,2,'004,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1950,1950,1000 No,7480.461,X,
1,2,'004,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1951,1951,1000 No,7571.537,X,
2,2,'004,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1952,1952,1000 No,7667.533,X,
3,2,'004,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1953,1953,1000 No,7764.546,X,
4,2,'004,Afghanistan,3010,Population - Est. & Proj.,511,Total Population - Both sexes,1954,1954,1000 No,7864.285,X,


In [198]:
# Count the rows available per year (the result's index lists the years covered)
population.loc[:, 'Year'].value_counts()

Year
2025    1343
2032    1343
2030    1343
2029    1343
2028    1343
        ... 
2070     807
2071     807
2072     807
2073     807
2100     807
Name: count, Length: 151, dtype: int64

In [199]:
# Count rows that repeat an earlier row in every column (True = duplicate)
population.duplicated(keep='first').value_counts()

False    169142
Name: count, dtype: int64

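Looking at the units and elements contained in the dataset (see the next two cells), we can see that all values are reported in thousands of people (`1000 No`) and that the population is broken down into total (both sexes, male, female), rural and urban figures. We can reshape this information into its own table with one row per area and year.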

In [200]:
# List the distinct measurement units present in the table
population.loc[:, 'Unit'].unique()

array(['1000 No'], dtype=object)

In [201]:
# List the distinct population series (elements) present in the table
population.loc[:, 'Element'].unique()

array(['Total Population - Both sexes', 'Total Population - Male',
       'Total Population - Female', 'Rural population',
       'Urban population'], dtype=object)

In [202]:
# Remove the code, item, unit, flag and note columns that the reshaping below does not use.
# Rebinding the name (rather than inplace=True) keeps the statement chainable.
population = population.drop(
    columns=[
        'Year Code',
        'Area Code (M49)',
        'Area Code',
        'Item Code',
        'Item',
        'Element Code',
        'Unit',
        'Flag',
        'Note',
    ]
)

In [203]:
# Guard against duplicate (Area, Year, Element) rows: with aggfunc='sum' such rows
# would be silently added together and inflate the population figures.
assert not population.duplicated(subset=['Area', 'Year', 'Element']).any(), (
    "Duplicate (Area, Year, Element) rows found; summing them would inflate the population values."
)

# Reshape from long to wide: one row per (Area, Year) and one column per Element.
population = population.pivot_table(
    index=['Area', 'Year'],
    columns='Element',
    values='Value',
    aggfunc='sum',
).reset_index()

In [204]:
# Preview the first 50 rows of the pivoted table
population.head(n=50)

Element,Area,Year,Rural population,Total Population - Both sexes,Total Population - Female,Total Population - Male,Urban population
0,Afghanistan,1950,7286.991,7480.461,3518.202,3962.26,465.127
1,Afghanistan,1951,7352.856,7571.537,3569.936,4001.601,486.654
2,Afghanistan,1952,7425.363,7667.533,3624.224,4043.308,509.617
3,Afghanistan,1953,7504.561,7764.546,3679.045,4085.501,534.035
4,Afghanistan,1954,7590.37,7864.285,3735.084,4129.202,560.077
5,Afghanistan,1955,7682.763,7971.931,3794.838,4177.092,587.818
6,Afghanistan,1956,7781.641,8087.727,3858.49,4229.236,617.389
7,Afghanistan,1957,7886.997,8210.201,3925.235,4284.967,648.81
8,Afghanistan,1958,7998.664,8333.826,3992.345,4341.481,682.282
9,Afghanistan,1959,8116.554,8468.211,4064.438,4403.774,717.891


In [205]:
# Remove the per-sex totals; only the rural, urban and overall totals are kept
population = population.drop(
    columns=[
        'Total Population - Female',
        'Total Population - Male',
    ]
)

In [206]:
# Switch to snake_case column names
population = population.rename(
    columns={
        'Area': 'area',
        'Year': 'year',
        'Rural population': 'rural_population',
        'Total Population - Both sexes': 'total_population',
        'Urban population': 'urban_population',
    }
)

In [207]:
# Clear the name of the columns axis ('Element'), which is left over from the pivot
population = population.rename_axis(columns=None)

In [208]:
# Count rows that repeat an earlier (area, year) combination (True = duplicate)
population.duplicated(subset=['area', 'year']).value_counts()

False    39301
Name: count, dtype: int64

In [209]:
# The source reports values in thousands ('1000 No'), so scale them by 1000.
# NOTE: this cell is not idempotent - running it twice scales the values twice.
population[['rural_population', 'urban_population', 'total_population']] = (
    population[['rural_population', 'urban_population', 'total_population']] * 1000
)

In [210]:
# Put the columns into their final order
population = population.reindex(
    ['area', 'year', 'rural_population', 'urban_population', 'total_population'],
    axis='columns',
)

In [211]:
# Round the population columns to whole numbers (the dtype itself is not changed here)
population = population.round(
    {
        'rural_population': 0,
        'urban_population': 0,
        'total_population': 0,
    }
)

In [212]:
# Round to whole numbers and convert to pandas' nullable integer dtype ('Int64'),
# which can hold missing values alongside integers.
population = population.round(
    {
        'rural_population': 0,
        'urban_population': 0,
        'total_population': 0,
    }
).astype(
    {
        'rural_population': 'Int64',
        'urban_population': 'Int64',
        'total_population': 'Int64',
    }
)


In [213]:
# Export the table to the database
engine = fs.get_engine()  # presumably None when no connection could be set up - TODO confirm
schema = 'capstone_envirolytics'
table_name = 'fao_population'

if engine is not None:
    try:
        population.to_sql(
            name=table_name,      # Name of the SQL table
            con=engine,           # Engine or connection
            if_exists='replace',  # Drop the table before inserting new values
            schema=schema,        # Target schema defined above
            index=False,          # Do NOT write the DataFrame index as a column
            chunksize=5000,       # Number of rows written per batch
            method='multi',       # Pass multiple values in a single INSERT clause
        )
        print(f"The {table_name} table was imported successfully.")
    # Best-effort export: report the error instead of stopping the notebook.
    # psycopg2.DatabaseError derives from Exception, so this single clause covers it too.
    except Exception as error:
        print(error)
        engine = None
else:
    # Make a skipped export visible instead of finishing silently
    print(f"No database engine available; the {table_name} table was not exported.")

The fao_population table was imported successfully.
