## Creating an additional population table

This notebook imports population data from the FAO population dataset (in case it is required for the analysis), cleans the data, and exports it as a SQL table.

In [None]:
# Import packages
import pandas as pd
import numpy as np 


import _functions_sql as fs
import _functions_data_files as fdf

# Directory and file name of the raw FAO export; both are passed to fdf.get_path() when loading
source_dir, source_file = (
    'fao_population',
    'Population_E_All_Data_(Normalized).csv',
)

In [None]:
# Load the raw FAO CSV into a pandas dataframe
raw_csv_path = fdf.get_path(source_file, source_dir)
population = pd.read_csv(
    raw_csv_path,
    encoding='latin-1',
    # Reading 'Note' as str handles the DtypeWarning without 'low_memory=False'
    converters={'Note': str},
)

In [None]:
# Count rows per year to see which years the table covers
population.loc[:, 'Year'].value_counts()

In [None]:
# Count rows that are exact copies of an earlier row (True = duplicate)
population.duplicated(keep='first').value_counts()

In [None]:
# List the distinct values found in the 'Unit' column
population.loc[:, 'Unit'].unique()

In [None]:
# List the distinct values found in the 'Element' column
population.loc[:, 'Element'].unique()

In [None]:
# Remove the code, unit, flag and note columns that are not used further on
unneeded_columns = [
    'Year Code',
    'Area Code (M49)',
    'Area Code',
    'Item Code',
    'Item',
    'Element Code',
    'Unit',
    'Flag',
    'Note',
]
population.drop(columns=unneeded_columns, inplace=True)

In [None]:
# Reshape to one row per (Area, Year) with one column per 'Element' value,
# summing 'Value' wherever several rows fall into the same cell
population_wide = population.pivot_table(
    values='Value',
    index=['Area', 'Year'],
    columns='Element',
    aggfunc='sum',
)
# Turn the (Area, Year) index back into regular columns
population = population_wide.reset_index()

In [None]:
# Preview the first five rows of the reshaped table
population.head(n=5)

In [None]:
# The female/male split is not required, so drop those two columns
sex_split_columns = ['Total Population - Female', 'Total Population - Male']
population.drop(columns=sex_split_columns, inplace=True)

In [None]:
# Map the FAO column headers to snake_case names
column_names = {
    'Area': 'area',
    'Year': 'year',
    'Rural population': 'rural_population',
    'Total Population - Both sexes': 'total_population',
    'Urban population': 'urban_population',
}
population.rename(columns=column_names, inplace=True)

In [None]:
# Clear the name of the columns axis ('Element', left over from the pivot)
population = population.rename_axis(None, axis='columns')

In [None]:
# Count rows whose (area, year) combination already appeared earlier (True = duplicate)
population.duplicated(subset=['area', 'year']).value_counts()

In [None]:
# The original values are in 1000s, so scale the population columns to absolute numbers
scaled_columns = ['rural_population', 'urban_population', 'total_population']
population.loc[:, scaled_columns] = population.loc[:, scaled_columns] * 1000

In [None]:
# Put the columns into their final order for the exported table
final_column_order = [
    'area',
    'year',
    'rural_population',
    'urban_population',
    'total_population',
]
population = population.reindex(columns=final_column_order)

In [None]:
# Round the population figures and store them with pandas' nullable 'Int64' dtype
count_columns = ['rural_population', 'urban_population', 'total_population']
population[count_columns] = population[count_columns].round().astype('Int64')


In [None]:
# Export the cleaned table to the database
engine = fs.get_engine()
schema = 'capstone_envirolytics'
table_name = 'fao_population'

# Only attempt the export when an engine object was returned
if engine is not None:
    try:
        population.to_sql(
            name=table_name,  # Name of SQL table
            con=engine,  # Engine or connection
            if_exists='replace',  # Drop the table before inserting new values
            schema=schema,  # Use schema that was defined above
            index=False,  # Do not write the DataFrame index as a column
            chunksize=5000,  # Number of rows in each batch written at a time
            method='multi',  # Pass multiple values in a single INSERT clause
        )
    # Error handling: psycopg2 is not among the imports visible in this notebook,
    # so naming psycopg2.DatabaseError here would raise NameError while the
    # except clause is evaluated and hide the real failure. Catching Exception
    # covers database errors as well.
    except Exception as error:
        print(error)
        engine = None  # Mark the engine as unusable after a failed export
    else:
        print(f"The {table_name} table was imported successfully.")