### Creating an additional population table

Extracting population data from the food balances table in case this data is required for the analysis. 

In [52]:
# Import packages
import pandas as pd
import numpy as np 
import requests 
import zipfile
import psycopg2
import sqlalchemy
import _functions_sql as fs
import _functions_data_files as fdf

In [53]:
# Import CSV file
source_dir = 'faostat_food'
file_name = 'FoodBalanceSheets_E_All_Data_(Normalized).csv'
# define converters for 'read_csv' & import file
conv = {
    'Note': str # handle DtypeWarning not using 'low_memory=False' (deprecated)
                # so missing values are now read as empty string ('')
}
food_balances = pd.read_csv(
      fdf.get_file_path(file_name, source_dir)
    , encoding='latin-1'
    , converters=conv
)

In [54]:
food_balances.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (FBS),Item,Element Code,Element,Year Code,Year,Unit,Value,Flag
0,2,'004,Afghanistan,2501,'S2501,Population,511,Total Population - Both sexes,2010,2010,1000 No,28189.67,X
1,2,'004,Afghanistan,2501,'S2501,Population,511,Total Population - Both sexes,2011,2011,1000 No,29249.16,X
2,3,'008,Albania,2501,'S2501,Population,511,Total Population - Both sexes,2010,2010,1000 No,2913.4,X
3,3,'008,Albania,2501,'S2501,Population,511,Total Population - Both sexes,2011,2011,1000 No,2900.65,X
4,2,'004,Afghanistan,2501,'S2501,Population,511,Total Population - Both sexes,2012,2012,1000 No,30466.48,X


In [55]:
# Check the date range of the table
food_balances['Year'].value_counts()

Year
2021    369154
2020    368661
2019    366960
2017    359076
2018    358735
2016    357987
2014    357970
2015    357899
2013    357396
2012    357061
2010    355189
2011    354820
Name: count, dtype: int64

Since this data only covers the years between 2010-2021, we want to combine it with an additional dataset covering the years before. 

In [56]:
# Import CSV file
source_dir = 'faostat_food'
file_name = 'FoodBalanceSheetsHistoric_E_All_Data_(Normalized).csv'
# define converters for 'read_csv' & import file
conv = {
    'Note': str # handle DtypeWarning not using 'low_memory=False' (deprecated)
                # so missing values are now read as empty string ('')
}
food_balances_hist = pd.read_csv(
      fdf.get_file_path(file_name, source_dir)
    , encoding='latin-1'
    , converters=conv
)

In [57]:
# Check the date range of the table
food_balances_hist['Year'].value_counts()

Year
2010    231150
2009    231150
2008    231150
2007    231150
2006    231150
2011    230417
2001    230296
2002    230296
2000    230296
2005    230296
2004    230296
2003    230296
2013    230201
2012    230201
1999    229323
1998    229323
1997    229323
1996    229323
1995    229323
1994    229323
1993    229323
1992    227916
1991    208024
1990    208024
1988    206977
1989    206977
1961    206977
1962    206977
1986    206977
1963    206977
1964    206977
1965    206977
1966    206977
1967    206977
1968    206977
1969    206977
1970    206977
1971    206977
1972    206977
1973    206977
1974    206977
1975    206977
1976    206977
1977    206977
1978    206977
1979    206977
1980    206977
1981    206977
1982    206977
1983    206977
1984    206977
1985    206977
1987    206977
Name: count, dtype: int64

In [58]:
# Combining the tables
food_balances_combined = pd.concat([food_balances,food_balances_hist], ignore_index=True)

In [59]:
# Check for duplicates after combining the tables
food_balances_combined.duplicated().value_counts()

False    15491348
True       309463
Name: count, dtype: int64

In [60]:
# Dropping duplicated rows
food_balances_combined.drop_duplicates(inplace=True)

Looking at the items contained in the dataset, we can see that area population is included along with the food types. We can separate this information into its own table.

In [61]:
# Getting a list of unique values in the element column
food_balances_combined['Item'].unique()

array(['Population', 'Grand Total', 'Vegetal Products', 'Animal Products',
       'Cereals - Excluding Beer', 'Wheat and products', 'Animal fats',
       'Fish, Body Oil', 'Fish, Liver Oil', 'Fish, Seafood',
       'Freshwater Fish', 'Demersal Fish', 'Rice and products',
       'Pelagic Fish', 'Marine Fish, Other', 'Crustaceans', 'Cephalopods',
       'Barley and products', 'Molluscs, Other',
       'Aquatic Animals, Others', 'Aquatic Products, Other',
       'Aquatic Plants', 'Maize and products', 'Rye and products', 'Oats',
       'Millet and products', 'Sorghum and products', 'Cereals, Other',
       'Starchy Roots', 'Cassava and products', 'Potatoes and products',
       'Sweet potatoes', 'Roots, Other', 'Sugar Crops', 'Sugar cane',
       'Sugar & Sweeteners', 'Sugar (Raw Equivalent)', 'Yams',
       'Sweeteners, Other', 'Honey', 'Pulses', 'Sugar beet', 'Beans',
       'Peas', 'Pulses, Other and products', 'Treenuts',
       'Nuts and products', 'Oilcrops', 'Soyabeans', 'Groundnut

In [62]:
# Create a population dataframe
area_population = food_balances_combined[food_balances_combined['Item'] == 'Population']

In [63]:
area_population.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (FBS),Item,Element Code,Element,Year Code,Year,Unit,Value,Flag
0,2,'004,Afghanistan,2501,'S2501,Population,511,Total Population - Both sexes,2010,2010,1000 No,28189.67,X
1,2,'004,Afghanistan,2501,'S2501,Population,511,Total Population - Both sexes,2011,2011,1000 No,29249.16,X
2,3,'008,Albania,2501,'S2501,Population,511,Total Population - Both sexes,2010,2010,1000 No,2913.4,X
3,3,'008,Albania,2501,'S2501,Population,511,Total Population - Both sexes,2011,2011,1000 No,2900.65,X
4,2,'004,Afghanistan,2501,'S2501,Population,511,Total Population - Both sexes,2012,2012,1000 No,30466.48,X


In [64]:
# Drop unneeded columns
area_population.drop(columns = ['Year Code', 'Area Code (M49)', 'Area Code', 'Item Code (FBS)', 'Item Code', 'Element Code', 'Element', 'Unit', 'Flag'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  area_population.drop(columns = ['Year Code', 'Area Code (M49)', 'Area Code', 'Item Code (FBS)', 'Item Code', 'Element Code', 'Element', 'Unit', 'Flag'], inplace=True)


In [65]:
# Rename the remaining columns
area_population.rename(columns = {'Area':'area', 'Item':'item', 'Year':'year', 'Value':'total_population'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  area_population.rename(columns = {'Area':'area', 'Item':'item', 'Year':'year', 'Value':'total_population'}, inplace=True)


In [66]:
# Multiplying the total_population since the original values are in 1000s
area_population.loc[:, 'total_population'] *= 1000

In [67]:
# Export the table
engine = fs.get_engine()
schema = 'capstone_envirolytics'
table_name = 'fao_population'

if engine!=None:
    try:
        area_population.to_sql(name=table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values 
                        schema=schema, # Use schema that was defined earlier
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None

The fao_population table was imported successfully.
