## Creating a production table

In [None]:
# Import packages
import pandas as pd
import numpy as np 
import requests 
import zipfile
import psycopg2
import sqlalchemy

import _functions_sql as fs
import _functions_data_files as fdf

# Specify the source directory and file of the raw production download.
# Both values are passed to fdf.get_path() when the CSV is read
# (required for the fdf function to work as expected).
source_dir = 'faostat_prod'
source_file = 'Production_Crops_Livestock_E_All_Data_(Normalized).csv'

### Import & examine crops & livestock production data from CSV

In [None]:
# Load the raw production CSV into a pandas DataFrame.
# The 'Note' column is read through a str converter to handle the
# DtypeWarning without resorting to 'low_memory=False'.
df_prod = pd.read_csv(
    fdf.get_path(source_file, source_dir),
    encoding='latin-1',
    converters={'Note': str},
)

In [None]:
# Preview the first rows of the raw production data
df_prod.head()

In [None]:
# Print a summary of the columns, their dtypes and non-null counts
df_prod.info()

In [None]:
# Count fully duplicated rows (False = not a repeat, True = repeat of an earlier row)
df_prod.duplicated(keep='first').value_counts()

In [None]:
# Check for NAs: count rows per missing-value pattern
# (True marks a missing cell in that column)
df_prod.isna().value_counts()

In [None]:
# Print the distinct values found in the 'Note' column
print(df_prod['Note'].unique())

In [None]:
# Drop the unneeded columns in a single call, so only one new DataFrame
# is built instead of one intermediate copy per dropped column.
df_prod = df_prod.drop(
    columns=[
        'Area Code (M49)',
        'Item Code (CPC)',
        'Year Code',
        'Flag',
        'Note',
    ]
)

In [None]:
# Rename the remaining columns to snake_case
df_prod = df_prod.rename(
    columns={
        'Area Code': 'area_code',
        'Area': 'area',
        'Item Code': 'item_code',
        'Item': 'item',
        'Element Code': 'element_code',
        'Element': 'element',
        'Year': 'year',
        'Unit': 'unit',
        'Value': 'value',
    }
)

In [None]:
# Preview the data after dropping and renaming columns
df_prod.head()

In [None]:
# Load the accompanying AreaCodes lookup table (shipped in the same download zip).
# Note: this rebinds 'source_file' to the AreaCodes file name.
source_file = 'Production_Crops_Livestock_E_AreaCodes.csv'
df_prod_areacodes = pd.read_csv(
    fdf.get_path(source_file, source_dir),
    encoding='latin-1',
)

In [None]:
# Preview the first rows of the area code table
df_prod_areacodes.head()

In [None]:
# Count fully duplicated rows in the area code table (should be 0)
df_prod_areacodes.duplicated().sum()

In [None]:
# Left-join the area code table onto 'df_prod' so that potentially
# redundant columns can be inspected side by side
df_prod_temp = pd.merge(
    df_prod,
    df_prod_areacodes,
    how='left',
    left_on='area_code',
    right_on='Area Code',
)

In [None]:
# Preview the merged production / area code data
df_prod_temp.head()

In [None]:
# List the distinct (area, Area) pairs where the name in 'df_prod'
# differs from the name in the area code table
df_prod_temp.loc[
    df_prod_temp['area'].ne(df_prod_temp['Area']),
    ['area', 'Area'],
].drop_duplicates()

In [None]:
# Replace ';' with ',' in the merged 'Area' names, then list the
# (area, Area) pairs that still differ
df_prod_temp['Area'] = df_prod_temp['Area'].str.replace(';', ',')
df_prod_temp.loc[
    df_prod_temp['area'].ne(df_prod_temp['Area']),
    ['area', 'Area'],
].drop_duplicates()

In [None]:
# 'area_code' contains no additional info beyond 'area', so remove it
df_prod = df_prod.drop(columns='area_code')

In [None]:
# Load the accompanying ItemCodes lookup table (shipped in the same download zip).
# Note: this rebinds 'source_file' to the ItemCodes file name.
source_file = 'Production_Crops_Livestock_E_ItemCodes.csv'
df_prod_itemcodes = pd.read_csv(
    fdf.get_path(source_file, source_dir),
    encoding='latin-1',
)

In [None]:
# Preview the first rows of the item code table
df_prod_itemcodes.head()

In [None]:
# Count fully duplicated rows in the item code table (should be 0)
df_prod_itemcodes.duplicated().sum()

In [None]:
# Left-join the item code table onto 'df_prod' so that potentially
# redundant columns can be inspected side by side
df_prod_temp = pd.merge(
    df_prod,
    df_prod_itemcodes,
    how='left',
    left_on='item_code',
    right_on='Item Code',
)

In [None]:
# Preview the merged production / item code data
df_prod_temp.head()

In [None]:
# List the distinct (item, Item) pairs where the name in 'df_prod'
# differs from the name in the item code table
df_prod_temp.loc[
    df_prod_temp['item'].ne(df_prod_temp['Item']),
    ['item', 'Item'],
].drop_duplicates()

In [None]:
# Compare divergent item values after replacing ';' with ','.
# The mask must compare 'item' (from df_prod) against 'Item' (from the item
# code table); comparing 'Item' with itself would only ever flag NaN rows.
df_prod_temp['Item'] = df_prod_temp['Item'].str.replace(';', ',')
df_prod_temp.loc[
    df_prod_temp['item'] != df_prod_temp['Item'],
    ['item', 'Item'],
].drop_duplicates()

In [None]:
# List each distinct (element_code, element, unit) combination, ordered by element
(
    df_prod[['element_code', 'element', 'unit']]
    .drop_duplicates()
    .sort_values(by='element')
)

In [None]:
# 'element_code' stands for a unique combination of element and unit,
# so it can be dropped
df_prod = df_prod.drop(columns='element_code')

In [None]:
# Preview the data after removing the redundant code columns
df_prod.head()

In [None]:
# Put the columns into their final order
df_prod = df_prod[
    [
        'area',
        'year',
        'item',
        'item_code',
        'element',
        'unit',
        'value',
    ]
]

In [None]:
# Hand the cleaned DataFrame to the project helper fs.write_dataframe under the
# name 'fao_production' (presumably the target SQL table — confirm in _functions_sql)
fs.write_dataframe(df_prod, 'fao_production')