In [347]:
# Import packages
import pandas as pd
import numpy as np 
import requests 
import zipfile
import psycopg2
import sqlalchemy

import _functions_sql as fs
import _functions_data_files as fdf

### Import & examine crops & livestock production data

In [46]:
# Create a function for downloading the data

def data_download():
     
    r = requests.get(url)
    with open(path+zip_file, 'wb') as f:
        f.write(r.content)

    data = zipfile.ZipFile(path+zip_file, 'r')
    data.extract(csv_file, path)

In [7]:
# Define the parameters for the download

url = f'https://fenixservices.fao.org/faostat/static/bulkdownloads/Production_Crops_Livestock_E_All_Data_(Normalized).zip'
zip_file = 'Production_Crops_Livestock_E_All_Data_(Normalized).zip'
path = './data/'
csv_file = 'Production_Crops_Livestock_E_All_Data_(Normalized).csv'

In [41]:
# Run the data download function 
data_download()

In [36]:
# Import crops & livestock data for all countries
production = pd.read_csv(path+csv_file, encoding='latin-1', low_memory=False)

In [37]:
production.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note
0,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1975,1975,ha,0.0,E,
1,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1976,1976,ha,5900.0,E,
2,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1977,1977,ha,6000.0,E,
3,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1978,1978,ha,6000.0,E,
4,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1979,1979,ha,6000.0,E,


In [38]:
production.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3761168 entries, 0 to 3761167
Data columns (total 14 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Area Code        int64  
 1   Area Code (M49)  object 
 2   Area             object 
 3   Item Code        int64  
 4   Item Code (CPC)  object 
 5   Item             object 
 6   Element Code     int64  
 7   Element          object 
 8   Year Code        int64  
 9   Year             int64  
 10  Unit             object 
 11  Value            float64
 12  Flag             object 
 13  Note             object 
dtypes: float64(1), int64(5), object(8)
memory usage: 401.7+ MB


In [18]:
# Check for full duplicates
production.duplicated().value_counts()

False    3761168
Name: count, dtype: int64

In [19]:
# Check for NAs
production.isnull().value_counts()

Area Code  Area Code (M49)  Area   Item Code  Item Code (CPC)  Item   Element Code  Element  Year Code  Year   Unit   Value  Flag   Note 
False      False            False  False      False            False  False         False    False      False  False  False  False  True     3673453
                                                                                                                                    False      87715
Name: count, dtype: int64

In [39]:
# List unique values in the Note column
print(production["Note"].unique())

# Drop the 'Note' column as it does not contain relevant info
production = production.drop(columns = 'Note')

[nan 'Unofficial figure']


'Year Code' and 'Year' columns contain the same data, so we can drop one of them. 

In [40]:
# Drop 'Year code' column
production = production.drop(columns = 'Year Code')

Exploring the element column, we see that not all values are relevant for further analysis. According to element definitions, yield is the production per unit of harvested area 

In [43]:
production["Element"].value_counts()

Element
Production                       1446915
Yield                             888204
Area harvested                    793556
Producing Animals/Slaughtered     244427
Stocks                            169356
Yield/Carcass Weight              137691
Milk Animals                       43528
Laying                             29531
Prod Popultn                        7960
Name: count, dtype: int64

### Import emissions from crops data

In [44]:
url = f'https://fenixservices.fao.org/faostat/static/bulkdownloads/Emissions_crops_E_All_Data_(Normalized).zip'
zip_file = 'Emissions_crops_E_All_Data_(Normalized).zip'
csv_file = 'Emissions_crops_E_All_Data_(Normalized).csv'

In [45]:
# Run the data download function 
data_download()

In [46]:
# Import crops emissions data for all countries
emissions_crops = pd.read_csv(path+csv_file, encoding='latin-1', low_memory=False)

In [47]:
emissions_crops.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Year Code,Year,Source Code,Source,Unit,Value,Flag,Note
0,2,'004,Afghanistan,44,'0115,Barley,72430,Crops total (Emissions N2O),1961,1961,3050,FAO TIER 1,kt,0.1141,E,
1,2,'004,Afghanistan,44,'0115,Barley,72430,Crops total (Emissions N2O),1962,1962,3050,FAO TIER 1,kt,0.1141,E,
2,2,'004,Afghanistan,44,'0115,Barley,72430,Crops total (Emissions N2O),1963,1963,3050,FAO TIER 1,kt,0.1141,E,
3,2,'004,Afghanistan,44,'0115,Barley,72430,Crops total (Emissions N2O),1964,1964,3050,FAO TIER 1,kt,0.1145,E,
4,2,'004,Afghanistan,44,'0115,Barley,72430,Crops total (Emissions N2O),1965,1965,3050,FAO TIER 1,kt,0.1145,E,


### Import emissions from livestock data

In [48]:
url = f'https://fenixservices.fao.org/faostat/static/bulkdownloads/Emissions_livestock_E_All_Data_(Normalized).zip'
zip_file = 'Emissions_livestock_E_All_Data_(Normalized).zip'
csv_file = 'Emissions_livestock_E_All_Data_(Normalized).csv'

In [49]:
# Run the data download function 
data_download()

In [50]:
# Import crops emissions data for all countries
emissions_livestock = pd.read_csv(path+csv_file, encoding='latin-1', low_memory=False)

In [51]:
emissions_livestock.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Year Code,Year,Source Code,Source,Unit,Value,Flag,Note
0,2,'004,Afghanistan,1107,'02132,Asses,5111,Stocks,1961,1961,3050,FAO TIER 1,An,1300000.0,A,
1,2,'004,Afghanistan,1107,'02132,Asses,5111,Stocks,1962,1962,3050,FAO TIER 1,An,851850.0,A,
2,2,'004,Afghanistan,1107,'02132,Asses,5111,Stocks,1963,1963,3050,FAO TIER 1,An,1001112.0,A,
3,2,'004,Afghanistan,1107,'02132,Asses,5111,Stocks,1964,1964,3050,FAO TIER 1,An,1150000.0,E,
4,2,'004,Afghanistan,1107,'02132,Asses,5111,Stocks,1965,1965,3050,FAO TIER 1,An,1300000.0,A,


### Import food balances data

In [307]:
url = f'https://fenixservices.fao.org/faostat/static/bulkdownloads/FoodBalanceSheets_E_All_Data_(Normalized).zip'
zip_file = 'FoodBalanceSheets_E_All_Data_(Normalized).zip'
csv_file = 'FoodBalanceSheets_E_All_Data_(Normalized).csv'

In [53]:
# Run the data download function 
data_download()

In [308]:
# Import food balances data for all countries
food_balances = pd.read_csv(path+csv_file, encoding='latin-1', low_memory=False)

In [132]:
# Get table info
food_balances.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4320908 entries, 0 to 4320907
Data columns (total 13 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Area Code        int64  
 1   Area Code (M49)  object 
 2   Area             object 
 3   Item Code        int64  
 4   Item Code (FBS)  object 
 5   Item             object 
 6   Element Code     int64  
 7   Element          object 
 8   Year Code        int64  
 9   Year             int64  
 10  Unit             object 
 11  Value            float64
 12  Flag             object 
dtypes: float64(1), int64(5), object(7)
memory usage: 428.6+ MB


In [81]:
# Since the info above does not provide anything apart from the data type, get additional info on unique and missing values
print("Unique values and missing values(%) of each column:")

food_balances_info= pd.DataFrame({"Unique values": food_balances.nunique(),
                            "Missing values(%)": round(food_balances.isnull().sum()/food_balances.shape[0]*100, 2)
                            }).rename_axis('Columns', axis='rows')                       

food_balances_info

Unique values and missing values(%) of each column:


Unnamed: 0_level_0,Unique values,Missing values(%)
Columns,Unnamed: 1_level_1,Unnamed: 2_level_1
Area Code,221,0.0
Area Code (M49),221,0.0
Area,221,0.0
Item Code,123,0.0
Item Code (FBS),123,0.0
Item,120,0.0
Element Code,21,0.0
Element,21,0.0
Year Code,12,0.0
Year,12,0.0


In [12]:
# Check for full duplicates
food_balances.duplicated().value_counts()

False    4320908
Name: count, dtype: int64

In [309]:
food_balances.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (FBS),Item,Element Code,Element,Year Code,Year,Unit,Value,Flag
0,2,'004,Afghanistan,2501,'S2501,Population,511,Total Population - Both sexes,2010,2010,1000 No,28189.67,X
1,2,'004,Afghanistan,2501,'S2501,Population,511,Total Population - Both sexes,2011,2011,1000 No,29249.16,X
2,3,'008,Albania,2501,'S2501,Population,511,Total Population - Both sexes,2010,2010,1000 No,2913.4,X
3,3,'008,Albania,2501,'S2501,Population,511,Total Population - Both sexes,2011,2011,1000 No,2900.65,X
4,2,'004,Afghanistan,2501,'S2501,Population,511,Total Population - Both sexes,2012,2012,1000 No,30466.48,X


In [310]:
# Drop unneeded columns
food_balances.drop(columns = ['Year Code','Area Code (M49)', 'Item Code (FBS)'], inplace=True)

In [311]:
# Rename remaining columns
food_balances.rename(columns={'Area Code':'area_code', 'Area':'area', 'Item Code':'item_code', 'Item':'item', 'Element Code':'element_code', 'Element':'element', 'Year':'year', 'Unit':'unit', 'Value':'value', 'Flag':'flag'}, inplace=True)

In [312]:
# Check the date range of the table
food_balances['year'].value_counts()

year
2021    369154
2020    368661
2019    366960
2017    359076
2018    358735
2016    357987
2014    357970
2015    357899
2013    357396
2012    357061
2010    355189
2011    354820
Name: count, dtype: int64

Since this data only covers the years between 2010-2021, we want to combine it with an additional dataset covering the years before. 

In [313]:
url = f'https://fenixservices.fao.org/faostat/static/bulkdownloads/FoodBalanceSheetsHistoric_E_All_Data_(Normalized).zip'
zip_file = 'FoodBalanceSheetsHistoric_E_All_Data_(Normalized).zip'
csv_file = 'FoodBalanceSheetsHistoric_E_All_Data_(Normalized).csv'

In [65]:
# Run the data download function 
data_download()

In [314]:
# Import addtional food balances data for all countries
food_balances_hist = pd.read_csv(path+csv_file, encoding='latin-1', low_memory=False)

In [249]:
food_balances_hist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11479903 entries, 0 to 11479902
Data columns (total 13 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Area Code        int64  
 1   Area Code (M49)  object 
 2   Area             object 
 3   Item Code        int64  
 4   Item Code (FBS)  object 
 5   Item             object 
 6   Element Code     int64  
 7   Element          object 
 8   Year Code        int64  
 9   Year             int64  
 10  Unit             object 
 11  Value            float64
 12  Flag             object 
dtypes: float64(1), int64(5), object(7)
memory usage: 1.1+ GB


In [94]:
# Since the info above does not provide anything apart from the data type, get additional info on unique and missing values
print("Unique values and missing values(%) of each column:")

food_balances_hist_info= pd.DataFrame({"Unique values": food_balances_hist.nunique(),
                            "Missing values(%)": round(food_balances_hist.isnull().sum()/food_balances_hist.shape[0]*100, 2)
                            }).rename_axis('Columns', axis='rows')                       

food_balances_hist_info

Unique values and missing values(%) of each column:


Unnamed: 0_level_0,Unique values,Missing values(%)
Columns,Unnamed: 1_level_1,Unnamed: 2_level_1
Area Code,217,0.0
Area Code (M49),217,0.0
Area,217,0.0
Item Code,123,0.0
Item Code (FBS),123,0.0
Item,120,0.0
Element Code,16,0.0
Element,16,0.0
Year Code,53,0.0
Year,53,0.0


In [95]:
# Check for full duplicates
food_balances_hist.duplicated().value_counts()

False    11479903
Name: count, dtype: int64

In [96]:
food_balances_hist.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (FBS),Item,Element Code,Element,Year Code,Year,Unit,Value,Flag
0,2,'004,Afghanistan,2501,'S2501,Population,511,Total Population - Both sexes,1961,1961,1000 No,8954.0,X
1,2,'004,Afghanistan,2501,'S2501,Population,511,Total Population - Both sexes,1962,1962,1000 No,9142.0,X
2,2,'004,Afghanistan,2501,'S2501,Population,511,Total Population - Both sexes,1963,1963,1000 No,9340.0,X
3,2,'004,Afghanistan,2501,'S2501,Population,511,Total Population - Both sexes,1964,1964,1000 No,9547.0,X
4,2,'004,Afghanistan,2501,'S2501,Population,511,Total Population - Both sexes,1965,1965,1000 No,9765.0,X


In [315]:
# Drop unneeded columns
food_balances_hist.drop(columns = ['Year Code', 'Area Code (M49)', 'Item Code (FBS)'], inplace=True)

In [316]:
# Rename remaining columns
food_balances_hist.rename(columns={'Area Code':'area_code', 'Area':'area', 'Item Code':'item_code', 'Item':'item', 'Element Code':'element_code', 'Element':'element', 'Year':'year', 'Unit':'unit', 'Value':'value', 'Flag':'flag'}, inplace=True)

Looking at the number of unique values in food_balances and food_balances_hist, we see that the number of unique values in Area and Element columns differs  - to be taken into account when merging the dataframes.

In [317]:
food_balances_combined = pd.concat([food_balances,food_balances_hist], ignore_index=True)

In [181]:
food_balances_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15800811 entries, 0 to 15800810
Data columns (total 10 columns):
 #   Column        Dtype  
---  ------        -----  
 0   area_code     int64  
 1   area          object 
 2   item_code     int64  
 3   item          object 
 4   element_code  int64  
 5   element       object 
 6   year          int64  
 7   unit          object 
 8   value         float64
 9   flag          object 
dtypes: float64(1), int64(4), object(5)
memory usage: 1.2+ GB


In [318]:
food_balances_combined.head()

Unnamed: 0,area_code,area,item_code,item,element_code,element,year,unit,value,flag
0,2,Afghanistan,2501,Population,511,Total Population - Both sexes,2010,1000 No,28189.67,X
1,2,Afghanistan,2501,Population,511,Total Population - Both sexes,2011,1000 No,29249.16,X
2,3,Albania,2501,Population,511,Total Population - Both sexes,2010,1000 No,2913.4,X
3,3,Albania,2501,Population,511,Total Population - Both sexes,2011,1000 No,2900.65,X
4,2,Afghanistan,2501,Population,511,Total Population - Both sexes,2012,1000 No,30466.48,X


In [101]:
# Check for duplicates after combining the tables
food_balances_combined.duplicated().value_counts()

False    15491348
True       309463
Name: count, dtype: int64

In [319]:
# Dropping duplicated rows
food_balances_combined.drop_duplicates(inplace=True)

### Examining the item column

Looking at the items contained in the dataset, we can see that area population is included along with the food types. We can separate this information into its own table.

In [320]:
# Getting a list of unique values in the element column
food_balances_combined['item'].unique()

array(['Population', 'Grand Total', 'Vegetal Products', 'Animal Products',
       'Cereals - Excluding Beer', 'Wheat and products', 'Animal fats',
       'Fish, Body Oil', 'Fish, Liver Oil', 'Fish, Seafood',
       'Freshwater Fish', 'Demersal Fish', 'Rice and products',
       'Pelagic Fish', 'Marine Fish, Other', 'Crustaceans', 'Cephalopods',
       'Barley and products', 'Molluscs, Other',
       'Aquatic Animals, Others', 'Aquatic Products, Other',
       'Aquatic Plants', 'Maize and products', 'Rye and products', 'Oats',
       'Millet and products', 'Sorghum and products', 'Cereals, Other',
       'Starchy Roots', 'Cassava and products', 'Potatoes and products',
       'Sweet potatoes', 'Roots, Other', 'Sugar Crops', 'Sugar cane',
       'Sugar & Sweeteners', 'Sugar (Raw Equivalent)', 'Yams',
       'Sweeteners, Other', 'Honey', 'Pulses', 'Sugar beet', 'Beans',
       'Peas', 'Pulses, Other and products', 'Treenuts',
       'Nuts and products', 'Oilcrops', 'Soyabeans', 'Groundnut

In [321]:
# Create a population dataframe
area_population = food_balances_combined[food_balances_combined['item'] == 'Population']

In [322]:
area_population.head()

Unnamed: 0,area_code,area,item_code,item,element_code,element,year,unit,value,flag
0,2,Afghanistan,2501,Population,511,Total Population - Both sexes,2010,1000 No,28189.67,X
1,2,Afghanistan,2501,Population,511,Total Population - Both sexes,2011,1000 No,29249.16,X
2,3,Albania,2501,Population,511,Total Population - Both sexes,2010,1000 No,2913.4,X
3,3,Albania,2501,Population,511,Total Population - Both sexes,2011,1000 No,2900.65,X
4,2,Afghanistan,2501,Population,511,Total Population - Both sexes,2012,1000 No,30466.48,X


In [323]:
# Drop the population info from the food_balanced_combined table
food_balances_combined = food_balances_combined[food_balances_combined['item'] != 'Population']

### Examining the element column

Currently the element column contains many different indicators, the values for which can be found in the 'value' column. We want to keep only the indicators that are relevant for our analysis and separate these out into their own columns. 

In [324]:
# Getting a list of unique values in the element column
food_balances_combined['element'].unique()

array(['Food supply (kcal/capita/day)', 'Food supply (kcal)',
       'Protein supply quantity (g/capita/day)',
       'Protein supply quantity (t)',
       'Fat supply quantity (g/capita/day)', 'Fat supply quantity (t)',
       'Production', 'Import Quantity', 'Stock Variation',
       'Export Quantity', 'Domestic supply quantity', 'Feed', 'Losses',
       'Other uses (non-food)', 'Residuals', 'Seed', 'Food',
       'Food supply quantity (kg/capita/yr)', 'Processing',
       'Tourist consumption'], dtype=object)

After looking at the definitions of the different elements (found on https://www.fao.org/faostat/en/#data/FBS under 'Definitions'), we come to a conclusion that 'Food supply (kcal/capita/day)', 'Food supply (kcal)', 'Protein supply quantity (g/capita/day)','Protein supply quantity (t)', 'Fat supply quantity (g/capita/day)', 'Fat supply quantity (t)', 'Food supply quantity (kg/capita/yr)' are not very relevant in the context of our analysis and can be removed.

In [325]:
values_to_remove = ['Food supply (kcal/capita/day)', 'Food supply (kcal)', 
                    'Protein supply quantity (g/capita/day)',
                    'Protein supply quantity (t)', 'Fat supply quantity (g/capita/day)', 
                    'Fat supply quantity (t)', 'Food supply quantity (kg/capita/yr)']

In [326]:
food_balances_combined = food_balances_combined[~food_balances_combined['element'].isin(values_to_remove)]

Let's check this table to see which units are used for different metrics - if they all use the same units, we can remove the unit column and separate the element column into multiple columns. 

In [327]:
# Check unique values of the units 
food_balances_combined['unit'].unique()

array(['1000 t', '1000 An'], dtype=object)

In [328]:
# Check which elements use '1000 An' as a unit
food_balances_An = food_balances_combined[food_balances_combined['unit']=='1000 An']
food_balances_An['element'].unique()

array(['Stock Variation'], dtype=object)

Stock variation is the only metric that uses 1000 An as units. However, according to the documentation, stock variation is measured in 1000 t, meaning we need to correct the unit in the dataframe.

In [329]:
food_balances_combined['unit'].replace('1000 An', '1000 t', inplace=True)

In [262]:
food_balances_combined.head()

Unnamed: 0,area_code,area,item_code,item,element_code,element,year,unit,value,flag
2600,13,Bahrain,2905,Cereals - Excluding Beer,5511,Production,2019,1000 t,0.0,E
2655,13,Bahrain,2905,Cereals - Excluding Beer,5511,Production,2020,1000 t,0.0,E
2746,13,Bahrain,2905,Cereals - Excluding Beer,5511,Production,2021,1000 t,0.0,E
2803,13,Bahrain,2905,Cereals - Excluding Beer,5611,Import Quantity,2019,1000 t,309.0,E
2896,13,Bahrain,2905,Cereals - Excluding Beer,5611,Import Quantity,2020,1000 t,395.0,E


### Pivoting the table

In [330]:
food_balances_rest = food_balances_combined.pivot_table(index=['area_code', 'area', 'item_code', 'item', 'year', 'unit', 'flag'],
                    columns='element', values='value', aggfunc='sum').reset_index()

In [331]:
food_balances_rest.head()

element,area_code,area,item_code,item,year,unit,flag,Domestic supply quantity,Export Quantity,Feed,Food,Import Quantity,Losses,Other uses (non-food),Processing,Production,Residuals,Seed,Stock Variation,Tourist consumption
0,1,Armenia,2511,Wheat and products,1992,1000 t,I,592.0,0.0,14.0,539.0,401.0,17.0,0.0,0.0,141.0,,23.0,49.0,
1,1,Armenia,2511,Wheat and products,1993,1000 t,I,618.0,0.0,21.0,544.0,469.0,18.0,0.0,0.0,218.0,,35.0,-68.0,
2,1,Armenia,2511,Wheat and products,1994,1000 t,I,545.0,0.0,24.0,476.0,400.0,20.0,0.0,0.0,153.0,,25.0,-7.0,
3,1,Armenia,2511,Wheat and products,1995,1000 t,I,560.0,0.0,25.0,497.0,375.0,18.0,0.0,0.0,154.0,,20.0,32.0,
4,1,Armenia,2511,Wheat and products,1996,1000 t,I,510.0,0.0,10.0,455.0,293.0,18.0,0.0,0.0,201.0,,27.0,16.0,


In [332]:
#Rearrange the column order
food_balances_rest= food_balances_rest[['area_code', 'area', 'item_code', 'item', 'year','Production','Import Quantity', 'Export Quantity', 'Stock Variation','Domestic supply quantity', 'Feed', 'Food', 'Seed', 'Losses', 'Processing', 'Residuals','Tourist consumption','Other uses (non-food)', 'unit', 'flag']]

In [333]:
#Rename columns
food_balances_rest.rename(columns={'Domestic supply quantity':'domestic_supply_quantity', 'Production':'production', 'Import Quantity':'import_quantity', 'Export Quantity':'export_quantity', 'Stock Variation':'stock_variation','Domestic supply quantity':'domestic_supply_quantity','Feed':'feed', 'Food':'food', 'Seed':'seed', 'Losses':'losses','Processing':'processing', 'Residuals':'residuals', 'Tourist consumption':'tourist_consumption', 'Other uses (non-food)':'other_uses_(non-food)'}, inplace=True)

In [334]:
food_balances_rest = food_balances_rest.sort_values(by=['area', 'item', 'year'])

In [339]:
food_balances_rest.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1353333 entries, 6295 to 822762
Data columns (total 20 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   area_code                 1353333 non-null  int64  
 1   area                      1353333 non-null  object 
 2   item_code                 1353333 non-null  int64  
 3   item                      1353333 non-null  object 
 4   year                      1353333 non-null  int64  
 5   production                932726 non-null   float64
 6   import_quantity           1308929 non-null  float64
 7   export_quantity           1123195 non-null  float64
 8   stock_variation           843419 non-null   float64
 9   domestic_supply_quantity  1349438 non-null  float64
 10  feed                      367025 non-null   float64
 11  food                      1254023 non-null  float64
 12  seed                      253032 non-null   float64
 13  losses                    5133

In [343]:
f_a = food_balances_rest[(food_balances_rest['item'] == 'Wheat and products') & (food_balances_rest['area'] == 'Austria')]

In [344]:
f_a.head()

element,area_code,area,item_code,item,year,production,import_quantity,export_quantity,stock_variation,domestic_supply_quantity,feed,food,seed,losses,processing,residuals,tourist_consumption,other_uses_(non-food),unit,flag
49254,11,Austria,2511,Wheat and products,1961,712.0,50.0,46.0,-20.0,696.0,74.0,544.0,49.0,18.0,9.0,,,2.0,1000 t,I
49255,11,Austria,2511,Wheat and products,1962,706.0,70.0,90.0,0.0,686.0,76.0,529.0,52.0,18.0,9.0,,,2.0,1000 t,I
49256,11,Austria,2511,Wheat and products,1963,690.0,49.0,0.0,11.0,749.0,123.0,545.0,52.0,17.0,9.0,,,2.0,1000 t,I
49257,11,Austria,2511,Wheat and products,1964,751.0,50.0,0.0,-27.0,773.0,141.0,549.0,52.0,19.0,9.0,,,2.0,1000 t,I
49258,11,Austria,2511,Wheat and products,1965,661.0,99.0,1.0,47.0,806.0,176.0,550.0,52.0,17.0,9.0,,,2.0,1000 t,I


In [351]:
food_balances_sample = food_balances_rest.head(10000)

In [354]:
food_balances_rest.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1353333 entries, 6295 to 822762
Data columns (total 20 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   area_code                 1353333 non-null  int64  
 1   area                      1353333 non-null  object 
 2   item_code                 1353333 non-null  int64  
 3   item                      1353333 non-null  object 
 4   year                      1353333 non-null  int64  
 5   production                932726 non-null   float64
 6   import_quantity           1308929 non-null  float64
 7   export_quantity           1123195 non-null  float64
 8   stock_variation           843419 non-null   float64
 9   domestic_supply_quantity  1349438 non-null  float64
 10  feed                      367025 non-null   float64
 11  food                      1254023 non-null  float64
 12  seed                      253032 non-null   float64
 13  losses                    5133

### Renaming items

In [355]:
# Export the table
engine = fs.get_engine()
schema = 'capstone_envirolytics'
table_name = 'food_balances_full'

if engine!=None:
    try:
        food_balances_rest.to_sql(name=table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values 
                        schema=schema, # Use schema that was defined earlier
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None

The food_balances_full table was imported successfully.


### Import additional emissions data

In [1]:
url = f'https://fenixservices.fao.org/faostat/static/bulkdownloads/Environment_Emissions_intensities_E_All_Data_(Normalized).zip'
zip_file = 'Environment_Emissions_intensities_E_All_Data_(Normalized).zip'
csv_file = 'Environment_Emissions_intensities_E_All_Data_(Normalized).csv'

In [47]:
# Run the data download function 
data_download()

In [48]:
# Import crops emissions data for all countries
emissions_additional = pd.read_csv(path+csv_file, encoding='latin-1', low_memory=False)

In [49]:
emissions_additional.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Year Code,Year,Unit,Value,Flag
0,2,'004,Afghanistan,1718,'F1718,Cereals excluding rice,71761,Emissions intensity,1961,1961,kg CO2eq/kg,0.113,E
1,2,'004,Afghanistan,1718,'F1718,Cereals excluding rice,71761,Emissions intensity,1962,1962,kg CO2eq/kg,0.1149,E
2,2,'004,Afghanistan,1718,'F1718,Cereals excluding rice,71761,Emissions intensity,1963,1963,kg CO2eq/kg,0.1205,E
3,2,'004,Afghanistan,1718,'F1718,Cereals excluding rice,71761,Emissions intensity,1964,1964,kg CO2eq/kg,0.1154,E
4,2,'004,Afghanistan,1718,'F1718,Cereals excluding rice,71761,Emissions intensity,1965,1965,kg CO2eq/kg,0.1144,E


In [50]:
# Drop unneeded columns
emissions_additional.drop(columns = ['Year Code', 'Area Code (M49)', 'Item Code (CPC)'], inplace=True)

In [51]:
# Rename remaining columns
emissions_additional.rename(columns={'Area Code':'area_code', 'Area':'area', 'Item Code':'item_code', 'Item':'item', 'Element Code':'element_code', 'Element':'element', 'Year':'year', 'Unit':'unit', 'Value':'value', 'Flag':'flag'}, inplace=True)

In [52]:
# Export the table
engine = sf.get_engine()
schema = 'capstone_envirolytics'
table_name = 'emissions_additional'

if engine!=None:
    try:
        emissions_additional.to_sql(name=table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values 
                        schema=schema, # Use schema that was defined earlier
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None

The emissions_additional table was imported successfully.
