In [1]:
# Import packages
import pandas as pd
import numpy as np 
import requests 
import zipfile
import psycopg2
import sql_functions as sf

In [2]:
engine = get_engine()

### Import & examine crops & livestock production data

In [12]:
url = f'https://fenixservices.fao.org/faostat/static/bulkdownloads/Production_Crops_Livestock_E_All_Data_(Normalized).zip'
zip_file = 'Production_Crops_Livestock_E_All_Data_(Normalized).zip'
path = '/Users/cornelialutz/neuefische/capstone-dataverse-sh/data/'
csv_file = 'Production_Crops_Livestock_E_All_Data_(Normalized).csv'

In [22]:
r = requests.get(url)
with open(path+zip_file, 'wb') as f:
    f.write(r.content)

data = zipfile.ZipFile(path+zip_file, 'r')
data.extract(csv_file, path)

'/Users/cornelialutz/neuefische/capstone-dataverse-sh/data/Production_Crops_Livestock_E_All_Data_(Normalized).csv'

In [13]:
# Import crops & livestock data for all countries
production = pd.read_csv(path+csv_file, encoding='latin-1', low_memory=False)

In [24]:
production.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note
0,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1975,1975,ha,0.0,E,
1,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1976,1976,ha,5900.0,E,
2,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1977,1977,ha,6000.0,E,
3,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1978,1978,ha,6000.0,E,
4,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1979,1979,ha,6000.0,E,


In [25]:
production.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3761168 entries, 0 to 3761167
Data columns (total 14 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Area Code        int64  
 1   Area Code (M49)  object 
 2   Area             object 
 3   Item Code        int64  
 4   Item Code (CPC)  object 
 5   Item             object 
 6   Element Code     int64  
 7   Element          object 
 8   Year Code        int64  
 9   Year             int64  
 10  Unit             object 
 11  Value            float64
 12  Flag             object 
 13  Note             object 
dtypes: float64(1), int64(5), object(8)
memory usage: 401.7+ MB


In [26]:
# Check for full duplicates
production.duplicated().value_counts()

False    3761168
Name: count, dtype: int64

In [27]:
# Check for NAs
production.isnull().value_counts()

Area Code  Area Code (M49)  Area   Item Code  Item Code (CPC)  Item   Element Code  Element  Year Code  Year   Unit   Value  Flag   Note 
False      False            False  False      False            False  False         False    False      False  False  False  False  True     3673453
                                                                                                                                    False      87715
Name: count, dtype: int64

In [7]:
# List unique values in the Note column
print(production["Note"].unique())

[nan 'Unofficial figure']


We can drop some columns we don't need

In [14]:
production = production.drop(columns = 'Note')
production = production.drop(columns = 'Year Code')
production = production.drop(columns = 'Area Code (M49)')
production = production.drop(columns = 'Item Code (CPC)')

In [15]:
production = production.rename(columns={'Area Code' : 'area_code', 'Area' : 'area',
                                                  'Item Code' : 'item_code', 'Item' : 'item',
                                                  'Element Code' : 'element_code', 'Element' : 'element',
                                                  'Year' : 'year', 'Unit' : 'unit', 'Value' : 'value',
                                                  'Flag' : 'flag'})

In [16]:
production.head()

Unnamed: 0,area_code,area,item_code,item,element_code,element,year,unit,value,flag
0,2,Afghanistan,221,"Almonds, in shell",5312,Area harvested,1975,ha,0.0,E
1,2,Afghanistan,221,"Almonds, in shell",5312,Area harvested,1976,ha,5900.0,E
2,2,Afghanistan,221,"Almonds, in shell",5312,Area harvested,1977,ha,6000.0,E
3,2,Afghanistan,221,"Almonds, in shell",5312,Area harvested,1978,ha,6000.0,E
4,2,Afghanistan,221,"Almonds, in shell",5312,Area harvested,1979,ha,6000.0,E


In [17]:
schema = 'capstone_envirolytics'
table_name = 'fao_production'

if engine!=None:
    try:
        production.to_sql(name=table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values 
                        schema=schema, # Use schmea that was defined earlier
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None

The fao_production table was imported successfully.


### Import emissions from crops data

In [4]:
url = f'https://fenixservices.fao.org/faostat/static/bulkdownloads/Emissions_crops_E_All_Data_(Normalized).zip'
path = '/Users/cornelialutz/neuefische/capstone-dataverse-sh/data/' #insert own path here
zip_file = 'Emissions_crops_E_All_Data_(Normalized).zip'
csv_file = 'Emissions_crops_E_All_Data_(Normalized).csv'

In [18]:
r = requests.get(url)
with open(path+zip_file, 'wb') as f:
    f.write(r.content)

data = zipfile.ZipFile(path+zip_file, 'r')
data.extract(csv_file, path)

'/Users/cornelialutz/neuefische/capstone-dataverse-sh/data/Emissions_crops_E_All_Data_(Normalized).csv'

In [5]:
# Import crops emissions data for all countries
emissions_crops = pd.read_csv(path+csv_file, encoding='latin-1', low_memory=False)

In [21]:
emissions_crops.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Year Code,Year,Source Code,Source,Unit,Value,Flag,Note
0,2,'004,Afghanistan,44,'0115,Barley,72430,Crops total (Emissions N2O),1961,1961,3050,FAO TIER 1,kt,0.1141,E,
1,2,'004,Afghanistan,44,'0115,Barley,72430,Crops total (Emissions N2O),1962,1962,3050,FAO TIER 1,kt,0.1141,E,
2,2,'004,Afghanistan,44,'0115,Barley,72430,Crops total (Emissions N2O),1963,1963,3050,FAO TIER 1,kt,0.1141,E,
3,2,'004,Afghanistan,44,'0115,Barley,72430,Crops total (Emissions N2O),1964,1964,3050,FAO TIER 1,kt,0.1145,E,
4,2,'004,Afghanistan,44,'0115,Barley,72430,Crops total (Emissions N2O),1965,1965,3050,FAO TIER 1,kt,0.1145,E,


In [22]:
emissions_crops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891492 entries, 0 to 891491
Data columns (total 16 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Area Code        891492 non-null  int64  
 1   Area Code (M49)  891492 non-null  object 
 2   Area             891492 non-null  object 
 3   Item Code        891492 non-null  int64  
 4   Item Code (CPC)  891492 non-null  object 
 5   Item             891492 non-null  object 
 6   Element Code     891492 non-null  int64  
 7   Element          891492 non-null  object 
 8   Year Code        891492 non-null  int64  
 9   Year             891492 non-null  int64  
 10  Source Code      891492 non-null  int64  
 11  Source           891492 non-null  object 
 12  Unit             891492 non-null  object 
 13  Value            891492 non-null  float64
 14  Flag             891492 non-null  object 
 15  Note             12485 non-null   object 
dtypes: float64(1), int64(6), object(9)
mem

In [40]:
# Check for full duplicates
emissions_crops.duplicated().value_counts()

False    891492
Name: count, dtype: int64

In [41]:
# List unique values in the Note column
print(emissions_crops["Note"].unique())

[nan 'Unofficial figure' 'UNFCCC Repository' 'NC/CRF/BUR' 'NC/BUR/CRF']


In [6]:
#Dropping columns we don't need:
emissions_crops = emissions_crops.drop(columns = 'Area Code (M49)')
emissions_crops = emissions_crops.drop(columns = 'Item Code (CPC)')
emissions_crops = emissions_crops.drop(columns = 'Year Code')
emissions_crops = emissions_crops.drop(columns = 'Source Code')
emissions_crops = emissions_crops.drop(columns = 'Source')
emissions_crops = emissions_crops.drop(columns = 'Note')

In [7]:
emissions_crops = emissions_crops.rename(columns={'Area Code' : 'area_code', 'Area' : 'area',
                                                  'Item Code' : 'item_code', 'Item' : 'item',
                                                  'Element Code' : 'element_code', 'Element' : 'element',
                                                  'Year' : 'year', 'Unit' : 'unit', 'Value' : 'value',
                                                  'Flag' : 'flag'})

In [8]:
emissions_crops.head()

Unnamed: 0,area_code,area,item_code,item,element_code,element,year,unit,value,flag
0,2,Afghanistan,44,Barley,72430,Crops total (Emissions N2O),1961,kt,0.1141,E
1,2,Afghanistan,44,Barley,72430,Crops total (Emissions N2O),1962,kt,0.1141,E
2,2,Afghanistan,44,Barley,72430,Crops total (Emissions N2O),1963,kt,0.1141,E
3,2,Afghanistan,44,Barley,72430,Crops total (Emissions N2O),1964,kt,0.1145,E
4,2,Afghanistan,44,Barley,72430,Crops total (Emissions N2O),1965,kt,0.1145,E


In [11]:
schema = 'capstone_envirolytics'
table_name = 'fao_emissions_crops'

if engine!=None:
    try:
        emissions_crops.to_sql(name=table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values 
                        schema=schema, # Use schmea that was defined earlier
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None

The fao_emissions_crops table was imported successfully.


### Import emissions from livestock data

In [3]:
url = f'https://fenixservices.fao.org/faostat/static/bulkdownloads/Emissions_livestock_E_All_Data_(Normalized).zip'
path = '/Users/cornelialutz/neuefische/capstone-dataverse-sh/data/' #insert own path here
zip_file = 'Emissions_livestock_E_All_Data_(Normalized).zip'
csv_file = 'Emissions_livestock_E_All_Data_(Normalized).csv'

In [31]:
r = requests.get(url)
with open(path+zip_file, 'wb') as f:
    f.write(r.content)

data = zipfile.ZipFile(path+zip_file, 'r')
data.extract(csv_file, path)

'/Users/cornelialutz/neuefische/capstone-dataverse-sh/data/Emissions_livestock_E_All_Data_(Normalized).csv'

In [4]:
# Import crops emissions data for all countries
emissions_livestock = pd.read_csv(path+csv_file, encoding='latin-1', low_memory=False)

In [47]:
emissions_livestock.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Year Code,Year,Source Code,Source,Unit,Value,Flag,Note
0,2,'004,Afghanistan,1107,'02132,Asses,5111,Stocks,1961,1961,3050,FAO TIER 1,An,1300000.0,A,
1,2,'004,Afghanistan,1107,'02132,Asses,5111,Stocks,1962,1962,3050,FAO TIER 1,An,851850.0,A,
2,2,'004,Afghanistan,1107,'02132,Asses,5111,Stocks,1963,1963,3050,FAO TIER 1,An,1001112.0,A,
3,2,'004,Afghanistan,1107,'02132,Asses,5111,Stocks,1964,1964,3050,FAO TIER 1,An,1150000.0,E,
4,2,'004,Afghanistan,1107,'02132,Asses,5111,Stocks,1965,1965,3050,FAO TIER 1,An,1300000.0,A,


In [48]:
emissions_livestock.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6285217 entries, 0 to 6285216
Data columns (total 16 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Area Code        int64  
 1   Area Code (M49)  object 
 2   Area             object 
 3   Item Code        int64  
 4   Item Code (CPC)  object 
 5   Item             object 
 6   Element Code     int64  
 7   Element          object 
 8   Year Code        int64  
 9   Year             int64  
 10  Source Code      int64  
 11  Source           object 
 12  Unit             object 
 13  Value            float64
 14  Flag             object 
 15  Note             object 
dtypes: float64(1), int64(6), object(9)
memory usage: 767.2+ MB


In [49]:
# Check for full duplicates
emissions_livestock.duplicated().value_counts()

False    6285217
Name: count, dtype: int64

In [50]:
# List unique values in the Note column
print(emissions_livestock["Note"].unique())

[nan 'Unofficial figure' 'UNFCCC Repository' 'NC/CRF/BUR'
 'NC/CRF/BUR Unofficial figure']


In [5]:
#Dropping columns we don't need:
emissions_livestock = emissions_livestock.drop(columns = 'Area Code (M49)')
emissions_livestock = emissions_livestock.drop(columns = 'Item Code (CPC)')
emissions_livestock = emissions_livestock.drop(columns = 'Year Code')
emissions_livestock = emissions_livestock.drop(columns = 'Source Code')
emissions_livestock = emissions_livestock.drop(columns = 'Source')
emissions_livestock = emissions_livestock.drop(columns = 'Note')

In [6]:
emissions_livestock = emissions_livestock.rename(columns={'Area Code' : 'area_code', 'Area' : 'area',
                                                  'Item Code' : 'item_code', 'Item' : 'item',
                                                  'Element Code' : 'element_code', 'Element' : 'element',
                                                  'Year' : 'year', 'Unit' : 'unit', 'Value' : 'value',
                                                  'Flag' : 'flag'})

In [22]:
emissions_livestock.head()

Unnamed: 0,area_code,area,item_code,item,element_code,element,year,unit,value,flag
0,2,Afghanistan,1107,Asses,5111,Stocks,1961,An,1300000.0,A
1,2,Afghanistan,1107,Asses,5111,Stocks,1962,An,851850.0,A
2,2,Afghanistan,1107,Asses,5111,Stocks,1963,An,1001112.0,A
3,2,Afghanistan,1107,Asses,5111,Stocks,1964,An,1150000.0,E
4,2,Afghanistan,1107,Asses,5111,Stocks,1965,An,1300000.0,A


In [7]:
schema = 'capstone_envirolytics'
table_name = 'fao_emissions_livestock'

if engine!=None:
    try:
        emissions_livestock.to_sql(name=table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values 
                        schema=schema, # Use schmea that was defined earlier
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None

The fao_emissions_livestock table was imported successfully.
