In [1]:
# Import packages
import pandas as pd
import numpy as np 
import requests 
import zipfile
import psycopg2
import sqlalchemy

import _functions_sql as fs
import _functions_data_files as fdf

# Specify source directory and file (required for the fdf function to work as expected).
# Both values are passed to fdf.get_path() when the CSV is loaded; 'source_file' is
# reassigned further down before the AreaCodes and ItemCodes tables are read.
source_dir = 'faostat_emi_crop'
source_file = 'Emissions_crops_E_All_Data_(Normalized).csv'

### Import & examine emissions crops data from csv

In [2]:
# Load the raw emissions-crops CSV into a pandas dataframe
df_emis_crops = pd.read_csv(
    fdf.get_path(source_file, source_dir),
    encoding='latin-1',
    # Parse 'Note' through str: handles the DtypeWarning without 'low_memory=False'
    converters={'Note': str},
)

In [3]:
# Preview the first rows of the raw data
df_emis_crops.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Year Code,Year,Source Code,Source,Unit,Value,Flag,Note
0,2,'004,Afghanistan,44,'0115,Barley,72430,Crops total (Emissions N2O),1961,1961,3050,FAO TIER 1,kt,0.1141,E,
1,2,'004,Afghanistan,44,'0115,Barley,72430,Crops total (Emissions N2O),1962,1962,3050,FAO TIER 1,kt,0.1141,E,
2,2,'004,Afghanistan,44,'0115,Barley,72430,Crops total (Emissions N2O),1963,1963,3050,FAO TIER 1,kt,0.1141,E,
3,2,'004,Afghanistan,44,'0115,Barley,72430,Crops total (Emissions N2O),1964,1964,3050,FAO TIER 1,kt,0.1145,E,
4,2,'004,Afghanistan,44,'0115,Barley,72430,Crops total (Emissions N2O),1965,1965,3050,FAO TIER 1,kt,0.1145,E,


In [4]:
# Summarise column dtypes and non-null counts of the raw data
df_emis_crops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891492 entries, 0 to 891491
Data columns (total 16 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Area Code        891492 non-null  int64  
 1   Area Code (M49)  891492 non-null  object 
 2   Area             891492 non-null  object 
 3   Item Code        891492 non-null  int64  
 4   Item Code (CPC)  891492 non-null  object 
 5   Item             891492 non-null  object 
 6   Element Code     891492 non-null  int64  
 7   Element          891492 non-null  object 
 8   Year Code        891492 non-null  int64  
 9   Year             891492 non-null  int64  
 10  Source Code      891492 non-null  int64  
 11  Source           891492 non-null  object 
 12  Unit             891492 non-null  object 
 13  Value            891492 non-null  float64
 14  Flag             891492 non-null  object 
 15  Note             891492 non-null  object 
dtypes: float64(1), int64(6), object(9)
mem

In [5]:
# Check for full duplicates: tally rows that exactly repeat an earlier row (True)
# against rows that do not (False)
df_emis_crops.duplicated().value_counts()

False    891492
Name: count, dtype: int64

In [6]:
# Check for NAs: value_counts() on the boolean null mask tallies every distinct
# row-wise pattern of missing (True) / present (False) flags across all columns,
# so a single all-False pattern means no column contains a missing value
df_emis_crops.isnull().value_counts()

Area Code  Area Code (M49)  Area   Item Code  Item Code (CPC)  Item   Element Code  Element  Year Code  Year   Source Code  Source  Unit   Value  Flag   Note 
False      False            False  False      False            False  False         False    False      False  False        False   False  False  False  False    891492
Name: count, dtype: int64

In [7]:
# List unique values in the Note column
# NOTE(review): the empty string presumably stems from the str converter applied
# to 'Note' at load time (blank cells become '' instead of NaN) - confirm
print(df_emis_crops["Note"].unique())

['' 'Unofficial figure' 'UNFCCC Repository' 'NC/CRF/BUR' 'NC/BUR/CRF']


In [8]:
# List unique values in the Source column
print(df_emis_crops["Source"].unique())

['FAO TIER 1' 'UNFCCC']


In [9]:
# Remove the columns that are not needed downstream, in a single drop call
df_emis_crops = df_emis_crops.drop(
    columns=[
        'Area Code (M49)',
        'Item Code (CPC)',
        'Year Code',
        'Flag',
        'Note',
        'Source Code',
    ]
)

In [10]:
# Rename the remaining columns to lower-case snake_case identifiers
df_emis_crops = df_emis_crops.rename(
    columns={
        'Area Code': 'area_code',
        'Area': 'area',
        'Item Code': 'item_code',
        'Item': 'item',
        'Element Code': 'element_code',
        'Element': 'element',
        'Year': 'year',
        'Unit': 'unit',
        'Value': 'value',
        'Source': 'source',
    }
)

In [11]:
# Preview the data after dropping and renaming columns
df_emis_crops.head()

Unnamed: 0,area_code,area,item_code,item,element_code,element,year,source,unit,value
0,2,Afghanistan,44,Barley,72430,Crops total (Emissions N2O),1961,FAO TIER 1,kt,0.1141
1,2,Afghanistan,44,Barley,72430,Crops total (Emissions N2O),1962,FAO TIER 1,kt,0.1141
2,2,Afghanistan,44,Barley,72430,Crops total (Emissions N2O),1963,FAO TIER 1,kt,0.1141
3,2,Afghanistan,44,Barley,72430,Crops total (Emissions N2O),1964,FAO TIER 1,kt,0.1145
4,2,Afghanistan,44,Barley,72430,Crops total (Emissions N2O),1965,FAO TIER 1,kt,0.1145


In [13]:
# Load the AreaCodes lookup table (shipped in the same download zip)
source_file = 'Emissions_crops_E_AreaCodes.csv'
df_emis_crops_areacodes = pd.read_csv(
    fdf.get_path(source_file, source_dir),
    encoding='latin-1',
)

In [14]:
# Preview the first rows of the AreaCodes lookup table
df_emis_crops_areacodes.head()

Unnamed: 0,Area Code,M49 Code,Area
0,2,'004,Afghanistan
1,5100,'002,Africa
2,3,'008,Albania
3,4,'012,Algeria
4,5,'016,American Samoa


In [15]:
# Count rows that exactly repeat an earlier row; 0 means the lookup has no duplicates
# (the first occurrence is never flagged, which is the default behaviour)
df_emis_crops_areacodes.duplicated().sum()

0

In [16]:
# Left-join the AreaCodes lookup onto 'df_emis_crops' (scratch frame) to inspect
# potentially redundant columns
df_emis_crops_temp = df_emis_crops.merge(
    df_emis_crops_areacodes,
    how='left',
    left_on='area_code',
    right_on='Area Code',
)

In [17]:
# Preview the merged scratch frame (main table + AreaCodes columns)
df_emis_crops_temp.head()

Unnamed: 0,area_code,area,item_code,item,element_code,element,year,source,unit,value,Area Code,M49 Code,Area
0,2,Afghanistan,44,Barley,72430,Crops total (Emissions N2O),1961,FAO TIER 1,kt,0.1141,2,'004,Afghanistan
1,2,Afghanistan,44,Barley,72430,Crops total (Emissions N2O),1962,FAO TIER 1,kt,0.1141,2,'004,Afghanistan
2,2,Afghanistan,44,Barley,72430,Crops total (Emissions N2O),1963,FAO TIER 1,kt,0.1141,2,'004,Afghanistan
3,2,Afghanistan,44,Barley,72430,Crops total (Emissions N2O),1964,FAO TIER 1,kt,0.1145,2,'004,Afghanistan
4,2,Afghanistan,44,Barley,72430,Crops total (Emissions N2O),1965,FAO TIER 1,kt,0.1145,2,'004,Afghanistan


In [18]:
# Show the distinct (area, Area) pairs where the name in the main table differs
# from the name in the AreaCodes lookup
df_emis_crops_temp.loc[
    df_emis_crops_temp['area'] != df_emis_crops_temp['Area'],
    ['area', 'Area'],
].drop_duplicates()

Unnamed: 0,area,Area
134410,"China, Hong Kong SAR",China; Hong Kong SAR
135796,"China, Macao SAR",China; Macao SAR
135940,"China, mainland",China; mainland
141728,"China, Taiwan Province of",China; Taiwan Province of


In [19]:
# Normalise the lookup names (';' -> ',') and repeat the comparison; an empty
# result means the remaining differences were purely the separator character
df_emis_crops_temp['Area'] = df_emis_crops_temp['Area'].str.replace(';', ',')
df_emis_crops_temp.loc[
    df_emis_crops_temp['area'] != df_emis_crops_temp['Area'],
    ['area', 'Area'],
].drop_duplicates()

Unnamed: 0,area,Area


In [20]:
# 'area_code' carries no information beyond 'area' (see the comparison against
# the AreaCodes lookup), so remove it
df_emis_crops = df_emis_crops.drop(columns='area_code')

In [22]:
# Load the ItemCodes lookup table (shipped in the same download zip)
source_file = 'Emissions_crops_E_ItemCodes.csv'
df_emis_crops_itemcodes = pd.read_csv(
    fdf.get_path(source_file, source_dir),
    encoding='latin-1',
)

In [23]:
# Preview the first rows of the ItemCodes lookup table
df_emis_crops_itemcodes.head()

Unnamed: 0,Item Code,CPC Code,Item
0,116,'01510,Potatoes
1,15,'0111,Wheat
2,156,'01802,Sugar cane
3,1712,'F1712,All Crops
4,176,'01701,Beans; dry


In [24]:
# Count rows that exactly repeat an earlier row; 0 means the lookup has no duplicates
# (the first occurrence is never flagged, which is the default behaviour)
df_emis_crops_itemcodes.duplicated().sum()

0

In [25]:
# Left-join the ItemCodes lookup onto 'df_emis_crops' (scratch frame) to inspect
# potentially redundant columns
df_emis_crops_temp = df_emis_crops.merge(
    df_emis_crops_itemcodes,
    how='left',
    left_on='item_code',
    right_on='Item Code',
)

In [26]:
# Preview the merged scratch frame (main table + ItemCodes columns)
df_emis_crops_temp.head()

Unnamed: 0,area,item_code,item,element_code,element,year,source,unit,value,Item Code,CPC Code,Item
0,Afghanistan,44,Barley,72430,Crops total (Emissions N2O),1961,FAO TIER 1,kt,0.1141,44.0,'0115,Barley
1,Afghanistan,44,Barley,72430,Crops total (Emissions N2O),1962,FAO TIER 1,kt,0.1141,44.0,'0115,Barley
2,Afghanistan,44,Barley,72430,Crops total (Emissions N2O),1963,FAO TIER 1,kt,0.1141,44.0,'0115,Barley
3,Afghanistan,44,Barley,72430,Crops total (Emissions N2O),1964,FAO TIER 1,kt,0.1145,44.0,'0115,Barley
4,Afghanistan,44,Barley,72430,Crops total (Emissions N2O),1965,FAO TIER 1,kt,0.1145,44.0,'0115,Barley


In [27]:
# Show the distinct (item, Item) pairs where the name in the main table differs
# from the name in the ItemCodes lookup (rows without a lookup match show up too,
# because a missing 'Item' never compares equal)
df_emis_crops_temp.loc[
    df_emis_crops_temp['item'] != df_emis_crops_temp['Item'],
    ['item', 'Item'],
].drop_duplicates()

Unnamed: 0,item,Item
3723,Nutrient nitrogen N (total),
4526,"Beans, dry",Beans; dry


In [28]:
# Compare divergent Item values after replacing ';' with ','
df_emis_crops_temp['Item'] = df_emis_crops_temp['Item'].str.replace(';', ',')
# Compare the main-table name ('item') against the lookup name ('Item').
# The previous mask compared 'Item' with itself, which is only True for missing
# values (NaN != NaN) and therefore could never reveal a genuine name mismatch.
# Rows whose item_code has no match in the ItemCodes lookup still appear here,
# since their 'Item' is missing after the left merge.
df_emis_crops_temp.loc[
    df_emis_crops_temp['item'] != df_emis_crops_temp['Item'],
    ['item', 'Item'],
].drop_duplicates()

Unnamed: 0,item,Item
3723,Nutrient nitrogen N (total),


In [29]:
# List each distinct (element_code, element, unit) combination, ordered by element name
(
    df_emis_crops[['element_code', 'element', 'unit']]
    .drop_duplicates()
    .sort_values('element')
)

Unnamed: 0,element_code,element,unit
2457,5312,Area harvested,ha
693,7245,"Burning crop residues (Biomass burned, dry mat...",t
819,72257,Burning crop residues (Emissions CH4),kt
756,72307,Burning crop residues (Emissions N2O),kt
189,72342,Crop residues (Direct emissions N2O),kt
126,72302,Crop residues (Emissions N2O),kt
252,72362,Crop residues (Indirect emissions N2O),kt
63,72392,Crop residues (N content),kg
378,72440,Crops total (Emissions CH4),kt
0,72430,Crops total (Emissions N2O),kt


In [30]:
# Each element_code maps to exactly one (element, unit) combination, so the code
# column is redundant and can be dropped
df_emis_crops = df_emis_crops.drop(columns='element_code')

In [31]:
# Preview the data after removing the redundant code columns
df_emis_crops.head()

Unnamed: 0,area,item_code,item,element,year,source,unit,value
0,Afghanistan,44,Barley,Crops total (Emissions N2O),1961,FAO TIER 1,kt,0.1141
1,Afghanistan,44,Barley,Crops total (Emissions N2O),1962,FAO TIER 1,kt,0.1141
2,Afghanistan,44,Barley,Crops total (Emissions N2O),1963,FAO TIER 1,kt,0.1141
3,Afghanistan,44,Barley,Crops total (Emissions N2O),1964,FAO TIER 1,kt,0.1145
4,Afghanistan,44,Barley,Crops total (Emissions N2O),1965,FAO TIER 1,kt,0.1145


In [32]:
# Put the columns into their final order for export
df_emis_crops = df_emis_crops[
    [
        'area',
        'year',
        'item',
        'item_code',
        'element',
        'unit',
        'value',
        'source',
    ]
]

In [33]:
# Export the dataframe through the project's SQL helper module under the
# table name 'fao_emis_crop_source'
# NOTE(review): target database/schema and overwrite behaviour are decided inside
# fs.write_dataframe and are not visible here - confirm before re-running
fs.write_dataframe(df_emis_crops, 'fao_emis_crop_source')

+ table written: fao_emis_crop_source
