# [FAO] emissions from crops

In [99]:
import pandas as pd
import numpy as np
import re as re

import _functions_sql as fs
import _functions_data_files as fdf

# location of the source CSV: directory name and file name, both handed to
# fdf.get_path() when the file is read
source_dir = 'fao_emissions_crops'
source_file = 'Emissions_crops_E_All_Data_(Normalized).csv'

## import from CSV & show general information

In [100]:
# resolve the CSV location first, then load it into a pandas dataframe
csv_path = fdf.get_path(source_file, source_dir)
# reading 'Note' through the str converter avoids the DtypeWarning without
# having to fall back to 'low_memory=False'
df_raw = pd.read_csv(csv_path, encoding='latin-1', converters={'Note': str})

In [101]:
# preview the first rows of the raw import to eyeball columns and values
df_raw.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Year Code,Year,Source Code,Source,Unit,Value,Flag,Note
0,2,'004,Afghanistan,44,'0115,Barley,72430,Crops total (Emissions N2O),1961,1961,3050,FAO TIER 1,kt,0.1141,E,
1,2,'004,Afghanistan,44,'0115,Barley,72430,Crops total (Emissions N2O),1962,1962,3050,FAO TIER 1,kt,0.1141,E,
2,2,'004,Afghanistan,44,'0115,Barley,72430,Crops total (Emissions N2O),1963,1963,3050,FAO TIER 1,kt,0.1141,E,
3,2,'004,Afghanistan,44,'0115,Barley,72430,Crops total (Emissions N2O),1964,1964,3050,FAO TIER 1,kt,0.1145,E,
4,2,'004,Afghanistan,44,'0115,Barley,72430,Crops total (Emissions N2O),1965,1965,3050,FAO TIER 1,kt,0.1145,E,


In [102]:
# print the table summary: column names, non-null counts and dtypes
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891492 entries, 0 to 891491
Data columns (total 16 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Area Code        891492 non-null  int64  
 1   Area Code (M49)  891492 non-null  object 
 2   Area             891492 non-null  object 
 3   Item Code        891492 non-null  int64  
 4   Item Code (CPC)  891492 non-null  object 
 5   Item             891492 non-null  object 
 6   Element Code     891492 non-null  int64  
 7   Element          891492 non-null  object 
 8   Year Code        891492 non-null  int64  
 9   Year             891492 non-null  int64  
 10  Source Code      891492 non-null  int64  
 11  Source           891492 non-null  object 
 12  Unit             891492 non-null  object 
 13  Value            891492 non-null  float64
 14  Flag             891492 non-null  object 
 15  Note             891492 non-null  object 
dtypes: float64(1), int64(6), object(9)
mem

In [103]:
# count fully duplicated rows (True = the row repeats an earlier row in every
# column, False = the row is unique so far)
df_raw.duplicated().value_counts()

False    891492
Name: count, dtype: int64

In [104]:
# count the distinct per-row null patterns; a single all-False pattern means
# that no column contains a null value
df_raw.isnull().value_counts()

Area Code  Area Code (M49)  Area   Item Code  Item Code (CPC)  Item   Element Code  Element  Year Code  Year   Source Code  Source  Unit   Value  Flag   Note 
False      False            False  False      False            False  False         False    False      False  False        False   False  False  False  False    891492
Name: count, dtype: int64

## data cleaning

In [105]:
# normalise the column names in three vectorised steps:
# 1. trim leading/trailing whitespace
# 2. lower-case everything
# 3. turn every non-alphanumerical character into '_'
#    (e.g. 'Area Code (M49)' becomes 'area_code__m49_')
trimmed_lower = df_raw.columns.str.strip().str.lower()
df_raw.columns = trimmed_lower.str.replace('[^a-zA-Z0-9]', '_', regex=True)

In [106]:
# clean on an independent copy so that df_raw itself is not touched by the
# column drops below (df_raw is written to the database at the end)
df_clean = df_raw.copy()

### drop 'area_code' & 'area_code__m49_' (redundant to 'area')

:FIXME: :TODO: has yet to be proven!!!

In [107]:
# remove both area code columns; 'area' remains as the area identifier
df_clean = df_clean.drop(columns=['area_code', 'area_code__m49_'])

### drop 'item_code__cpc_' (redundant to 'item_code')

:FIXME: :TODO: has yet to be proven!!!

In [108]:
# remove the CPC item code; 'item_code' is kept
df_clean = df_clean.drop(columns=['item_code__cpc_'])

### drop 'element_code' (redundant to 'element')

:FIXME: :TODO: has yet to be proven!!!

'element_code' stands for a unique combination of 'element' and 'unit', so it can be dropped.

In [109]:
# remove the element code; 'element' is kept
df_clean = df_clean.drop(columns=['element_code'])

### drop 'year_code' (redundant to 'year')

:FIXME: :TODO: has yet to be proven!!!

In [110]:
# remove the year code; 'year' is kept
df_clean = df_clean.drop(columns=['year_code'])

### drop 'source_code' (redundant to 'source')

:FIXME: :TODO: has yet to be proven!!!

In [111]:
# remove the source code; 'source' is kept
df_clean = df_clean.drop(columns=['source_code'])

### drop 'note' (no added value)

:FIXME: :TODO: has yet to be proven!!!

In [112]:
# replace empty strings with NaN so that missing notes count as null values
# (the CSV import reads 'Note' via the str converter, so blanks arrive as '').
# The result is assigned back to the column: an in-place replace() on the
# selected column is a chained assignment, which pandas warns about and which
# does not modify df_clean once Copy-on-Write is active.
df_clean['note'] = df_clean['note'].replace('', np.nan)

In [113]:
# show unique values & counts of 'note', including the NaN bucket
# (dropna=False keeps missing values in the tally)
df_clean['note'].value_counts(dropna=False)

note
NaN                  879007
NC/CRF/BUR            11416
UNFCCC Repository       819
Unofficial figure       186
NC/BUR/CRF               64
Name: count, dtype: int64

In [114]:
# remove the 'note' column
df_clean = df_clean.drop(columns=['note'])

### drop 'flag' (no added value)

:FIXME: :TODO: has yet to be proven!!!

In [115]:
# remove the 'flag' column
df_clean = df_clean.drop(columns=['flag'])

## data wrangling

In [116]:
# wrangle on an independent copy so that df_clean keeps the cleaned state
df_wrangled = df_clean.copy()

### drop duplicate rows from divergent 'source'

When the same area / item / element / year combination is reported by more than one source, only the 'FAO TIER 1' row is kept and the rows from other sources are dropped.

In [117]:
# show unique values & counts of 'source' (dropna=False would also list NaN)
df_wrangled['source'].value_counts(dropna=False)

source
FAO TIER 1    874261
UNFCCC         17231
Name: count, dtype: int64

In [118]:
# number of rows that share their key with at least one other row, i.e. rows
# that differ only by 'source' (keep=False marks every member of such a group)
shares_key = df_wrangled.duplicated(
    subset=['area', 'item_code', 'item', 'element', 'year'],
    keep=False,
)
int(shares_key.sum())

33858

In [119]:
# columns that identify one observation regardless of its source
dedup_key = ['area', 'item_code', 'item', 'element', 'year']

# Order every key group by 'source' and keep only its first row.
# With an ascending string sort 'FAO TIER 1' comes before 'UNFCCC', so the
# FAO row is the one that survives whenever both sources are present.
df_wrangled = (
    df_wrangled
    .sort_values(dedup_key + ['source'], ascending=True, na_position='last')
    .drop_duplicates(subset=dedup_key, keep='first')
)

### drop all but totals values from 'element'

In [120]:
# show unique values & counts of 'element' to see which measures exist
df_wrangled['element'].value_counts(dropna=False)

element
Crops total (Emissions N2O)                                          115596
Crop residues (Emissions N2O)                                        115514
Crop residues (Direct emissions N2O)                                 103362
Crop residues (N content)                                            103362
Crop residues (Indirect emissions N2O)                               103331
Crops total (Emissions CH4)                                           52877
Burning crop residues (Emissions CH4)                                 52877
Burning crop residues (Biomass burned, dry matter)                    52871
Burning crop residues (Emissions N2O)                                 52871
Synthetic fertilizers (Direct emissions N2O)                          12885
Synthetic fertilizers (Agricultural use)                              12885
Nitrogen fertilizer content applied that volatilises                  12850
Synthetic fertilizers (Emissions N2O)                                 12850
Synt

In [121]:
# keep only the rows whose 'element' is one of the two totals measures
totals_elements = [
    'Crops total (Emissions CH4)',
    'Crops total (Emissions N2O)',
]
is_total = df_wrangled['element'].isin(totals_elements)
df_wrangled = df_wrangled.loc[is_total].copy()
print('remaining rows:', len(df_wrangled))

remaining rows: 168473


### split 'element' column into separate columns using 'value' and 'unit'

In [122]:
# the composite key including 'element' must be unique: expect only False
df_wrangled.duplicated(
    subset=['area', 'year', 'item', 'element']
).value_counts()

False    168473
Name: count, dtype: int64

In [123]:
# verify all values are non-negative (>= 0); the later aggregation via 'max'
# only works if 0 is the smallest value that can occur
df_wrangled['value'].ge(0).all()

True

In [124]:
# verify all values have the same unit: exactly one entry is expected here,
# otherwise the values of different rows would not be comparable
df_wrangled['unit'].value_counts(dropna=False)

unit
kt    168473
Name: count, dtype: int64

In [125]:
# target column name for each of the two expected 'element' values
element_to_column = {
      'Crops total (Emissions CH4)': 'emissions_ch4'
    , 'Crops total (Emissions N2O)': 'emissions_n2o'
}

# one indicator column per 'element' value (0/1 depending on the row's
# element), renamed to the target column names
df_dummies = pd.get_dummies(df_wrangled['element']).rename(
    columns=element_to_column
)

# scale the indicators row-wise by 'value': the matching column receives the
# actual value, the other one 0
# NOTE(review): a measure that is missing for a row therefore ends up as 0.0,
# not NaN - confirm that this is intended downstream
df_new_cols = df_dummies.mul(df_wrangled['value'], axis=0)

# append the new emission columns to the wrangled data
df_wrangled = pd.concat([df_wrangled, df_new_cols], axis=1)

In [126]:
# compare the sum of the original 'value' entries per element with the sum of
# the derived column; any difference is precision lost by the split
is_ch4 = df_wrangled['element'] == 'Crops total (Emissions CH4)'
is_n2o = df_wrangled['element'] == 'Crops total (Emissions N2O)'

ch4_loss = (
    df_wrangled[is_ch4]['value'].sum() - df_wrangled['emissions_ch4'].sum()
)
n2o_loss = (
    df_wrangled[is_n2o]['value'].sum() - df_wrangled['emissions_n2o'].sum()
)

print('precision loss for emissions_ch4:', ch4_loss)
print('precision loss for emissions_n2o:', n2o_loss)

precision loss for emissions_ch4: -1.862645149230957e-09
precision loss for emissions_n2o: 0.0


In [127]:
# columns that identify one output row
group_keys = [
    'area', 'item_code', 'item', 'year', 'source', 'emissions_unit'
]

# 1. rename 'unit', since it now describes the emission columns
# 2. drop 'element' and 'value', which the emission columns have replaced
# 3. collapse the rows of each key group into one; as every value is >= 0 and
#    the non-matching column holds 0, 'max' picks the actual value
# NOTE(review): 'source' is part of the key, so CH4 and N2O rows that survived
# from different sources would stay on separate rows - the composite key check
# further down guards against that
df_wrangled = (
    df_wrangled
    .rename(columns={'unit': 'emissions_unit'})
    .drop(columns=['element', 'value'])
    .groupby(group_keys)
    .agg({'emissions_ch4': 'max', 'emissions_n2o': 'max'})
    .reset_index()
)

### verify 'year' values

note: '2030' and '2050' are official forecasts

In [128]:
# list every distinct year in ascending order to show the covered time span
# NOTE(review): this inspects df_raw, not df_wrangled - confirm that is intended
np.sort(df_raw['year'].unique())

array([1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971,
       1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982,
       1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993,
       1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
       2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017, 2018, 2019, 2020, 2021, 2030, 2050])

### reorder columns

In [129]:
# final column layout: identifying columns first, then the measures
column_order = [
    'area',
    'year',
    'item',
    'item_code',
    'source',
    'emissions_ch4',
    'emissions_n2o',
    'emissions_unit',
]
# selecting by this list silently discards every column that is not listed,
# so warn when the column counts do not match
if df_wrangled.shape[1] != len(column_order):
    print('warning: dropping columns')
df_wrangled = df_wrangled.loc[:, column_order]

## final checks & upload to database server

In [130]:
# preview the first rows of the wrangled result before uploading it
df_wrangled.head()

Unnamed: 0,area,year,item,item_code,source,emissions_ch4,emissions_n2o,emissions_unit
0,Afghanistan,1961,Wheat,15,FAO TIER 1,2.4084,0.7428,kt
1,Afghanistan,1962,Wheat,15,FAO TIER 1,2.5283,0.7541,kt
2,Afghanistan,1963,Wheat,15,FAO TIER 1,2.5283,0.6791,kt
3,Afghanistan,1964,Wheat,15,FAO TIER 1,2.5326,0.7435,kt
4,Afghanistan,1965,Wheat,15,FAO TIER 1,2.5348,0.7554,kt


In [131]:
# print the table summary of the wrangled result: column names, non-null
# counts and dtypes
df_wrangled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115602 entries, 0 to 115601
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   area            115602 non-null  object 
 1   year            115602 non-null  int64  
 2   item            115602 non-null  object 
 3   item_code       115602 non-null  int64  
 4   source          115602 non-null  object 
 5   emissions_ch4   115602 non-null  float64
 6   emissions_n2o   115602 non-null  float64
 7   emissions_unit  115602 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 7.1+ MB


In [132]:
# the composite key (area, year, item) must be unique: expect only False
df_wrangled.duplicated(subset=['area', 'year', 'item']).value_counts()

False    115602
Name: count, dtype: int64

### write raw data

In [133]:
# upload the raw data (with normalised column names) as its own table, then
# call the grant_access procedure for that table
table_name = 'fao_emissions_crops_raw_sh'
fs.write_dataframe(df_raw, table_name)
fs.run_command(f"CALL grant_access('{table_name}')")

+ table written: fao_emissions_crops_raw_sh


### write wrangled data

In [134]:
# upload the wrangled data as its own table, then call the grant_access
# procedure for that table
table_name = 'fao_emissions_crops_wrangled_sh'
fs.write_dataframe(df_wrangled, table_name)
fs.run_command(f"CALL grant_access('{table_name}')")

+ table written: fao_emissions_crops_wrangled_sh


In [None]:
df_raw.qu