In [1]:
import pandas as pd
import _functions_sql as fs
import _functions_data_files as fdf

### Import & examine emissions livestock data from CSV

In [2]:
# directory identifier handed to fdf.get_path() together with each file name below
source_dir = 'faostat_emi_live'

In [3]:
# Main data file of the download.
file_name = 'Emissions_livestock_E_All_Data_(Normalized).csv'

# Read 'Note' through the `str` converter: this sidesteps the DtypeWarning
# without resorting to 'low_memory=False', and as a consequence missing notes
# are read as empty strings ('') instead of NaN.
conv = {'Note': str}

df_emis_livestock = pd.read_csv(
    fdf.get_path(file_name, source_dir),
    encoding='latin-1',  # content is not UTF-8 compatible (open TODO/FIXME: confirm the actual encoding)
    converters=conv,
)

In [4]:
# preview the first rows of the freshly imported frame
df_emis_livestock.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Year Code,Year,Source Code,Source,Unit,Value,Flag,Note
0,2,'004,Afghanistan,1107,'02132,Asses,5111,Stocks,1961,1961,3050,FAO TIER 1,An,1300000.0,A,
1,2,'004,Afghanistan,1107,'02132,Asses,5111,Stocks,1962,1962,3050,FAO TIER 1,An,851850.0,A,
2,2,'004,Afghanistan,1107,'02132,Asses,5111,Stocks,1963,1963,3050,FAO TIER 1,An,1001112.0,A,
3,2,'004,Afghanistan,1107,'02132,Asses,5111,Stocks,1964,1964,3050,FAO TIER 1,An,1150000.0,E,
4,2,'004,Afghanistan,1107,'02132,Asses,5111,Stocks,1965,1965,3050,FAO TIER 1,An,1300000.0,A,


In [5]:
# column dtypes, row count and memory footprint of the raw import
df_emis_livestock.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6285217 entries, 0 to 6285216
Data columns (total 16 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Area Code        int64  
 1   Area Code (M49)  object 
 2   Area             object 
 3   Item Code        int64  
 4   Item Code (CPC)  object 
 5   Item             object 
 6   Element Code     int64  
 7   Element          object 
 8   Year Code        int64  
 9   Year             int64  
 10  Source Code      int64  
 11  Source           object 
 12  Unit             object 
 13  Value            float64
 14  Flag             object 
 15  Note             object 
dtypes: float64(1), int64(6), object(9)
memory usage: 767.2+ MB


In [6]:
# Check for full duplicates (rows identical in every column);
# a result containing only `False` means no such rows exist
df_emis_livestock.duplicated().value_counts()

False    6285217
Name: count, dtype: int64

In [7]:
# Check for NAs: counts how often each row-wise pattern of null flags occurs.
# NOTE: 'Note' was read through a `str` converter, so its missing values are
# empty strings ('') and are not reported as null by this check.
df_emis_livestock.isnull().value_counts()

Area Code  Area Code (M49)  Area   Item Code  Item Code (CPC)  Item   Element Code  Element  Year Code  Year   Source Code  Source  Unit   Value  Flag   Note 
False      False            False  False      False            False  False         False    False      False  False        False   False  False  False  False    6285217
Name: count, dtype: int64

In [8]:
# List unique values in the Note column ('' stands for "no note", see the converter used on import)
print(df_emis_livestock["Note"].unique())

['' 'Unofficial figure' 'UNFCCC Repository' 'NC/CRF/BUR'
 'NC/CRF/BUR Unofficial figure']


In [9]:
# List unique values in the Source column
print(df_emis_livestock["Source"].unique())

['FAO TIER 1' 'UNFCCC']


We can drop some columns we don't need

In [10]:
# Drop all columns that are not needed further on in a single call.
# (Previously six chained single-column drops, each of which copied the
# ~6.3M-row frame.) A missing column still raises a KeyError, as before.
df_emis_livestock = df_emis_livestock.drop(
    columns=[
        'Area Code (M49)',
        'Item Code (CPC)',
        'Year Code',
        'Flag',
        'Note',
        'Source Code',
    ]
)

In [11]:
# Rename the remaining columns to lower-case snake_case identifiers.
df_emis_livestock = df_emis_livestock.rename(
    columns={
        'Area Code': 'area_code',
        'Area': 'area',
        'Item Code': 'item_code',
        'Item': 'item',
        'Element Code': 'element_code',
        'Element': 'element',
        'Year': 'year',
        'Unit': 'unit',
        'Value': 'value',
        'Source': 'source',
    }
)

In [12]:
# confirm the column drop and rename took effect
df_emis_livestock.head()

Unnamed: 0,area_code,area,item_code,item,element_code,element,year,source,unit,value
0,2,Afghanistan,1107,Asses,5111,Stocks,1961,FAO TIER 1,An,1300000.0
1,2,Afghanistan,1107,Asses,5111,Stocks,1962,FAO TIER 1,An,851850.0
2,2,Afghanistan,1107,Asses,5111,Stocks,1963,FAO TIER 1,An,1001112.0
3,2,Afghanistan,1107,Asses,5111,Stocks,1964,FAO TIER 1,An,1150000.0
4,2,Afghanistan,1107,Asses,5111,Stocks,1965,FAO TIER 1,An,1300000.0


In [13]:
# Load the AreaCodes lookup table (provided by the same download zip).
file_name = 'Emissions_livestock_E_AreaCodes.csv'
df_emis_livestock_areacodes = pd.read_csv(
    fdf.get_path(file_name, source_dir),
    encoding='latin-1',  # content is not UTF-8 compatible (open TODO/FIXME: confirm the actual encoding)
)

In [14]:
# preview the area-code lookup table
df_emis_livestock_areacodes.head()

Unnamed: 0,Area Code,M49 Code,Area
0,2,'004,Afghanistan
1,5100,'002,Africa
2,3,'008,Albania
3,4,'012,Algeria
4,5,'016,American Samoa


In [15]:
# verify no fully duplicated rows are present (all columns are compared;
# uniqueness of the 'Area Code' key on its own is not checked here)
df_emis_livestock_areacodes.duplicated(keep='first').sum()

0

In [16]:
# Left-join the area-code lookup onto the data ('area_code' -> 'Area Code') so
# that the potentially redundant columns can be inspected side by side.
df_emis_livestock_temp = df_emis_livestock.merge(
    df_emis_livestock_areacodes,
    how='left',
    left_on='area_code',
    right_on='Area Code',
)

In [17]:
# preview the merged frame: data columns followed by the lookup columns
df_emis_livestock_temp.head()

Unnamed: 0,area_code,area,item_code,item,element_code,element,year,source,unit,value,Area Code,M49 Code,Area
0,2,Afghanistan,1107,Asses,5111,Stocks,1961,FAO TIER 1,An,1300000.0,2,'004,Afghanistan
1,2,Afghanistan,1107,Asses,5111,Stocks,1962,FAO TIER 1,An,851850.0,2,'004,Afghanistan
2,2,Afghanistan,1107,Asses,5111,Stocks,1963,FAO TIER 1,An,1001112.0,2,'004,Afghanistan
3,2,Afghanistan,1107,Asses,5111,Stocks,1964,FAO TIER 1,An,1150000.0,2,'004,Afghanistan
4,2,Afghanistan,1107,Asses,5111,Stocks,1965,FAO TIER 1,An,1300000.0,2,'004,Afghanistan


In [18]:
# Distinct (area, Area) pairs where the area name in the data differs from the
# name delivered by the lookup table.
df_emis_livestock_temp.loc[
    df_emis_livestock_temp['area'] != df_emis_livestock_temp['Area'],
    ['area', 'Area'],
].drop_duplicates()

Unnamed: 0,area,Area
923883,"China, Hong Kong SAR",China; Hong Kong SAR
950938,"China, Macao SAR",China; Macao SAR
958309,"China, mainland",China; mainland
992077,"China, Taiwan Province of",China; Taiwan Province of


In [19]:
# The lookup table writes ';' where the data writes ','. Normalise the lookup
# names, then repeat the comparison; an empty result means the names agree.
df_emis_livestock_temp['Area'] = (
    df_emis_livestock_temp['Area'].str.replace(';', ',')
)
df_emis_livestock_temp.loc[
    df_emis_livestock_temp['area'] != df_emis_livestock_temp['Area'],
    ['area', 'Area'],
].drop_duplicates()

Unnamed: 0,area,Area


In [20]:
# 'area_code' contains no additional info beyond 'area', so remove it
df_emis_livestock = df_emis_livestock.drop(columns='area_code')

In [21]:
# Load the ItemCodes lookup table (provided by the same download zip).
file_name = 'Emissions_livestock_E_ItemCodes.csv'
df_emis_livestock_itemcodes = pd.read_csv(
    fdf.get_path(file_name, source_dir),
    encoding='latin-1',  # content is not UTF-8 compatible (open TODO/FIXME: confirm the actual encoding)
)

In [22]:
# preview the item-code lookup table
df_emis_livestock_itemcodes.head()

Unnamed: 0,Item Code,CPC Code,Item
0,1016,'02123,Goats
1,1048,'F1048,Swine
2,1049,'F1049,Swine; market
3,1051,'F1051,Swine; breeding
4,1052,'F1052,Chickens; layers


In [23]:
# verify no fully duplicated rows are present (all columns are compared;
# uniqueness of the 'Item Code' key on its own is not checked here)
df_emis_livestock_itemcodes.duplicated(keep='first').sum()

0

In [24]:
# Left-join the item-code lookup onto the data ('item_code' -> 'Item Code') so
# that the potentially redundant columns can be inspected side by side.
df_emis_livestock_temp = df_emis_livestock.merge(
    df_emis_livestock_itemcodes,
    how='left',
    left_on='item_code',
    right_on='Item Code',
)

In [25]:
# preview the merged frame: data columns followed by the lookup columns
df_emis_livestock_temp.head()

Unnamed: 0,area,item_code,item,element_code,element,year,source,unit,value,Item Code,CPC Code,Item
0,Afghanistan,1107,Asses,5111,Stocks,1961,FAO TIER 1,An,1300000.0,1107,'02132,Asses
1,Afghanistan,1107,Asses,5111,Stocks,1962,FAO TIER 1,An,851850.0,1107,'02132,Asses
2,Afghanistan,1107,Asses,5111,Stocks,1963,FAO TIER 1,An,1001112.0,1107,'02132,Asses
3,Afghanistan,1107,Asses,5111,Stocks,1964,FAO TIER 1,An,1150000.0,1107,'02132,Asses
4,Afghanistan,1107,Asses,5111,Stocks,1965,FAO TIER 1,An,1300000.0,1107,'02132,Asses


In [26]:
# Distinct (item, Item) pairs where the item name in the data differs from the
# name delivered by the lookup table.
df_emis_livestock_temp.loc[
    df_emis_livestock_temp['item'] != df_emis_livestock_temp['Item'],
    ['item', 'Item'],
].drop_duplicates()

Unnamed: 0,item,Item
3150,"Cattle, dairy",Cattle; dairy
4725,"Cattle, non-dairy",Cattle; non-dairy
6300,"Chickens, broilers",Chickens; broilers
7812,"Chickens, layers",Chickens; layers
41954,"Swine, breeding",Swine; breeding
43529,"Swine, market",Swine; market


In [27]:
# compare divergent Item values after replacing ';' with ','
df_emis_livestock_temp['Item'] = df_emis_livestock_temp['Item'].str.replace(';', ',')
# Compare the data column 'item' against the lookup column 'Item'.
# (The mask previously compared 'Item' with itself, which is empty for every
# non-null value and therefore verified nothing.)
# An empty result means the names agree once the separator is normalised.
df_emis_livestock_temp.loc[
    df_emis_livestock_temp['item'] != df_emis_livestock_temp['Item'],
    ['item', 'Item'],
].drop_duplicates()

Unnamed: 0,item,Item


In [28]:
# The 'item_code' column contains no additional info other than 'item' and could be dropped.
# Open question: does it need to be kept for merging with other tables?
# df_emis_livestock = df_emis_livestock.drop(columns = ['item_code'])	

In [29]:
# List every distinct (element_code, element, unit) combination, ordered by element name.
(
    df_emis_livestock
    .loc[:, ['element_code', 'element', 'unit']]
    .drop_duplicates()
    .sort_values(by='element')
)

Unnamed: 0,element_code,element,unit
1260,72301,Emissions (N2O) (Manure applied),kt
189,72254,Enteric fermentation (Emissions CH4),kt
1386,723612,Indirect emissions (N2O that leaches) (Manure ...,kt
882,723602,Indirect emissions (N2O that leaches) (Manure ...,kt
1449,723611,Indirect emissions (N2O that volatilises) (Man...,kt
945,723601,Indirect emissions (N2O that volatilises) (Man...,kt
126,72441,Livestock total (Emissions CH4),kt
63,72431,Livestock total (Emissions N2O),kt
1323,72341,Manure applied to soils (Direct emissions N2O),kt
1512,72361,Manure applied to soils (Indirect emissions N2O),kt


In [30]:
# element_code stands for a unique combination of element and unit, so it can be dropped
df_emis_livestock = df_emis_livestock.drop(columns='element_code')

In [31]:
# preview the frame after removing 'element_code'
df_emis_livestock.head()

Unnamed: 0,area,item_code,item,element,year,source,unit,value
0,Afghanistan,1107,Asses,Stocks,1961,FAO TIER 1,An,1300000.0
1,Afghanistan,1107,Asses,Stocks,1962,FAO TIER 1,An,851850.0
2,Afghanistan,1107,Asses,Stocks,1963,FAO TIER 1,An,1001112.0
3,Afghanistan,1107,Asses,Stocks,1964,FAO TIER 1,An,1150000.0
4,Afghanistan,1107,Asses,Stocks,1965,FAO TIER 1,An,1300000.0


In [32]:
# Put the columns into their final order.
df_emis_livestock = df_emis_livestock.loc[
    :,
    ['area', 'year', 'item', 'item_code', 'element', 'unit', 'value', 'source'],
]

In [33]:
# final check of the column order before the frame is written out
df_emis_livestock.head()

Unnamed: 0,area,year,item,item_code,element,unit,value,source
0,Afghanistan,1961,Asses,1107,Stocks,An,1300000.0,FAO TIER 1
1,Afghanistan,1962,Asses,1107,Stocks,An,851850.0,FAO TIER 1
2,Afghanistan,1963,Asses,1107,Stocks,An,1001112.0,FAO TIER 1
3,Afghanistan,1964,Asses,1107,Stocks,An,1150000.0,FAO TIER 1
4,Afghanistan,1965,Asses,1107,Stocks,An,1300000.0,FAO TIER 1


In [35]:
# Hand the cleaned frame to the project helper under the name
# 'fao_emis_livestock_source'. The logged output suggests a table of that name
# is written; the storage target is defined in _functions_sql (not visible here).
fs.write_dataframe(df_emis_livestock, 'fao_emis_livestock_source')

+ table written: fao_emis_livestock_source
