In [1]:
import pandas as pd
import _functions_sql as fs
import _functions_data_files as fdf

### Import & examine crops & livestock production data from csv

In [None]:
# directory argument handed to fdf.get_file_path() for every FAOSTAT file read in this notebook
source_dir = 'faostat_prod'

In [76]:
file_name = 'Production_Crops_Livestock_E_All_Data_(Normalized).csv'

# Parse 'Note' through an explicit str converter: this sidesteps the mixed-type
# DtypeWarning without resorting to 'low_memory=False'.
# Side effect: missing notes are read as empty strings ('') instead of NaN.
conv = {'Note': str}

df_prod = pd.read_csv(
    fdf.get_file_path(file_name, source_dir),
    encoding='latin-1',  # content is not UTF-8 compatible :TODO: :FIXME: ???
    converters=conv,
)

In [3]:
# preview the first rows of the raw import
df_prod.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note
0,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1961,1961,ha,0.0,A,
1,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1962,1962,ha,0.0,A,
2,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1963,1963,ha,0.0,A,
3,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1964,1964,ha,0.0,A,
4,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1965,1965,ha,0.0,A,


In [4]:
# overview of row count, column dtypes and memory usage
df_prod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4127584 entries, 0 to 4127583
Data columns (total 14 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Area Code        int64  
 1   Area Code (M49)  object 
 2   Area             object 
 3   Item Code        int64  
 4   Item Code (CPC)  object 
 5   Item             object 
 6   Element Code     int64  
 7   Element          object 
 8   Year Code        int64  
 9   Year             int64  
 10  Unit             object 
 11  Value            float64
 12  Flag             object 
 13  Note             object 
dtypes: float64(1), int64(5), object(8)
memory usage: 440.9+ MB


In [5]:
# Check for full duplicates: duplicated() flags every row that exactly repeats an
# earlier row; value_counts() then tallies the True / False flags
df_prod.duplicated().value_counts()

False    4127584
Name: count, dtype: int64

In [6]:
# Check for NAs: value_counts() on the boolean null-mask lists each distinct
# per-row pattern of missing columns together with its frequency.
# NOTE: 'Note' cannot show up as null here, because it was read through a str
# converter and missing entries therefore arrive as '' (empty string).
df_prod.isnull().value_counts()

Area Code  Area Code (M49)  Area   Item Code  Item Code (CPC)  Item   Element Code  Element  Year Code  Year   Unit   Value  Flag   Note 
False      False            False  False      False            False  False         False    False      False  False  False  False  False    4127584
Name: count, dtype: int64

In [7]:
# List unique values in the Note column ('' stands for "no note", see the converter used on import)
print(df_prod["Note"].unique())

['' 'Unofficial figure']


We can drop some columns we don't need

In [77]:
# remove all columns that are not used in the rest of the notebook in one go
df_prod = df_prod.drop(columns=[
    'Area Code (M49)',
    'Item Code (CPC)',
    'Year Code',
    'Flag',
    'Note',
])

In [78]:
# switch the remaining columns to snake_case names
df_prod = df_prod.rename(
    columns={
        'Area Code': 'area_code',
        'Area': 'area',
        'Item Code': 'item_code',
        'Item': 'item',
        'Element Code': 'element_code',
        'Element': 'element',
        'Year': 'year',
        'Unit': 'unit',
        'Value': 'value',
    }
)

In [10]:
# preview after dropping and renaming columns
# NOTE(review): the saved output of this cell still lists a 'flag' column although
# 'Flag' is dropped above — the output appears to stem from an earlier run; re-run to refresh
df_prod.head()

Unnamed: 0,area_code,area,item_code,item,element_code,element,year,unit,value,flag
0,2,Afghanistan,221,"Almonds, in shell",5312,Area harvested,1961,ha,0.0,A
1,2,Afghanistan,221,"Almonds, in shell",5312,Area harvested,1962,ha,0.0,A
2,2,Afghanistan,221,"Almonds, in shell",5312,Area harvested,1963,ha,0.0,A
3,2,Afghanistan,221,"Almonds, in shell",5312,Area harvested,1964,ha,0.0,A
4,2,Afghanistan,221,"Almonds, in shell",5312,Area harvested,1965,ha,0.0,A


In [11]:
# load the AreaCodes lookup table (shipped in the same download zip as the main file)
file_name = 'Production_Crops_Livestock_E_AreaCodes.csv'
df_prod_areacodes = pd.read_csv(
    fdf.get_file_path(file_name, source_dir),
    encoding='latin-1',  # content is not UTF-8 compatible :TODO: :FIXME: ???
)

In [12]:
# preview the area code lookup table
df_prod_areacodes.head()

Unnamed: 0,Area Code,M49 Code,Area
0,2,'004,Afghanistan
1,5100,'002,Africa
2,3,'008,Albania
3,4,'012,Algeria
4,5200,'019,Americas


In [13]:
# sanity check: number of rows that exactly repeat an earlier row (expected: 0)
df_prod_areacodes.duplicated().sum()

0

In [60]:
# attach the lookup columns to every production row (left join on the area code)
# so the possibly redundant area columns can be compared side by side
df_prod_temp = pd.merge(
    df_prod,
    df_prod_areacodes,
    how='left',
    left_on='area_code',
    right_on='Area Code',
)

In [17]:
# preview the merged frame: original columns followed by the lookup table's columns
df_prod_temp.head()

Unnamed: 0,area_code,area,item_code,item,element_code,element,year,unit,value,flag,Area Code,M49 Code,Area
0,2,Afghanistan,221,"Almonds, in shell",5312,Area harvested,1961,ha,0.0,A,2,'004,Afghanistan
1,2,Afghanistan,221,"Almonds, in shell",5312,Area harvested,1962,ha,0.0,A,2,'004,Afghanistan
2,2,Afghanistan,221,"Almonds, in shell",5312,Area harvested,1963,ha,0.0,A,2,'004,Afghanistan
3,2,Afghanistan,221,"Almonds, in shell",5312,Area harvested,1964,ha,0.0,A,2,'004,Afghanistan
4,2,Afghanistan,221,"Almonds, in shell",5312,Area harvested,1965,ha,0.0,A,2,'004,Afghanistan


In [61]:
# list the distinct (area, Area) pairs where the main table and the lookup table disagree
df_prod_temp.loc[
    df_prod_temp['area'] != df_prod_temp['Area'],
    ['area', 'Area'],
].drop_duplicates()

Unnamed: 0,area,Area
559376,"China, Hong Kong SAR",China; Hong Kong SAR
568557,"China, Macao SAR",China; Macao SAR
573362,"China, mainland",China; mainland
606836,"China, Taiwan Province of",China; Taiwan Province of


In [62]:
# the lookup table writes ';' where the main table writes ',' — normalise the
# lookup names and repeat the comparison (an empty result means the names agree)
df_prod_temp['Area'] = df_prod_temp['Area'].str.replace(';', ',')
df_prod_temp.loc[
    df_prod_temp['area'] != df_prod_temp['Area'],
    ['area', 'Area'],
].drop_duplicates()

Unnamed: 0,area,Area


In [79]:
# 'area_code' carries no information beyond 'area' (see the comparison above), so remove it
df_prod = df_prod.drop(columns='area_code')

In [45]:
# load the ItemCodes lookup table (shipped in the same download zip as the main file)
file_name = 'Production_Crops_Livestock_E_ItemCodes.csv'
df_prod_itemcodes = pd.read_csv(
    fdf.get_file_path(file_name, source_dir),
    encoding='latin-1',  # content is not UTF-8 compatible :TODO: :FIXME: ???
)

In [46]:
# preview the item code lookup table
df_prod_itemcodes.head()

Unnamed: 0,Item Code,CPC Code,Item
0,101,'01195,Canary seed
1,1016,'02123,Goats
2,1017,'21116,Meat of goat; fresh or chilled
3,1018,'21156,Edible offal of goat; fresh; chilled or frozen
4,1019,'21515,Goat fat; unrendered


In [47]:
# sanity check: number of rows that exactly repeat an earlier row (expected: 0)
df_prod_itemcodes.duplicated().sum()

0

In [64]:
# attach the lookup columns to every production row (left join on the item code)
# so the possibly redundant item columns can be compared side by side
df_prod_temp = pd.merge(
    df_prod,
    df_prod_itemcodes,
    how='left',
    left_on='item_code',
    right_on='Item Code',
)

In [49]:
# preview the merged frame: original columns followed by the lookup table's columns
df_prod_temp.head()

Unnamed: 0,area,item_code,item,element_code,element,year,unit,value,flag,Item Code,CPC Code,Item
0,Afghanistan,221,"Almonds, in shell",5312,Area harvested,1961,ha,0.0,A,221,'01371,Almonds; in shell
1,Afghanistan,221,"Almonds, in shell",5312,Area harvested,1962,ha,0.0,A,221,'01371,Almonds; in shell
2,Afghanistan,221,"Almonds, in shell",5312,Area harvested,1963,ha,0.0,A,221,'01371,Almonds; in shell
3,Afghanistan,221,"Almonds, in shell",5312,Area harvested,1964,ha,0.0,A,221,'01371,Almonds; in shell
4,Afghanistan,221,"Almonds, in shell",5312,Area harvested,1965,ha,0.0,A,221,'01371,Almonds; in shell


In [65]:
# list the distinct (item, Item) pairs where the main table and the lookup table disagree
df_prod_temp.loc[
    df_prod_temp['item'] != df_prod_temp['Item'],
    ['item', 'Item'],
].drop_duplicates()

Unnamed: 0,item,Item
0,"Almonds, in shell",Almonds; in shell
171,"Anise, badian, coriander, cumin, caraway, fenn...",Anise; badian; coriander; cumin; caraway; fenn...
1074,"Buttermilk, dry",Buttermilk; dry
1396,"Cattle fat, unrendered",Cattle fat; unrendered
1520,"Cheese from milk of goats, fresh or processed",Cheese from milk of goats; fresh or processed
...,...,...
534752,"Cloves (whole stems), raw",Cloves (whole stems); raw
629506,"Agave fibres, raw, n.e.c.",Agave fibres; raw; n.e.c.
683516,"Abaca, manila hemp, raw",Abaca; manila hemp; raw
714451,"Snails, fresh, chilled, frozen, dried, salted ...",Snails; fresh; chilled; frozen; dried; salted ...


In [66]:
# compare divergent Item values after replacing ';' with ','
# (the lookup table writes ';' where the main table writes ',')
df_prod_temp['Item'] = df_prod_temp['Item'].str.replace(';', ',')
# The mask must compare the main table's 'item' with the normalised lookup 'Item';
# comparing 'Item' with itself would always yield an empty result.
# An empty frame here means both name columns agree after normalisation.
df_prod_temp[['item', 'Item']][df_prod_temp['item'] != df_prod_temp['Item']].drop_duplicates()

Unnamed: 0,item,Item


In [67]:
# 'item_code' contains no additional information other than 'item' and could be dropped,
# but it may still be needed for merging with other tables (TODO: decide)
# df_prod = df_prod.drop(columns = ['item_code'])

In [72]:
# distinct (element_code, element, unit) combinations, ordered by element name
(
    df_prod[['element_code', 'element', 'unit']]
    .drop_duplicates()
    .sort_values(by='element')
)

Unnamed: 0,element_code,element,unit
0,5312,Area harvested,ha
3375,5313,Laying,1000 An
8830,5318,Milk Animals,An
15841,5314,Prod Popultn,No
4367,5321,Producing Animals/Slaughtered,1000 An
1458,5320,Producing Animals/Slaughtered,An
3313,5513,Production,1000 No
109,5510,Production,t
1642,5112,Stocks,1000 An
704,5111,Stocks,An


In [80]:
# each element_code corresponds to exactly one (element, unit) pair in the
# listing above, so the code column is redundant and can be removed
df_prod = df_prod.drop(columns='element_code')

In [81]:
# preview after removing the code columns
# NOTE(review): the saved output still shows a 'flag' column although 'Flag' is
# dropped earlier in the notebook — output looks stale; re-run to refresh
df_prod.head()

Unnamed: 0,area,item_code,item,element,year,unit,value,flag
0,Afghanistan,221,"Almonds, in shell",Area harvested,1961,ha,0.0,A
1,Afghanistan,221,"Almonds, in shell",Area harvested,1962,ha,0.0,A
2,Afghanistan,221,"Almonds, in shell",Area harvested,1963,ha,0.0,A
3,Afghanistan,221,"Almonds, in shell",Area harvested,1964,ha,0.0,A
4,Afghanistan,221,"Almonds, in shell",Area harvested,1965,ha,0.0,A


In [92]:
# put the columns into their final order (only the columns listed here are kept)
df_prod = df_prod[[
    'area',
    'year',
    'item',
    'item_code',
    'element',
    'unit',
    'value',
]]

In [93]:
# preview the final column layout before writing
df_prod.head()

Unnamed: 0,area,year,item,item_code,element,unit,value
0,Afghanistan,1961,"Almonds, in shell",221,Area harvested,ha,0.0
1,Afghanistan,1962,"Almonds, in shell",221,Area harvested,ha,0.0
2,Afghanistan,1963,"Almonds, in shell",221,Area harvested,ha,0.0
3,Afghanistan,1964,"Almonds, in shell",221,Area harvested,ha,0.0
4,Afghanistan,1965,"Almonds, in shell",221,Area harvested,ha,0.0


In [94]:
# hand the cleaned frame to the project's SQL helper; 'fao_production_cl_test' is the target name
# NOTE(review): the '_test' suffix suggests a scratch target — confirm the intended table name
fs.write_dataframe(df_prod, 'fao_production_cl_test')

+ table written: fao_production_cl_test
