In [39]:
import pandas as pd

import _functions_sql as fs
import _functions_data_files as fdf

## import from CSV & general overview

In [2]:
file_name = 'Production_Crops_Livestock_E_All_Data_(Normalized).csv'
# Route 'Note' through a str converter while reading: this handles the
# DtypeWarning without 'low_memory=False', and it means missing notes are
# read as empty strings ('').
conv = {'Note': str}
df_prod = pd.read_csv(
    fdf.get_file_path(file_name),
    encoding='latin-1',  # content is not UTF-8 compatible :TODO: :FIXME: ???
    converters=conv,
)

In [3]:
# preview the first rows of the freshly imported production table
df_prod.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note
0,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1961,1961,ha,0.0,A,
1,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1962,1962,ha,0.0,A,
2,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1963,1963,ha,0.0,A,
3,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1964,1964,ha,0.0,A,
4,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1965,1965,ha,0.0,A,


In [4]:
# column names, dtypes and memory usage of the freshly imported table
df_prod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4127584 entries, 0 to 4127583
Data columns (total 14 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Area Code        int64  
 1   Area Code (M49)  object 
 2   Area             object 
 3   Item Code        int64  
 4   Item Code (CPC)  object 
 5   Item             object 
 6   Element Code     int64  
 7   Element          object 
 8   Year Code        int64  
 9   Year             int64  
 10  Unit             object 
 11  Value            float64
 12  Flag             object 
 13  Note             object 
dtypes: float64(1), int64(5), object(8)
memory usage: 440.9+ MB


In [5]:
# tally rows that repeat an earlier row in every column (True) against the
# rest (False)
df_prod.duplicated().value_counts()

False    4127584
Name: count, dtype: int64

In [6]:
# tally the distinct per-row patterns of null / non-null cells across all
# columns.
# NOTE: 'Note' is read through a str converter in the import cell, so its
# missing values are empty strings ('') and never show up as null here.
df_prod.isnull().value_counts()

Area Code  Area Code (M49)  Area   Item Code  Item Code (CPC)  Item   Element Code  Element  Year Code  Year   Unit   Value  Flag   Note 
False      False            False  False      False            False  False         False    False      False  False  False  False  False    4127584
Name: count, dtype: int64

## drop columns

### drop 'Area Code' & 'Area Code (M49)' (redundant to 'Area')

In [7]:
# Area-code lookup table, provided by the same download zip as the main file.
# Read as latin-1 because the content is not UTF-8 compatible :TODO: :FIXME: ???
df_prod_areacodes = pd.read_csv(
    fdf.get_file_path('Production_Crops_Livestock_E_AreaCodes.csv'),
    encoding='latin-1',
)

In [8]:
# preview the first rows of the area-code lookup table
df_prod_areacodes.head()

Unnamed: 0,Area Code,M49 Code,Area
0,2,'004,Afghanistan
1,5100,'002,Africa
2,3,'008,Albania
3,4,'012,Algeria
4,5200,'019,Americas


In [9]:
# verify no duplicated rows are present; called without 'subset', so only rows
# that repeat in every column are counted (a repeated 'Area' alone is not)
df_prod_areacodes.duplicated(keep='first').sum()

0

In [10]:
# give the lookup table's M49 column the same name as in the main table, so
# the merge yields matching '_x' / '_y' column pairs
df_prod_areacodes = df_prod_areacodes.rename(
    columns={'M49 Code': 'Area Code (M49)'}
)

In [11]:
# left merge on 'df_prod' to inspect potentially redundant 'Area Code' columns;
# code columns present in both tables get the suffixes '_x' (df_prod) and
# '_y' (lookup table)
df_prod_temp = df_prod.merge(df_prod_areacodes, how='left', on='Area')
# the duplicate check on the lookup table covers whole rows only, so make sure
# a repeated 'Area' key did not silently multiply rows of 'df_prod'
assert len(df_prod_temp) == len(df_prod), \
    "merge on 'Area' changed the row count: 'Area' is not unique in AreaCodes.csv"

In [12]:
# preview the merged frame: '_x' columns come from 'df_prod', '_y' columns
# from the area-code lookup table
df_prod_temp.head()

Unnamed: 0,Area Code_x,Area Code (M49)_x,Area,Item Code,Item Code (CPC),Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note,Area Code_y,Area Code (M49)_y
0,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1961,1961,ha,0.0,A,,2.0,'004
1,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1962,1962,ha,0.0,A,,2.0,'004
2,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1963,1963,ha,0.0,A,,2.0,'004
3,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1964,1964,ha,0.0,A,,2.0,'004
4,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1965,1965,ha,0.0,A,,2.0,'004


In [13]:
# number of rows whose area codes differ between the main table ('_x') and the
# lookup table ('_y'); rows without a lookup match (NaN on the '_y' side)
# count as different too
df_prod_temp.loc[
    lambda d: (d['Area Code_x'] != d['Area Code_y'])
    | (d['Area Code (M49)_x'] != d['Area Code (M49)_y'])
].shape[0]

70130

In [14]:
# same count as before, but restricted to rows for which 'AreaCodes.csv'
# actually provides an entry (non-null '_y' value)
df_prod_temp.loc[
    lambda d: (
        (d['Area Code_x'] != d['Area Code_y'])
        & d['Area Code_y'].notnull()
    ) | (
        (d['Area Code (M49)_x'] != d['Area Code (M49)_y'])
        & d['Area Code (M49)_y'].notnull()
    )
].shape[0]

0

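:TODO: :FIXME: inspect which of the 70130 divergent rows lack an entry in 'AreaCodes.csv' (all of them do, since no divergence remains once missing entries are excluded). Are only the China* areas affected?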

In [15]:
# 'Area Code' and 'Area Code (M49)' contain no additional info other than
# 'Area', so remove both columns
df_prod = df_prod.drop(['Area Code', 'Area Code (M49)'], axis='columns')

### drop 'Item Code' & 'Item Code (CPC)' (redundant to 'Item')

In [16]:
# Item-code lookup table, provided by the same download zip as the main file.
# Read as latin-1 because the content is not UTF-8 compatible :TODO: :FIXME: ???
df_prod_itemcodes = pd.read_csv(
    fdf.get_file_path('Production_Crops_Livestock_E_ItemCodes.csv'),
    encoding='latin-1',
)

In [17]:
# preview the first rows of the item-code lookup table
df_prod_itemcodes.head()

Unnamed: 0,Item Code,CPC Code,Item
0,101,'01195,Canary seed
1,1016,'02123,Goats
2,1017,'21116,Meat of goat; fresh or chilled
3,1018,'21156,Edible offal of goat; fresh; chilled or frozen
4,1019,'21515,Goat fat; unrendered


In [18]:
# verify no duplicated rows are present; called without 'subset', so only rows
# that repeat in every column are counted (a repeated 'Item' alone is not)
df_prod_itemcodes.duplicated(keep='first').sum()

0

In [19]:
# give the lookup table's CPC column the same name as in the main table, so
# the merge yields matching '_x' / '_y' column pairs
df_prod_itemcodes = df_prod_itemcodes.rename(
    columns={'CPC Code': 'Item Code (CPC)'}
)

In [20]:
# left merge on 'df_prod' to inspect potentially redundant 'Item Code' columns;
# code columns present in both tables get the suffixes '_x' (df_prod) and
# '_y' (lookup table)
df_prod_temp = df_prod.merge(df_prod_itemcodes, how='left', on='Item')
# the duplicate check on the lookup table covers whole rows only, so make sure
# a repeated 'Item' key did not silently multiply rows of 'df_prod'
assert len(df_prod_temp) == len(df_prod), \
    "merge on 'Item' changed the row count: 'Item' is not unique in ItemCodes.csv"

In [21]:
# preview the merged frame: '_x' columns come from 'df_prod', '_y' columns
# from the item-code lookup table
df_prod_temp.head()

Unnamed: 0,Area,Item Code_x,Item Code (CPC)_x,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note,Item Code_y,Item Code (CPC)_y
0,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1961,1961,ha,0.0,A,,,
1,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1962,1962,ha,0.0,A,,,
2,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1963,1963,ha,0.0,A,,,
3,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1964,1964,ha,0.0,A,,,
4,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1965,1965,ha,0.0,A,,,


In [22]:
# number of rows whose item codes differ between the main table ('_x') and the
# lookup table ('_y'); rows without a lookup match (NaN on the '_y' side)
# count as different too
df_prod_temp.loc[
    lambda d: (d['Item Code_x'] != d['Item Code_y'])
    | (d['Item Code (CPC)_x'] != d['Item Code (CPC)_y'])
].shape[0]

1880725

In [23]:
# same count as before, but restricted to rows for which 'ItemCodes.csv'
# actually provides an entry (non-null '_y' value)
df_prod_temp.loc[
    lambda d: (
        (d['Item Code_x'] != d['Item Code_y'])
        & d['Item Code_y'].notnull()
    ) | (
        (d['Item Code (CPC)_x'] != d['Item Code (CPC)_y'])
        & d['Item Code (CPC)_y'].notnull()
    )
].shape[0]

0

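:TODO: :FIXME: inspect which of the 1880725 divergent rows lack an entry in 'ItemCodes.csv' (all of them do, since no divergence remains once missing entries are excluded) and why their 'Item' names do not match.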

In [24]:
# 'Item Code' and 'Item Code (CPC)' contain no additional info other than
# 'Item', so remove both columns
df_prod = df_prod.drop(['Item Code', 'Item Code (CPC)'], axis='columns')

### check 'Element Code' (not redundant to 'Element', so it is kept)

In [25]:
# list every distinct ('Element Code', 'Element') combination
df_prod[['Element Code', 'Element']].drop_duplicates()

Unnamed: 0,Element Code,Element
0,5312,Area harvested
62,5419,Yield
109,5510,Production
704,5111,Stocks
1458,5320,Producing Animals/Slaughtered
1642,5112,Stocks
3127,5410,Yield
3189,5413,Yield
3313,5513,Production
3375,5313,Laying


In [26]:
# number of distinct element codes (a missing value would count as one)
df_prod['Element Code'].nunique(dropna=False)

18

In [27]:
# number of distinct element names (a missing value would count as one)
df_prod['Element'].nunique(dropna=False)

9

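:FIXME: :TODO: 'Element Code' cannot be dropped: there are 18 distinct element codes but only 9 distinct element names (e.g. 'Stocks' appears with codes 5111 and 5112), so the code carries information that 'Element' alone does not.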

### drop 'Year Code' column (redundant to 'Year') 

In [28]:
# number of rows in which 'Year Code' and 'Year' disagree
int((df_prod['Year Code'] != df_prod['Year']).sum())

0

In [29]:
# 'Year Code' contains the same values as 'Year', so remove it
df_prod = df_prod.drop('Year Code', axis='columns')

### drop 'Note' column (no value)

In [30]:
# list the distinct values of 'Note' ('' stands for a missing note, see the
# str converter used in the import cell)
df_prod['Note'].unique()

array(['', 'Unofficial figure'], dtype=object)

In [31]:
# 'Note' does not contain relevant info, so remove it
df_prod = df_prod.drop('Note', axis='columns')

## data wrangling

### rename columns

In [None]:
# switch the remaining columns to lower-case snake_case names
df_prod = df_prod.rename(columns={
    'Area': 'area',
    'Item': 'item',
    'Element Code': 'element_code',
    'Element': 'element',
    'Year': 'year',
    'Unit': 'unit',
    'Value': 'value',
    'Flag': 'flag',
})

### verify, column 'Unit' has comparable units of measurement

In [32]:
# list the distinct units of measurement; the column is called 'unit' here
# because the rename cell above has already lower-cased the column names
# (selecting 'Unit' raises a KeyError when the notebook is run top to bottom)
df_prod[['unit']].drop_duplicates()

Unnamed: 0,Unit
0,ha
62,100 g/ha
109,t
704,An
1642,1000 An
3127,100 mg/An
3189,No/An
3313,1000 No
3871,100 g/An
4243,0.1 g/An


:FIXME: :TODO: the following groups of units still need to be made comparable: 'An' + '1000 An'; '100 mg/An' + '100 g/An' + '0.1 g/An'; 'No' + '1000 No'

### add 'Flag' values from additional table

:FIXME: :TODO: to be done: add the 'Flag' values from the additional table.

# final output

In [34]:
# preview the first rows of the cleaned table (columns dropped and renamed)
df_prod.head()

Unnamed: 0,area,item,element_code,element,year,unit,value,flag
0,Afghanistan,"Almonds, in shell",5312,Area harvested,1961,ha,0.0,A
1,Afghanistan,"Almonds, in shell",5312,Area harvested,1962,ha,0.0,A
2,Afghanistan,"Almonds, in shell",5312,Area harvested,1963,ha,0.0,A
3,Afghanistan,"Almonds, in shell",5312,Area harvested,1964,ha,0.0,A
4,Afghanistan,"Almonds, in shell",5312,Area harvested,1965,ha,0.0,A


In [35]:
# column names, dtypes and memory usage of the cleaned table
df_prod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4127584 entries, 0 to 4127583
Data columns (total 8 columns):
 #   Column        Dtype  
---  ------        -----  
 0   area          object 
 1   item          object 
 2   element_code  int64  
 3   element       object 
 4   year          int64  
 5   unit          object 
 6   value         float64
 7   flag          object 
dtypes: float64(1), int64(2), object(5)
memory usage: 251.9+ MB


# open question: which elements are relevant?

Exploring the 'element' column, we see that not all of its values are relevant for further analysis. According to the element definitions, yield is the production per unit of harvested area.

In [36]:
# number of rows per element; the column is named 'element' (lower case)
# after the rename cell in the data-wrangling section
df_prod['element'].value_counts()

# end

In [37]:
# NOTE(review): this cell rebinds 'df_prod' to a three-row demo frame, so the
# cleaned production table built above is no longer reachable under that name
# afterwards — confirm that this is intended.

# rows of the demo table, one [name, age] pair per row
data = [['tom', 10], ['nick', 15], ['juli', 14]]

# build the frame from the rows and label its two columns
df_prod = pd.DataFrame(data=data, columns=['Name', 'Age'])

# plain-text dump of the demo frame
print(df_prod)

   Name  Age
0   tom   10
1  nick   15
2  juli   14


In [38]:

# Write records stored in a dataframe to SQL database
# NOTE(review): at this point 'df_prod' is the three-row demo frame from the
# previous cell, not the cleaned production data; 'sh_test2' is presumably the
# target table name (see the '+ table written' message) — confirm both.

fs.write_dataframe(df_prod, 'sh_test2')


+ table written: sh_test2
