In [1]:
### INITIALIZATION

import pandas as pd # Data structures
import numpy as np # Matrix algebra
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters # Plotting
register_matplotlib_converters() # Plotting
import seaborn as sns # Plotting
import gc # Garbage collection

In [2]:
### VERSIONS CONTROL

from platform import python_version
print('python version: ', python_version())
print('numpy version: ', np.__version__)
print('pandas version: ', pd.__version__)

python version:  3.7.4
numpy version:  1.17.2
pandas version:  0.25.3


In [3]:
### MAIN CONSTANTS

### MultiIndex level slice constant:
All = slice(None)
### NA for MS Excel files:
list_na_excel_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null',
                        '#N/A Requesting Data...', '#N/A Invalid Security', '#N/A Field Not Applicable', '---']
### Bilateral export commodities flow dataset path:
str_exp_comm_bilateral = 'Data_Files/Source_Files/commodities_export_bilateral.h5'
### World level export commodities flow dataset path:
str_exp_comm_world = 'Data_Files/Source_Files/commodities_export_world.h5'
### Bilateral export goods and services flow dataset path:
str_exp_unc_bilateral = 'Data_Files/Source_Files/comtrade_export_bilateral.h5'
### World level export goods and services flow dataset path:
str_exp_unc_world = 'Data_Files/Source_Files/comtrade_export_world.h5'
### Universal key:
str_export_key = 'export_augmented'
### Bilateral export commodities flow dataset path:
str_imp_comm_bilateral = 'Data_Files/Source_Files/commodities_import_bilateral.h5'
### World level export commodities flow dataset path:
str_imp_comm_world = 'Data_Files/Source_Files/commodities_import_world.h5'
### Universal key:
str_import_key = 'import_augmented'
### Industries mapping weights:
str_wiot_map = 'Data_Files/Source_Files/wiot_test.xlsx'

In [4]:
### EXAMPLE: FULL DATASET LOADING

pd.read_hdf(path_or_buf = str_exp_comm_world, key = str_export_key)

Date        Reporter  Commodity_ID  Commodity_Name
1988-12-30  AE        0102          Cattle                     NaN
                      0103          Swine                      NaN
                      0901          Coffee                     NaN
                      1001          Wheat                      NaN
                      1005          Corn                       NaN
                                                          ...     
2021-12-31  ZM        75            Nickel            6.112643e+04
                      76            Aluminum          1.509262e+06
                      78            Lead              1.012735e+07
                      79            Zinc              3.345655e+04
                      80            Tin                        NaN
Length: 75140, dtype: float64

In [5]:
### EXAMPLE: FIRST ROWS LOADING

pd.read_hdf(path_or_buf = str_exp_comm_bilateral, key = str_export_key, stop = 1000)

Date        Reporter  Partner  Commodity_ID  Commodity_Name
2017-12-29  AE        AR       0102          Cattle              2722.0
2011-12-30  AE        AR       0901          Coffee                16.0
2020-12-31  AE        AR       1001          Wheat             113434.0
2008-12-31  AE        AR       1005          Corn               38641.0
2016-12-30  AE        AR       1005          Corn              266470.0
                                                                 ...   
2017-12-29  AE        BD       80            Tin                  544.0
2018-12-31  AE        BD       80            Tin                39579.0
2019-12-31  AE        BD       80            Tin                62069.0
2020-12-31  AE        BD       80            Tin                 5963.0
2015-12-31  AE        BE       0102          Cattle               171.0
Name: Export_Augmented, Length: 1000, dtype: float64

In [6]:
### EXAMPLE: FILTERING BY REQUEST

pd.read_hdf(path_or_buf = str_exp_unc_bilateral, key = str_export_key, where = "(Reporter = 'IL') & (Partner = 'US') & (GICS_ID = '3020')")

Date        Reporter  Partner  Type   Commodity_ID  GICS_ID
1991-12-31  IL        US       Goods  01            3020         1602.794678
1992-12-31  IL        US       Goods  01            3020       117262.156250
1993-12-31  IL        US       Goods  01            3020       156527.734375
1995-12-29  IL        US       Goods  01            3020        74000.000000
1996-12-31  IL        US       Goods  01            3020        44000.000000
                                                                   ...      
2016-12-30  IL        US       Goods  24            3020         2000.000000
2018-12-31  IL        US       Goods  24            3020         9000.000000
2019-12-31  IL        US       Goods  24            3020       359000.000000
2020-12-31  IL        US       Goods  24            3020         5000.000000
2021-12-31  IL        US       Goods  24            3020         2000.000000
Name: Export_Augmented, Length: 634, dtype: float32

In [7]:
### EXAMPLE: FILTERING BY REQUEST

pd.read_hdf(path_or_buf = str_exp_unc_world, key = str_export_key, where = "(Reporter = 'US') & (Commodity_ID in ['01', '03'])")

Date        Reporter  Type   Commodity_ID  GICS_ID
1988-12-30  US        Goods  01            3020       8.244971e+07
                             03            3020       1.861761e+09
1989-12-29  US        Goods  01            3020       1.929770e+08
                             03            3020       1.785206e+09
1990-12-31  US        Goods  01            3020       2.578186e+08
                                                          ...     
2019-12-31  US        Goods  03            3020       4.631737e+09
2020-12-31  US        Goods  01            3020       9.639166e+08
                             03            3020       3.812107e+09
2021-12-31  US        Goods  01            3020       1.277562e+09
                             03            3020       4.613681e+09
Name: Export_Augmented, Length: 68, dtype: float64

In [12]:
### PARTNER INDUSTRIES MAPPING

gc.collect()
### Series container:
list_wiot = []
### WIOT Industry Export map loading:
ser_wiot_map = pd.read_excel(engine = 'openpyxl', io = str_wiot_map, sheet_name = 0, index_col = 0, header = 0, dtype = 'float16', 
                            na_values = list_na_excel_values, keep_default_na = False).stack().squeeze()
ser_wiot_map = ser_wiot_map[ser_wiot_map > 0.0]
ser_wiot_map.index.names = ['GICS_ID', 'Partner_Industry']
ser_wiot_map.name = 'Weight'
df_wiot_map = ser_wiot_map.reset_index().astype({'GICS_ID': str, 'Partner_Industry': str}).astype({'Partner_Industry': 'category'}).set_index('GICS_ID')
### Bilateral UN Comtrade dataset loading:
for ser_iter_export in pd.read_hdf(path_or_buf = str_exp_unc_bilateral, key = str_export_key, chunksize = 1000000):
    ### Map joining:
    df_iter_wiot = ser_iter_export.to_frame().join(df_wiot_map, on = 'GICS_ID', how = 'left').set_index('Partner_Industry', append = True)
    ### Industry flow calculation:
    list_wiot.append((df_iter_wiot['Export_Augmented'] * df_iter_wiot['Weight']).dropna())
#    break
### Dataset concatenation:
ser_wiot_matrix = pd.concat(list_wiot, axis = 0)

In [13]:
### TEMP

ser_wiot_matrix

Date        Reporter  Partner  Type   Commodity_ID  GICS_ID  Partner_Industry
1999-12-31  AE        AR       Goods  01            3020     3020                 1852.656006
                                                             3030                 5560.229980
                                                             3510                 9265.541992
                                                             3520                 1852.656006
2007-12-31  AE        AR       Goods  01            3020     3020                   10.897339
                                                                                     ...     
2020-12-31  ZM        ZA       Goods  97            2520     3010                10725.581055
2019-12-31  ZM        ZM       Goods  90            4520     4510                    2.145763
                                                             4520                    6.439909
                                                             4530           

In [15]:
### EXAMPLE: FULL DATASET LOADING

pd.read_hdf(path_or_buf = str_imp_comm_world, key = str_import_key)

Date        Reporter  Commodity_ID  Commodity_Name
1988-12-30  AE        0102          Cattle            2.999561e+05
                      0103          Swine                      NaN
                      0901          Coffee            9.176391e+05
                      1001          Wheat             4.145738e+07
                      1005          Corn              9.899203e+03
                                                          ...     
2021-12-31  ZM        75            Nickel            1.810830e+04
                      76            Aluminum          1.544489e+07
                      78            Lead              1.106022e+06
                      79            Zinc              1.504284e+05
                      80            Tin               1.165110e+04
Length: 75140, dtype: float64

In [14]:
### EXAMPLE: FIRST ROWS LOADING

pd.read_hdf(path_or_buf = str_imp_comm_bilateral, key = str_import_key, stop = 1000)

Date        Reporter  Partner  Commodity_ID  Commodity_Name
2001-12-31  AE        AR       0901          Coffee            1.883400e+04
2010-12-31  AE        AR       0901          Coffee            2.915000e+03
2013-12-31  AE        AR       0901          Coffee            3.259300e+04
2014-12-31  AE        AR       0901          Coffee            7.701600e+04
2015-12-31  AE        AR       0901          Coffee            9.143600e+04
                                                                   ...     
1995-12-29  AE        AU       52            Cotton            7.075176e+05
1996-12-31  AE        AU       52            Cotton            1.833074e+06
1997-12-31  AE        AU       52            Cotton            1.259820e+06
1998-12-31  AE        AU       52            Cotton            4.166808e+04
1999-12-31  AE        AU       52            Cotton            2.183026e+04
Name: Import_Augmented, Length: 1000, dtype: float64

In [16]:
### TEMP

ser_exp_world = pd.read_hdf(path_or_buf = str_exp_comm_world, key = str_export_key)
ser_imp_world = pd.read_hdf(path_or_buf = str_imp_comm_world, key = str_import_key)

In [22]:
### TEMP

pd.concat([ser_exp_world, ser_imp_world], axis = 0, keys = ['Export', 'Import'], names = ['Flow'])\
  .to_hdf('Data_Files/Source_Files/commodities_trade_world.h5', key = 'flows_augmented', mode = 'w', format = 'table', complevel = 9)