In [1]:
### WIOT MATRICES CONVERSION

In [2]:
### INITIALIZATION

import pandas as pd
import numpy as np
import os
import gc
#from pandarallel import pandarallel
import pandarallel

In [3]:
### RUN EVERY TIME: VERSION CONTROL

from platform import python_version
### Interpreter and core libraries versions are reported in one pass:
for iter_label, iter_version in [('python', python_version()), ('numpy', np.__version__), ('pandas', pd.__version__)]:
    print(iter_label + ' version: ', iter_version)

python version:  3.7.4
numpy version:  1.21.5
pandas version:  1.3.5


In [4]:
### PARAMETERS

### MultiIndex level slice constant (selects every label of a level):
All = slice(None)
### Strings to be treated as missing values when source tables are parsed (passed as na_values to the WIOT csv reader):
list_na_excel_values = ['', '---', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null',
                        '#N/A Requesting Data...', '#N/A Invalid Security', '#N/A Field Not Applicable']
### Product / Industry mapping workbook path and its sheets (connection matrix, goods mapping, services mapping):
str_path_matrix_map = 'Data_Files/Source_Files/WIOT_mapping_detailed.xlsx'
str_sheet_matrix = 'Matrix to Load'
str_sheet_unc_g_map = 'HS'
str_sheet_unc_s_map = 'EBOPS 2010'
### Path to the folder with original WIOT Tables (one csv file per year):
str_path_wiot_source = 'Data_Files/Source_Files/WIOT'
### Augmented bilateral export (HDF5 file and the key inside it):
str_path_export_bilateral = 'Data_Files/Source_Files/comtrade_export_bilateral.h5'
str_key_unc_export = 'export_augmented'
### Result HDF5 files: WIOT volumes, shares, gap-filled shares and the distribution weights on Sub Industry / Industry level:
str_path_wiot_volumes_hdf = 'Data_Files/Result_Files/wiot_volumes.h5'
str_path_wiot_shares_hdf = 'Data_Files/Result_Files/wiot_shares.h5'
str_path_wiot_filled_hdf = 'Data_Files/Result_Files/wiot_filled.h5'
str_path_unc_sub_weights_hdf = 'Data_Files/Result_Files/unc_sub_weights.h5'
str_path_unc_ind_weights_hdf = 'Data_Files/Result_Files/unc_ind_weights.h5'
str_path_unc_sub_weights_agg_hdf = 'Data_Files/Result_Files/unc_sub_weights_agg.h5'
str_path_unc_ind_weights_agg_hdf = 'Data_Files/Result_Files/unc_ind_weights_agg.h5'
str_path_unc_sub_weights_full_hdf = 'Data_Files/Result_Files/unc_sub_weights_full.h5'
str_path_unc_ind_weights_full_hdf = 'Data_Files/Result_Files/unc_ind_weights_full.h5'
### HDF5 key shared by the result files written in this notebook:
str_gics_key = 'gics_io'

In [5]:
### COMMODITIES MAPPING TO GICS INDUSTRY GROUPS

def load_commodity_to_group_mapper(str_sheet_name):
    """Reads one classification sheet of the mapping workbook and returns the Commodity_ID -> GICS_Group series without empty groups."""
    df_sheet = pd.read_excel(io = str_path_matrix_map, sheet_name = str_sheet_name, engine = 'openpyxl', dtype = str, header = [0], index_col = None)
    ser_mapper = df_sheet.set_index('Commodity_ID')['GICS_Group'].dropna()
    ser_mapper.name = 'Commodity_Group_Code'
    return ser_mapper

### Goods:
ser_goods_to_groups = load_commodity_to_group_mapper(str_sheet_unc_g_map)
### Services (rows marked with the '---' placeholder group are dropped):
ser_services_to_groups = load_commodity_to_group_mapper(str_sheet_unc_s_map)
ser_services_to_groups = ser_services_to_groups[ser_services_to_groups != '---']
### Mappers concatenation:
ser_comm_to_groups = pd.concat([ser_goods_to_groups, ser_services_to_groups])

In [6]:
### UN COMTRADE TO GICS WIOT BASED MATRIX LOADING

### Source table loading (six header rows and four index columns become MultiIndex levels):
df_unc_to_gics = pd.read_excel(io = str_path_matrix_map, sheet_name = str_sheet_matrix, engine = 'openpyxl', dtype = str,
                               header = list(range(6)), index_col = list(range(4)))
df_unc_to_gics.index.names = ['WIOT_Exporter_Code', 'WIOT_Description', 'Commodity_ID', 'Commodity_Description']
df_unc_to_gics.columns.names = ['WIOT_Importer_Code', 'WIOT_Description', 'GICS_Sub_Code', 'GICS_Industry_Code', 'GICS_Group_Code', 'GICS_Sub_Name']
### Index levels checker (commodity codes are forced to strings):
df_unc_to_gics.index = df_unc_to_gics.index.set_levels(df_unc_to_gics.index.levels[2].astype('str'), level = 'Commodity_ID')
### Matrix filtering: descriptive levels are dropped first, then rows / columns marked with the '---' placeholder:
df_unc_to_gics = df_unc_to_gics.droplevel(['WIOT_Description', 'Commodity_Description'])
df_unc_to_gics = df_unc_to_gics.droplevel(['WIOT_Description', 'GICS_Sub_Name'], axis = 1)
df_unc_to_gics = df_unc_to_gics.drop(index = '---', level = 'Commodity_ID')
df_unc_to_gics = df_unc_to_gics.drop(columns = '---', level = 'GICS_Group_Code')
### Matrix conversion (letter marks to boolean connection flags):
df_unc_to_gics = df_unc_to_gics.replace({'x': False, 'y': True, 'z': True})
gc.collect()
ser_unc_to_gics = df_unc_to_gics.stack(df_unc_to_gics.columns.names).astype(bool)
ser_unc_to_gics.name = 'Connection_Flag'
### Adding Comtrade to GICS mapping (commodity group becomes the last index level):
df_unc_flags = ser_unc_to_gics.to_frame().join(ser_comm_to_groups, how = 'inner')
ser_unc_to_gics = df_unc_flags.set_index('Commodity_Group_Code', append = True).squeeze()
del df_unc_flags
### Index Levels categorizing:
ser_unc_to_gics = ser_unc_to_gics.reorder_levels([0, 2, 1, 6, 3, 4, 5]).sort_index()
ser_unc_to_gics.index = ser_unc_to_gics.index.set_levels(ser_unc_to_gics.index.levels[4].astype(str), level = 'GICS_Sub_Code')
list_categorized_levels = ['WIOT_Exporter_Code', 'WIOT_Importer_Code', 'Commodity_ID', 'Commodity_Group_Code',
                           'GICS_Sub_Code', 'GICS_Industry_Code', 'GICS_Group_Code']
for iter_position, iter_level in enumerate(list_categorized_levels):
    ser_unc_to_gics.index = ser_unc_to_gics.index.set_levels(ser_unc_to_gics.index.levels[iter_position].astype('category'), level = iter_level)
gc.collect()

  warn(msg)


18

In [7]:
### DEFINING COUNTRY CODES EXTRACTOR

def get_country_codes(use_local_copy = False):
    """
    Loads the table of country ISO codes.

    Parameters:
        use_local_copy: when True the saved html page is parsed instead of the online one (for the case the URL is unavailable).

    Returns:
        Frame indexed by upper-cased country name (sorted before upper-casing) with 'ISO SHORT' and 'ISO LONG' columns.
    """
    ### Source choosing: local html copy or online page:
    url_country_code = 'Data_Files/Source_Files/countrycode.html' if use_local_copy else 'https://countrycode.org/'
    df_full_codes = pd.read_html(url_country_code, index_col = 'COUNTRY')[0]
    ### 'ISO CODES' cell holds both codes separated by ' / ':
    df_result = df_full_codes['ISO CODES'].str.split(' / ', expand = True)
    df_result.columns = ['ISO SHORT', 'ISO LONG']
    df_result = df_result.sort_index()
    df_result.index = df_result.index.str.upper()
    ### Results output:
    return df_result

### World Country Codes:
df_country_codes = get_country_codes()
### ISO LONG -> ISO SHORT dictionary, extended with the 'ROW' -> 'YY' pair:
dict_ison_mapper = df_country_codes.set_index('ISO LONG')['ISO SHORT'].to_dict()
dict_ison_mapper['ROW'] = 'YY'

In [None]:
### WIOT DATA TABLES LOADING & CONVERSION & SAVING

### Lists of WIOT Activities:
list_exporter_codes = ['r' + str(iter_num) for iter_num in range(1, 55)]
list_importer_codes = ['c' + str(iter_num) for iter_num in range(1, 55)]
#list_importer_codes.remove('c51') ### Remove of "Public administration and defence; compulsory social security"
### Country codes allowed in the matrix (known ISO codes plus the 'ROW' aggregate); built once for all years:
list_wiot_countries = df_country_codes['ISO LONG'].to_list() + ['ROW']

### Deleting old data:
if (os.path.exists(str_path_wiot_volumes_hdf)):
    os.remove(str_path_wiot_volumes_hdf)
### os.listdir() returns entries in arbitrary order, so names are sorted explicitly to guarantee "latest file first":
for year_matrix_csv in sorted(os.listdir(str_path_wiot_source), reverse = True):
    gc.collect()
    ### Characters 4-7 of the file name hold the year:
    str_year_num = year_matrix_csv[4 : 8]
    dt_year_end = (pd.to_datetime(str_year_num) + pd.offsets.BYearEnd())#.date()
    print(str_year_num, ': IO Matrix of WIOT Activities Loading Started')
    df_year_raw = pd.read_csv(str_path_wiot_source + '/' + year_matrix_csv, sep = ';', index_col = [0, 1, 2, 3], skiprows = [0, 1, 2, 3], header = [0, 1],
                              na_values = list_na_excel_values, keep_default_na = False)
    ### Wide matrix to long series: (supplier country, supplier activity, user country, user activity) -> value
    ser_year_raw = df_year_raw.droplevel([0, 1]).stack([0, 1]).astype('int32')
    ser_year_raw.name = 'Value'
    ser_year_raw.index.names = ['Supplier_Country_Long', 'WIOT_Exporter_Code', 'User_Country_Long', 'WIOT_Importer_Code']
    df_year_typed = ser_year_raw.reset_index()
    del df_year_raw
    del ser_year_raw
    gc.collect()
    ### Keeping known countries and activities only; explicit copy makes the column assignments below safe (no SettingWithCopyWarning):
    df_year_typed = df_year_typed[df_year_typed['Supplier_Country_Long'].isin(list_wiot_countries) &
                                  df_year_typed['User_Country_Long'].isin(list_wiot_countries) &
                                  df_year_typed['WIOT_Exporter_Code'].isin(list_exporter_codes) &
                                  df_year_typed['WIOT_Importer_Code'].isin(list_importer_codes)].copy()
    ### Every remaining country code is a key of the mapper (guaranteed by the filter above), so a plain dictionary lookup is sufficient:
    df_year_typed['Exporter'] = df_year_typed['Supplier_Country_Long'].map(dict_ison_mapper).astype('category')
    df_year_typed['Importer'] = df_year_typed['User_Country_Long'].map(dict_ison_mapper).astype('category')
    ### Activity codes as ordered categories (r1 < r2 < ... instead of the lexical order):
    df_year_typed['WIOT_Exporter_Code'] = df_year_typed['WIOT_Exporter_Code'].astype('category')
    df_year_typed['WIOT_Exporter_Code'] = df_year_typed['WIOT_Exporter_Code'].cat.set_categories(new_categories = list_exporter_codes, ordered = True)
    df_year_typed['WIOT_Importer_Code'] = df_year_typed['WIOT_Importer_Code'].astype('category')
    df_year_typed['WIOT_Importer_Code'] = df_year_typed['WIOT_Importer_Code'].cat.set_categories(new_categories = list_importer_codes, ordered = True)
    ser_year_wiot = df_year_typed.set_index(['Exporter', 'Importer', 'WIOT_Exporter_Code', 'WIOT_Importer_Code'])['Value'].squeeze().sort_index()
    ### Negative volumes are floored at zero:
    ser_year_wiot = ser_year_wiot.clip(lower = 0)
    pd.concat([ser_year_wiot], keys = [dt_year_end], names = ['Date'])\
                                                    .to_hdf(str_path_wiot_volumes_hdf, key = str_gics_key, mode = 'a', format = 'table', complevel = 9, append = True)
    print(str_year_num, ': IO Matrix of WIOT Activities Saved')
    ### Only the first (latest) file is stored: the next cell drops the 'Date' level and so presumably expects a single year - TODO confirm
    break

In [None]:
### WIOT LAST DATE MATRIX CONVERTING TO SHARES & AGGREGATED DATA ADDING

def get_group_shares(ser_values, list_group_levels):
    """Divides every value by the total of its group (groups are defined by the listed index levels) and returns sorted float32 shares."""
    return ser_values.groupby(list_group_levels, group_keys = False, observed = True)\
                     .apply(lambda ser_group: ser_group / ser_group.sum()).astype('float32').sort_index()

gc.collect()
ser_last_wiot = pd.read_hdf(str_path_wiot_volumes_hdf, key = str_gics_key).droplevel('Date')

### Adding World as Exporter & Importer
df_last_wiot = ser_last_wiot.reset_index()
for iter_side in ['Exporter', 'Importer']:
    df_last_wiot[iter_side] = df_last_wiot[iter_side].cat.add_categories(['WW'])
list_flow_levels = ['Exporter', 'Importer', 'WIOT_Exporter_Code', 'WIOT_Importer_Code']
### Inner flows filtering (country supplies itself):
df_inner_flows = df_last_wiot[(df_last_wiot['Exporter'] == df_last_wiot['Importer'])]
### Outer flows filtering (cross-border supplies):
df_outer_flows = df_last_wiot[(df_last_wiot['Exporter'] != df_last_wiot['Importer'])]
### Inner values to series:
ser_country_inner_values = df_inner_flows.set_index(list_flow_levels).astype('float32').squeeze()
### Outer values to series:
ser_country_to_country_values = df_outer_flows.set_index(list_flow_levels).astype('float32').squeeze()
### Country to World trade:
ser_country_to_world_values = df_outer_flows.groupby(['Exporter', 'WIOT_Exporter_Code', 'WIOT_Importer_Code'], observed = True)['Value'].sum()\
                                            .squeeze().sort_index()
### World to World trade:
ser_world_to_world_values = ser_country_to_world_values.groupby(['WIOT_Exporter_Code', 'WIOT_Importer_Code'], observed = True).sum()\
                                                       .astype('float32').sort_index()
### World to Country trade:
ser_world_to_country_values = df_outer_flows.groupby(['Importer', 'WIOT_Exporter_Code', 'WIOT_Importer_Code'], observed = True)['Value'].sum()\
                                            .squeeze().sort_index()
### Inner shares (distribution of each exporter activity among importer activities):
ser_country_inner_shares = get_group_shares(ser_country_inner_values, ['Exporter', 'Importer', 'WIOT_Exporter_Code'])
### Outer shares:
ser_country_to_country_shares = get_group_shares(ser_country_to_country_values, ['Exporter', 'Importer', 'WIOT_Exporter_Code'])
### Country to World shares, labelled with the 'WW' Importer:
ser_country_to_world_shares = get_group_shares(ser_country_to_world_values, ['Exporter', 'WIOT_Exporter_Code'])
ser_country_to_world_shares = pd.concat({'WW': ser_country_to_world_shares}, names = ['Importer']).swaplevel(0, 1).sort_index()
### World to Country shares, labelled with the 'WW' Exporter:
ser_world_to_country_shares = get_group_shares(ser_world_to_country_values, ['Importer', 'WIOT_Exporter_Code'])
ser_world_to_country_shares = pd.concat({'WW': ser_world_to_country_shares}, names = ['Exporter'])
### World to World shares, labelled with 'WW' on both sides:
ser_world_to_world_shares = get_group_shares(ser_world_to_world_values, ['WIOT_Exporter_Code'])
ser_world_to_world_shares = pd.concat({'WW': pd.concat({'WW': ser_world_to_world_shares}, names = ['Importer'])}, names = ['Exporter'])
### Values aggregating:
list_share_parts = [ser_country_inner_shares, ser_country_to_country_shares, ser_country_to_world_shares, ser_world_to_country_shares,
                    ser_world_to_world_shares]
ser_last_shares = pd.concat(list_share_parts).sort_index()
del list_share_parts
del ser_country_to_country_shares
gc.collect()
ser_last_shares.name = 'Share'
### Results saving:
ser_last_shares.to_hdf(str_path_wiot_shares_hdf, key = str_gics_key, mode = 'w', format = 'table', complevel = 9)

In [30]:
### FILLING GAPS IN SHARES

gc.collect()

ser_last_shares = pd.read_hdf(str_path_wiot_shares_hdf, key = str_gics_key).fillna(0.0)

### Counters of the data sources used, ordered from the most to the least specific one:
dict_counter = dict.fromkeys(['country_to_country', 'country_to_world', 'world_to_country', 'world_to_world', 'empty'], 0)

for iter_exporter in ser_last_shares.index.levels[0]:
    for iter_importer in ser_last_shares.index.levels[1]:
        for iter_industry in ser_last_shares.index.levels[2]:
            ### All candidate distributions are taken before any assignment:
            ser_country_to_country = ser_last_shares.loc[iter_exporter, iter_importer, iter_industry]
            ser_country_to_world = ser_last_shares.loc[iter_exporter, 'WW', iter_industry]
            ser_world_to_country = ser_last_shares.loc['WW', iter_importer, iter_industry]
            ser_world_to_world = ser_last_shares.loc['WW', 'WW', iter_industry]
            ### Own data exists: nothing to fill:
            if (ser_country_to_country.sum() != 0.0):
                dict_counter['country_to_country'] += 1
                continue
            ### The first non-empty donor (in order of decreasing specificity) replaces the empty distribution:
            list_donors = [('country_to_world', ser_country_to_world),
                           ('world_to_country', ser_world_to_country),
                           ('world_to_world', ser_world_to_world)]
            for iter_source, ser_donor in list_donors:
                if (ser_donor.sum() != 0.0):
                    ser_last_shares.loc[iter_exporter, iter_importer, iter_industry] = ser_donor.values
                    dict_counter[iter_source] += 1
                    break
            else:
                ### No donor has data:
                dict_counter['empty'] += 1
print(dict_counter)
### Results saving:
ser_last_shares.to_hdf(str_path_wiot_filled_hdf, key = str_gics_key, mode = 'w', format = 'table', complevel = 9)

{'country_to_country': 52892, 'country_to_world': 48181, 'world_to_country': 7807, 'world_to_world': 470, 'empty': 0}


In [13]:
### TEST

### Number of (Exporter, Importer, WIOT Exporter Code) groups does not depend on the option, so it is calculated once:
int_groups_number = len(ser_last_shares.groupby(['Exporter', 'Importer', 'WIOT_Exporter_Code']).count())
for iter_option, iter_filled in dict_counter.items():
    print(iter_option, ':', round(iter_filled / int_groups_number, 4))

country_to_country : 0.4837
country_to_world : 0.4406
world_to_country : 0.0714
world_to_world : 0.0043
empty : 0.0


In [7]:
### DEFINING COMMODITY TO SUB INDUSTRY DISTRIBUTION CALCULATOR

def get_wiot_weights_parallel(ser_wiot_shares, ser_unc_to_gics, ser_iter_comtrade):
    """
    Distributes commodity export of one (Exporter, Importer, WIOT Exporter Code) group among importer GICS Sub Industries.

    Parameters:
        ser_wiot_shares: single-column frame (or series) of importer activity shares for one group,
            indexed by Exporter / Importer / WIOT_Exporter_Code / WIOT_Importer_Code.
        ser_unc_to_gics: boolean 'Connection_Flag' series indexed by
            WIOT_Exporter_Code / WIOT_Importer_Code / Commodity_ID / Commodity_Group_Code / GICS_Sub_Code.
        ser_iter_comtrade: export volumes indexed by Exporter / Importer / Commodity_ID.

    Returns:
        Series of positive weights (cast to float16 before the final summation) indexed by
        Commodity_ID / Commodity_Group_Code / GICS_Sub_Code, or None when the trade lookup raises KeyError
        or no positive weight is left.
    """
    ### Extracting Groupby Keys (rename returns a new object, so the caller's data keeps its own name):
    ser_wiot_shares = ser_wiot_shares.squeeze().rename('Activity_Share')
    (str_iter_exporter, str_iter_importer, str_iter_code) = ser_wiot_shares.index[0][: -1]
    ### Sparse progress message: printed once per exporter only:
    if ((str_iter_importer == 'AT') and (str_iter_code == 'r1')):
        print(str_iter_exporter, 'as Exporter')
    ### Selecting Product -> Importer GICS Sub Industry Flags:
    ser_wiot_flags = ser_unc_to_gics[str_iter_code]
    ### Commodities of the exporter activity are read from the 'c2' importer slice
    ### NOTE(review): presumably every importer code lists the same commodities for an exporter activity - confirm
    idx_comm_id = ser_wiot_flags['c2'].index.get_level_values('Commodity_ID')
    ### Calculating of Relation Product Export Shares Inside Exporter WIOT Activity:
    try:
        ser_unc_volumes = ser_iter_comtrade.loc[[str_iter_exporter], [str_iter_importer], idx_comm_id]
    except KeyError:
        print('No trade:', str_iter_exporter, str_iter_importer, str_iter_code)
        return None
    ser_unc_shares = (ser_unc_volumes / ser_unc_volumes.sum()).droplevel(['Exporter', 'Importer'])
    ser_unc_shares.name = 'Export_Share'
    ### Concatenating Importer WIOT Activity Shares & Product -> Importer GICS Sub Industry Flags & Product Export Shares Inside Exporter WIOT Activity:
    df_pair_supply = ser_wiot_shares.droplevel(['Exporter', 'Importer', 'WIOT_Exporter_Code']).to_frame().join(ser_wiot_flags, how = 'left')\
                                                                                                        .join(ser_unc_shares, how = 'left').sort_index()
    ### First Step Weights Distribution: Product Share, Weighted by GICS Sub Industry Share (Activity Share equal part):
    df_pair_supply['Weight_First'] = df_pair_supply.groupby(['WIOT_Importer_Code', 'Commodity_ID'], observed = True, group_keys = False)\
                                    .apply(lambda df_i: df_i['Export_Share'] * df_i['Activity_Share'] * df_i['Connection_Flag'] / df_i['Connection_Flag'].sum())
    ### Secondary Weights Distribution: Normalization Inside Export WIOT Activity * Import WIOT Activity Matrix
    ### (transform keeps the original index regardless of the group_keys default, so the product below stays aligned):
    ser_weight_inside = df_pair_supply['Weight_First'].groupby('WIOT_Importer_Code', observed = True).transform(lambda ser_i: ser_i / ser_i.sum())
    df_pair_supply['Weight_Second'] = ser_weight_inside * df_pair_supply['Activity_Share']
    ### Secondary Weights Distribution: Normalization inside Commodity:
    df_pair_supply['Weight_Second'] = df_pair_supply['Weight_Second'].groupby('Commodity_ID', observed = True).transform(lambda ser_i: ser_i / ser_i.sum())
    ### Results Output: positive weights only, summed for each commodity & sub industry pair:
    ser_result = df_pair_supply.loc[df_pair_supply['Weight_Second'] > 0.0, 'Weight_Second'].astype('float16')
    ser_result = ser_result.groupby(['Commodity_ID', 'Commodity_Group_Code', 'GICS_Sub_Code'], observed = True).sum()
    return ser_result if (len(ser_result) > 0) else None

In [8]:
### DISTRIBUTION CONSTANTS

### Date Range defining (business year ends):
str_year_start = '1989' # '1989'
str_year_end = '2023'
list_dates = list(pd.date_range(start = str_year_start, end = str_year_end, freq = 'BY'))
### WIOT Matrix Shares Loading:
ser_last_shares = pd.read_hdf(str_path_wiot_filled_hdf, key = str_gics_key)
### Plain country lists: the 'YY' and 'WW' aggregate codes are excluded from both sides:
list_exporters = ser_last_shares.index.levels[0].to_list()
list_importers = ser_last_shares.index.levels[1].to_list()
for list_iter_countries in (list_exporters, list_importers):
    for iter_aggregate in ('YY', 'WW'):
        list_iter_countries.remove(iter_aggregate)

In [9]:
### UN COMTRADE & WIOT AGGREGATION : ALL POSSIBLE COMBINATIONS

def get_commodity_totals(ser_flows, str_exporter, str_importer):
    """Sums export flows by commodity and labels the totals with the given aggregate Exporter / Importer pair."""
    return pd.concat([ser_flows.groupby('Commodity_ID').sum()], keys = [(str_exporter, str_importer)], names = ['Exporter', 'Importer'])

### Deleting old data:
if (os.path.exists(str_path_unc_sub_weights_full_hdf)):
    os.remove(str_path_unc_sub_weights_full_hdf)
### Parallel engine initialization (once for all years). initialize() is a method of the pandarallel.pandarallel class, while
### "import pandarallel" binds the package itself, so the class is resolved from whichever object the name is bound to:
obj_pandarallel = getattr(pandarallel, 'pandarallel', pandarallel)
obj_pandarallel.initialize(progress_bar = True)
### Loop-invariant inputs:
ser_unc_to_sub = ser_unc_to_gics.droplevel(['GICS_Industry_Code', 'GICS_Group_Code'])
list_share_exporters = ser_last_shares.index.levels[0].to_list()
list_share_importers = ser_last_shares.index.levels[1].to_list()
### Looping over Comtrade History Years:
#for iter_date in [list_dates[-2]]:
#for iter_date in list_dates[:-19][::-1]:
for iter_date in list_dates[::-1]:
    gc.collect()
    print(iter_date.date())
    ### UN Comtrade Bilateral Export Flows Extraction (iter_date is resolved by the where expression from the enclosing scope):
    ser_iter_comtrade = pd.read_hdf(str_path_export_bilateral, key = str_key_unc_export, where = "Date in [iter_date]").droplevel(['Date', 'Type']).sort_index()
    ser_iter_comtrade.index.names = ['Exporter', 'Importer', 'Commodity_ID']
    ### Flags of flows, whose exporter / importer is not a separate country of the WIOT matrix:
    arr_exporter_outside = ~ser_iter_comtrade.index.get_level_values('Exporter').isin(list_exporters)
    arr_importer_outside = ~ser_iter_comtrade.index.get_level_values('Importer').isin(list_importers)
    ### Filtering Country to Country Export Volumes:
    ser_iter_unc_bilateral = ser_iter_comtrade.loc[list_exporters, list_importers]
    ### Calculation of Country to World Export Volumes:
    ser_iter_unc_to_ww = ser_iter_comtrade.groupby(['Exporter', 'Commodity_ID']).sum()
    ### Calculation of World to Country Export Volumes:
    ser_iter_unc_from_ww = ser_iter_comtrade.groupby(['Importer', 'Commodity_ID']).sum()
    ### Calculation of Country to Rest of the World Export Volumes. A pair that has no flow to WIOT importers is missing in the
    ### subtrahend, so it is treated as zero: plain "-" would turn the whole world total into NaN instead of keeping it:
    ser_iter_unc_to_wiot = ser_iter_comtrade.loc[:, list_importers, :].groupby(['Exporter', 'Commodity_ID']).sum()
    ser_iter_unc_to_yy = ser_iter_unc_to_ww.sub(ser_iter_unc_to_wiot, fill_value = 0)
    ### Country to Totals datasets aggregation:
    ser_iter_unc_to_add = pd.concat([ser_iter_unc_to_ww, ser_iter_unc_to_yy], keys = ['WW', 'YY'], names = ['Importer'])
    ser_iter_unc_to_add = ser_iter_unc_to_add.reorder_levels([1, 0, 2]).sort_index()
    ### Calculation of Rest of the World to Country Export Volumes (the same zero filling for missing pairs):
    ser_iter_unc_from_wiot = ser_iter_comtrade.loc[list_exporters, :, :].groupby(['Importer', 'Commodity_ID']).sum()
    ser_iter_unc_from_yy = ser_iter_unc_from_ww.sub(ser_iter_unc_from_wiot, fill_value = 0)
    ### Totals to Country datasets aggregation:
    ser_iter_unc_from_add = pd.concat([ser_iter_unc_from_ww, ser_iter_unc_from_yy], keys = ['WW', 'YY'], names = ['Exporter']).sort_index()
    ### Calculation of World to World Export Volumes:
    ser_iter_ww_to_ww = get_commodity_totals(ser_iter_comtrade, 'WW', 'WW')
    ### Calculation of Rest of the World to Rest of the World Export Volumes:
    ser_iter_yy_to_yy = get_commodity_totals(ser_iter_comtrade.loc[arr_exporter_outside & arr_importer_outside], 'YY', 'YY')
    ### Calculation of Rest of the World to World Export Volumes:
    ser_iter_yy_to_ww = get_commodity_totals(ser_iter_comtrade.loc[arr_exporter_outside], 'YY', 'WW')
    ### Calculation of World to Rest of the World Export Volumes:
    ser_iter_ww_to_yy = get_commodity_totals(ser_iter_comtrade.loc[arr_importer_outside], 'WW', 'YY')
    ### Aggregating all needed export flows datasets:
    ser_iter_unc_agg = pd.concat([ser_iter_unc_bilateral, ser_iter_unc_to_add, ser_iter_unc_from_add,
                                  ser_iter_yy_to_yy, ser_iter_ww_to_ww, ser_iter_yy_to_ww, ser_iter_ww_to_yy])
    ### Keeping only the exporters / importers that are present in the WIOT shares:
    ser_iter_unc_agg = ser_iter_unc_agg.loc[list_share_exporters, list_share_importers].sort_index()
    ### Commodity Distribution Calculation (one task for each Exporter / Importer / WIOT Exporter Code group):
    ser_wiot_weights = ser_last_shares.to_frame().groupby(['Exporter', 'Importer', 'WIOT_Exporter_Code'], observed = True)\
                            .parallel_apply(get_wiot_weights_parallel, ser_unc_to_sub, ser_iter_unc_agg)\
                            .dropna().astype('float16')
    ### Saving Results to File:
    pd.concat([ser_wiot_weights], keys = [iter_date], names = ['Date'])\
                            .to_hdf(path_or_buf = str_path_unc_sub_weights_full_hdf, key = str_gics_key, mode = 'a', format = 'table', complevel = 9, append = True)
    ### NOTE(review): only the latest date is processed; looks like a leftover of a test run - remove the break to build the full history
    break

2022-12-30
INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=27338), Label(value='0 / 27338')))…

In [21]:
### TEMP

### Collecting World to World weights for the first and the last date of the range:
list_ww_ww = []
for iter_date in (list_dates[0], list_dates[-1]):
    print(iter_date)
    ser_test_agg = pd.read_hdf(path_or_buf = str_path_unc_sub_weights_agg_hdf, where = "Date in [iter_date]").sort_index()
    ser_test_ww = ser_test_agg.loc[:, 'WW', 'WW']
    print(len(ser_test_ww))
    list_ww_ww.append(ser_test_ww)

1989-12-29 00:00:00
13269
2022-12-30 00:00:00
13269


In [26]:
### TEMP

### Total absolute difference between the two collected World to World slices (second minus first, aligned after dropping 'Date'):
(list_ww_ww[1].droplevel('Date') - list_ww_ww[0].droplevel('Date')).abs().sum()

0.0

In [10]:
### TEMP

### Loading the new "full" Sub Industry weights and the previously saved bilateral + aggregated weights for comparison.
### NOTE(review): the where filters rely on iter_date left over from an earlier loop cell - confirm it holds the intended date
gc.collect()
ser_test_full = pd.read_hdf(path_or_buf = str_path_unc_sub_weights_full_hdf).sort_index()
ser_test_full.name = 'Share'
ser_test_agg = pd.read_hdf(path_or_buf = str_path_unc_sub_weights_agg_hdf, where = "Date in [iter_date]").sort_index()
#ser_test_agg = ser_test_agg.loc[:, ['AT', 'BE', 'YY', 'WW'], ['AT', 'BE', 'YY', 'WW'], :]
ser_test_agg.name = 'Share'
ser_test_bil = pd.read_hdf(path_or_buf = str_path_unc_sub_weights_hdf, where = "Date in [iter_date]").sort_index()
#ser_test_bil = ser_test_bil.loc[:, ['AT', 'BE', 'YY', 'WW'], ['AT', 'BE', 'YY', 'WW'], :]
ser_test_bil.name = 'Share'
### Old results are stacked into one series to be comparable with the full one:
ser_test_old = pd.concat([ser_test_bil, ser_test_agg], axis = 0).sort_index()

In [11]:
### TEMP

### Checking that the full Sub Industry weights are identical to the stacked old ones:
ser_test_full.equals(ser_test_old)

True

In [10]:
### WIOT WEIGHTS TO GICS INDUSTRIES DISTRIBUTION

### Deleting old data (the loop below appends to the file, so it has to start from scratch):
if (os.path.exists(str_path_unc_ind_weights_full_hdf)):
    os.remove(str_path_unc_ind_weights_full_hdf)
### Defining renormalizing distribution to 1 for duplicated commodities (different economic activities):
def renormalize_duplicated_commodities(ser_comm):
    """Returns the weights scaled to sum up to 1; weights that already sum up to exactly 1 are returned as is."""
    num_total = ser_comm.sum()
    return ser_comm if (num_total == 1.0) else ser_comm / num_total
### Looping over years (latest first):
#for iter_date in [list_dates[-2]]:
for iter_date in reversed(list_dates):
    print(iter_date, ': Aggregating Export Distribution Weights by Importer Industry started')
    gc.collect()
    ### WIOT Weights Loading (iter_date is resolved by the where expression from the enclosing scope):
    ser_wiot_weights = pd.read_hdf(path_or_buf = str_path_unc_sub_weights_full_hdf, key = str_gics_key, where = "(Date in [iter_date])")
    ser_wiot_weights = ser_wiot_weights.droplevel('Date')
    ### All index levels except the exporter activity are kept:
    list_levels = list(ser_wiot_weights.index.names)
    list_levels.remove('WIOT_Exporter_Code')
    ### Aggregating data for each commodity (averaging over exporter activities):
    ser_comm_weights = ser_wiot_weights.groupby(list_levels, observed = True).mean().astype('float32')
    del ser_wiot_weights
    gc.collect()
    ### Aggregating data for each commodity that is represented in several economic activities:
    list_commodity_levels = ['Exporter', 'Importer', 'Commodity_ID', 'Commodity_Group_Code']
    ser_norm_weights = ser_comm_weights.groupby(list_commodity_levels, observed = True).transform(renormalize_duplicated_commodities)
    ser_norm_weights.name = 'Share'
    ### Aggregating from Sub Industry to Industry (Industry code is the first six characters of the Sub Industry code):
    df_ind_weights = ser_norm_weights.reset_index('GICS_Sub_Code')
    df_ind_weights = df_ind_weights.assign(GICS_Industry_Code = df_ind_weights['GICS_Sub_Code'].str[:6])
    ser_ind_weights = df_ind_weights.set_index('GICS_Industry_Code', append = True)['Share'].sort_index()
    ser_ind_weights = ser_ind_weights.groupby(ser_ind_weights.index.names, observed = True).sum()
    ### Saving Industry weights:
    ser_ind_dated = pd.concat([ser_ind_weights], keys = [iter_date], names = ['Date'])
    ser_ind_dated.to_hdf(path_or_buf = str_path_unc_ind_weights_full_hdf, key = str_gics_key, mode = 'a', format = 'table', complevel = 9, append = True)
    print(iter_date, ': Aggregating Export Distribution Weights by Importer Industry finished')
    ### NOTE(review): only the latest date is processed; looks like a leftover of a test run - confirm before building the full history
    break

2022-12-30 00:00:00 : Aggregating Export Distribution Weights by Importer Industry started
2022-12-30 00:00:00 : Aggregating Export Distribution Weights by Importer Industry finished


In [11]:
### TEMP

### Loading the new "full" Industry weights and the previously saved bilateral + aggregated weights for comparison.
### NOTE(review): the where filters rely on iter_date left over from an earlier loop cell - confirm it holds the intended date
gc.collect()
ser_test_full = pd.read_hdf(path_or_buf = str_path_unc_ind_weights_full_hdf).sort_index()
ser_test_full.name = 'Share'
ser_test_agg = pd.read_hdf(path_or_buf = str_path_unc_ind_weights_agg_hdf, where = "Date in [iter_date]").sort_index()
#ser_test_agg = ser_test_agg.loc[:, ['AT', 'BE', 'YY', 'WW'], ['AT', 'BE', 'YY', 'WW'], :]
ser_test_agg.name = 'Share'
ser_test_bil = pd.read_hdf(path_or_buf = str_path_unc_ind_weights_hdf, where = "Date in [iter_date]").sort_index()
#ser_test_bil = ser_test_bil.loc[:, ['AT', 'BE', 'YY', 'WW'], ['AT', 'BE', 'YY', 'WW'], :]
ser_test_bil.name = 'Share'
### Old results are stacked into one series to be comparable with the full one:
ser_test_old = pd.concat([ser_test_bil, ser_test_agg], axis = 0).sort_index()

In [12]:
### TEMP

### Checking that the full Industry weights are identical to the stacked old ones:
ser_test_full.equals(ser_test_old)

True