In [1]:
### WW TO WW DATA COLLECTION (BASED ON WIOT 2014 SUPPLY / USE MATRICES)

In [2]:
### INITIALIZATION

import pandas as pd
import numpy as np
import os
import gc
import matplotlib.pyplot as plt
from matplotlib.cbook import boxplot_stats ### To annotate fliers
import seaborn as sns

In [3]:
### PARAMETERS (file paths, sheet names and storage keys used by the cells below)

### Date Range defining:
str_year_start = '1994'
### MultiIndex level slice constant:
All = slice(None)
### Universe path:
str_path_universe = 'Data_Files/Source_Files/acadian_universe.xlsx'
### Commodity to Industry shares:
#str_path_unc_ind_weights_hdf = 'Data_Files/Result_Files/unc_ind_weights.h5'
#str_path_unc_ind_weights_agg_hdf = 'Data_Files/Result_Files/unc_ind_weights_agg.h5'
str_path_unc_ind_weights_full_hdf = 'Data_Files/Result_Files/unc_ind_weights_full.h5'
str_gics_key = 'gics_io'
### Product / Industry mapping path:
str_path_matrix_map = 'Data_Files/Source_Files/WIOT_mapping_detailed.xlsx'
str_sheet_matrix = 'Matrix to Load'
str_sheet_gics = 'GICS 2018'
str_sheet_hs = 'HS'
str_sheet_ebops = 'EBOPS 2010'
str_sheet_nace_r = 'WIOT R to COMTRADE'
str_sheet_nace_c = 'WIOT C to GICS'
str_sheet_gics_substitution = 'GICS Substitution'
### Primary cells path:
str_path_primary = 'Data_Files/Test_Files/WIOT2014_Total_Total.xlsx'
str_sheet_cells = 'shares'
### Augmented bilateral export:
str_path_export_bilateral = 'Data_Files/Source_Files/comtrade_export_bilateral.h5'
str_key_unc_export = 'export_augmented'
### Old way resulting table:
str_path_total_shares = 'Data_Files/Test_Files/total_shares_v2.csv'
### Goods classification (HS1992):
str_hs_url = 'https://comtrade.un.org/Data/cache/classificationH0.json'
### Path to save World-to-World timeseries:
### NOTE(review): this value is identical to str_path_unc_ind_weights_full_hdf, which the data preparation loop reads from -
### confirm the intended target file before anything is written to this path.
str_path_ww_to_ww_hdf = 'Data_Files/Result_Files/unc_ind_weights_full.h5'

In [4]:
### OLD VERSION MATRIX

### Reading the previously produced shares table (first CSV column becomes the row index):
df_old_version = pd.read_csv(str_path_total_shares, index_col = 0)
### Row labels are used as string group codes everywhere below:
df_old_version.index = df_old_version.index.astype('str').rename('Group_Code')
### Long format: one value per (row group, column group) pair:
ser_ww_to_ww_old = df_old_version.stack().rename('Share_Old').rename_axis(['Commodity_Group_Code', 'GICS_Group_Code'])

In [None]:
### DATA PREPARATION
### For every year-end date: aggregates the commodity-to-industry shares to GICS groups, weights them by world export
### values, normalizes the shares within each commodity group and fills the groups that received no data.

### Industry Groups substitution order loading:
ser_gics_substitution = pd.read_excel(str_path_matrix_map, str_sheet_gics_substitution, dtype = 'str', index_col = None)\
                          .set_index('Industry Group').dropna().squeeze()
ser_gics_substitution.index.names = ['Commodity_Group_Code']
ser_gics_substitution.name = 'Substitute'
### World matrices collection (one stacked matrix per date):
dict_ww_ww_by_group = {}
### Looping over years of trade:
### NOTE(review): 'BY' is the older business-year-end alias; recent pandas releases prefer 'BYE' - confirm against the installed version.
for iter_date in pd.date_range('1993', '2022', freq = 'BY'):
    gc.collect()
    print(iter_date)
    ### Loading distribution of commodities dataset (the where string refers to iter_date by name, so the loop variable must keep this name):
    ser_ww_ww_use_ind = pd.read_hdf(str_path_unc_ind_weights_full_hdf, where = "Date in [iter_date]").droplevel('Date')['WW', 'WW']
    ### Aggregating to Industry Groups of Customer (group code = first four characters of the industry code):
    df_ww_ww_use_ind = ser_ww_ww_use_ind.to_frame().reset_index('GICS_Industry_Code')
    df_ww_ww_use_ind['GICS_Group_Code'] = df_ww_ww_use_ind['GICS_Industry_Code'].str[: 4]
    ser_ww_ww_use_group = df_ww_ww_use_ind.dropna().set_index('GICS_Group_Code', append = True).sort_index()['Share']\
                                        .groupby(['Commodity_ID', 'Commodity_Group_Code', 'GICS_Group_Code'], observed = True).sum()
    ### Comtrade dataset loading:
    ser_comtrade_bil = pd.read_hdf(str_path_export_bilateral, where = "Date in [iter_date]").droplevel(['Date', 'Type'])
    ser_comtrade_bil.index.names = ['Exporter', 'Importer', 'Commodity_ID']
    ### World totals by commodity, scaled down by 1000 (explicit 64-bit integers: the default int width is platform-dependent):
    ser_comtrade_ww_ww = (ser_comtrade_bil.groupby(['Commodity_ID']).sum() / 1000).astype('int64')
    ### Export-weighted average share for every (commodity group, customer group) pair:
    ser_ww_ww_by_group = ser_ww_ww_use_group.to_frame().join(ser_comtrade_ww_ww).groupby(['Commodity_Group_Code', 'GICS_Group_Code'])\
                                            .apply(lambda df_group: (df_group['Share'] * df_group['Export']).sum() / df_group['Export'].sum())
    ### Normalizing shares to sum to one within each commodity group:
    ser_ww_ww_by_group = ser_ww_ww_by_group.groupby('Commodity_Group_Code').transform(lambda ser_i: ser_i / ser_i.sum())
    ### Adding values for missed Industry Groups:
    df_ww_ww_by_group = ser_ww_ww_by_group.unstack('GICS_Group_Code').fillna(0.0).reindex(ser_gics_substitution.index).fillna(0.0)
    for iter_group in ser_gics_substitution.index:
        ### Only rows without any data need a fallback:
        if (df_ww_ww_by_group.loc[iter_group].sum() != 0.0):
            continue
        str_substitute = ser_gics_substitution[iter_group]
        ### The substitute row is usable only when it is defined and carries data itself:
        if ((str_substitute != 'None') and (df_ww_ww_by_group.loc[str_substitute].sum() != 0.0)):
            df_ww_ww_by_group.loc[iter_group] = df_ww_ww_by_group.loc[str_substitute].values
        else:
            ### Old matrix row is aligned by GICS group label (not by position), groups absent in the old table get zero:
            df_ww_ww_by_group.loc[iter_group] = ser_ww_to_ww_old[iter_group].reindex(df_ww_ww_by_group.columns).fillna(0.0).values
    ser_ww_ww_by_group = df_ww_ww_by_group.stack('GICS_Group_Code').sort_index()
    ### Saving of augmented dataset:
    dict_ww_ww_by_group[iter_date] = ser_ww_ww_by_group
    display((ser_ww_ww_by_group.unstack('GICS_Group_Code') * 100).round(0))
### Matrices concatenation:
ser_ww_ww_history = pd.concat(dict_ww_ww_by_group, axis = 0, names = ['Date']).sort_index()
ser_ww_ww_history.name = 'Share'

In [29]:
### TEMP (inspection of the concatenated shares history built by the data preparation cell)

ser_ww_ww_history

Date        Commodity_Group_Code  GICS_Group_Code
1993-12-31  1010                  1010               0.232255
                                  1510               0.224082
                                  2010               0.047613
                                  2020               0.016520
                                  2030               0.114915
                                                       ...   
2021-12-31  6010                  4530               0.000000
                                  5010               0.021254
                                  5020               0.006592
                                  5510               0.006345
                                  6010               0.043864
Name: Share, Length: 16704, dtype: float64

In [8]:
### RESULTS SAVING

### Exporting the shares history to the test folder: Excel workbook without merged index cells, then comma-separated text:
ser_ww_ww_history.to_excel(excel_writer = 'Data_Files/Test_Files/ww_to_ww_shares.xlsx', merge_cells = False)
ser_ww_ww_history.to_csv(path_or_buf = 'Data_Files/Test_Files/ww_to_ww_shares.csv', sep = ',')

In [None]:
### MAPPINGS PREPARATION

In [42]:
### PRIMARY CELLS PREPARATION
### Selects the matrix cells with a share above 0.05 and attaches the commodity (row) and GICS sub-industry (column) mappings.

### Basic total shares loading:
ser_total_shares = pd.read_excel(str_path_primary, str_sheet_cells, header = 0).drop('Group', axis = 1).drop([0, 55, 56], axis = 0).set_index('WIOT').stack()
ser_total_shares.index.names = ['WIOT Exporter Code', 'WIOT Importer Code']
### Extracting primary cells:
ser_primary_shares = ser_total_shares[ser_total_shares > 0.05]
ser_primary_shares.name = 'Share'
### R Map (exporter code is forward-filled first, then incomplete rows are dropped):
df_r_map = pd.read_excel(str_path_matrix_map, str_sheet_nace_r, dtype = str)[['WIOT Exporter Code', 'HS Goods Code', 'EBOPS Service Code', 'Commodity Description']]
df_r_map['WIOT Exporter Code'] = df_r_map['WIOT Exporter Code'].ffill()
### Goods code is preferred, service code is used where the goods code is missing:
df_r_map['Commodity_ID'] = df_r_map['HS Goods Code'].combine_first(df_r_map['EBOPS Service Code'])
df_r_map = df_r_map.set_index('WIOT Exporter Code')[['Commodity_ID', 'Commodity Description']].dropna()
### C Map (same order as for R Map: dropping incomplete rows before the forward-fill would discard every row with a blank
### importer code and leave nothing to fill):
df_c_map = pd.read_excel(str_path_matrix_map, str_sheet_nace_c, dtype = str)[['WIOT Importer Code', 'GICS Sub-Industry Code', 'GICS Sub-Industry Name']]
df_c_map['WIOT Importer Code'] = df_c_map['WIOT Importer Code'].ffill()
df_c_map = df_c_map.dropna().set_index('WIOT Importer Code')
df_c_map['GICS Sub-Industry Code'] = df_c_map['GICS Sub-Industry Code'].astype(str)
### Mappers consolidation (each merge matches on the shared code, the other code is moved out of the index and back around it):
df_primary_shares = ser_primary_shares.reset_index('WIOT Importer Code').merge(df_r_map, left_on = 'WIOT Exporter Code', right_on = 'WIOT Exporter Code')\
                                      .set_index('WIOT Importer Code', append = True)
df_primary_shares = df_primary_shares.reset_index('WIOT Exporter Code').merge(df_c_map, left_on = 'WIOT Importer Code', right_on = 'WIOT Importer Code')\
                                      .set_index('WIOT Exporter Code', append = True)
### Exporter code first, importer code second:
df_primary_shares = df_primary_shares.swaplevel().sort_index()

  warn(msg)


In [43]:
### UN COMTRADE TO GICS WIOT BASED MATRIX LOADING

### Reading the connection matrix: the first six rows form the column levels, the first four columns form the row levels:
df_unc_to_gics = pd.read_excel(io = str_path_matrix_map, sheet_name = str_sheet_matrix, engine = 'openpyxl', dtype = str,
                               header = [0, 1, 2, 3, 4, 5], index_col = [0, 1, 2, 3])
df_unc_to_gics.index.names = ['WIOT Exporter Code', 'WIOT_Description', 'Commodity_ID', 'Commodity_Description']
df_unc_to_gics.columns.names = ['WIOT Importer Code', 'WIOT_Description', 'GICS_Sub_Code', 'GICS_Industry_Code', 'GICS_Group_Code', 'GICS_Sub_Name']
### Commodity identifiers (third row level) are kept as strings:
df_unc_to_gics.index = df_unc_to_gics.index.set_levels(df_unc_to_gics.index.levels[2].astype('str'), level = 'Commodity_ID')
### Filtering out the '---' placeholders, dropping the descriptive levels and converting the letter marks to flags in one pass:
df_unc_to_gics = (
    df_unc_to_gics
    .drop(index = '---', level = 'Commodity_ID')
    .drop(columns = '---', level = 'GICS_Group_Code')
    .droplevel(['WIOT_Description', 'GICS_Industry_Code', 'GICS_Group_Code', 'GICS_Sub_Name'], axis = 1)
    .droplevel(['WIOT_Description', 'Commodity_Description'])
    .replace({'x': False, 'y': True, 'z': True})
)
gc.collect()
### Long format: all remaining column levels are moved to the rows, values become booleans:
ser_unc_to_gics = df_unc_to_gics.stack(df_unc_to_gics.columns.names).astype(bool).rename('Connection_Flag')
### Sub-industry codes (fourth level of the stacked index) are kept as strings:
ser_unc_to_gics.index = ser_unc_to_gics.index.set_levels(ser_unc_to_gics.index.levels[3].astype('str'), level = 'GICS_Sub_Code')

In [44]:
### ADDING CONNECTION FLAGS

### Naming the two existing index levels of the primary shares table:
df_primary_shares.index.names = ['WIOT Exporter Code', 'WIOT Importer Code']
### Moving both codes into the index, ordered as exporter / commodity / importer / sub-industry:
df_primary_shares = df_primary_shares.set_index(['Commodity_ID', 'GICS Sub-Industry Code'], append = True)
df_primary_shares = df_primary_shares.reorder_levels(['WIOT Exporter Code', 'Commodity_ID', 'WIOT Importer Code', 'GICS Sub-Industry Code']).sort_index()
### Level names are taken from the flags series so that both indexes are labelled identically:
df_primary_shares.index = df_primary_shares.index.set_names(ser_unc_to_gics.index.names)
### Looking up the flag of every primary cell:
df_primary_shares['Connection_Flag'] = ser_unc_to_gics.reindex(df_primary_shares.index)
### Descriptions join the index; the commodity description is placed right after the commodity code:
df_primary_shares = df_primary_shares.set_index(['Commodity Description', 'GICS Sub-Industry Name'], append = True)
df_primary_shares = df_primary_shares.reorder_levels([0, 1, 4, 2, 3, 5]).sort_index()

In [49]:
### HS1992 STRUCTURE PREPARATION:

### Downloading the classification and building a table from its 'results' entry, indexed by item id:
df_hs_full = pd.DataFrame(pd.read_json(str_hs_url, orient = 'index').squeeze()['results']).set_index('id')
### Top level items are the ones whose parent is 'TOTAL':
ser_ag2 = df_hs_full.loc[df_hs_full['parent'] == 'TOTAL'].drop(columns = 'parent').squeeze().rename('AG2_Description')
### Their direct children, with the two columns relabelled by position:
df_ag4 = df_hs_full.loc[df_hs_full['parent'].isin(ser_ag2.index)].set_axis(['AG4_Description', 'Parent'], axis = 1)
### Attaching the parent description to every child:
df_ag4 = df_ag4.merge(ser_ag2, left_on = 'Parent', right_on = 'id').set_index('Parent')[['AG2_Description', 'AG4_Description']]
df_ag4.to_excel(excel_writer = 'Data_Files/Test_Files/hs_1992_ag4.xlsx')