In [1]:
### WIOT MATRICES CONVERSION

In [2]:
### INITIALIZATION

import pandas as pd
import numpy as np
import os
import gc

In [3]:
### RUN EVERY TIME: VERSION CONTROL

from platform import python_version

### Report the interpreter and core library versions, one line per component:
for str_label, str_version in (('python version: ', python_version()),
                               ('numpy version: ', np.__version__),
                               ('pandas version: ', pd.__version__)):
    print(str_label, str_version)

python version:  3.7.4
numpy version:  1.17.2
pandas version:  0.25.3


In [4]:
### PARAMETERS

### MultiIndex level slice constant (slice(None) selects every label of a level):
All = slice(None)
### NA markers in MS Excel style; passed as na_values when the WIOT source tables are read:
list_na_excel_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null',
                        '#N/A Requesting Data...', '#N/A Invalid Security', '#N/A Field Not Applicable']
### Industry mapping workbook path and the two sheets read from it:
str_path_industry_map = 'Data_Files/Source_Files/WIOT_to_GICS_mapping.xlsx'
str_sheet_mapping = 'WIOT Mapping'
str_sheet_full_gics = 'GICS List old' # 'GICS List'
### Directory with the original WIOT tables (every file in it is read, one per year):
str_path_wiot_source = 'Data_Files/Source_Files/WIOT'
### Result files: aggregated flow values, flow shares, and reindexed & filled flow shares:
str_path_gics_hdf = 'Data_Files/Result_Files/gics_flows_value.h5'
str_path_flows_hdf = 'Data_Files/Result_Files/gics_flows_share.h5'
str_path_flows_filled_hdf = 'Data_Files/Result_Files/gics_flows_filled.h5'
### HDF key under which the data is stored in each of the result files:
str_gics_key = 'gics_io'

In [5]:
### WIOD TO GICS INDUSTRY MAPPING

### Mapping sheet: first column becomes the key, the remaining column (as string) becomes the value:
dict_industry_mapper = (
    pd.read_excel(io = str_path_industry_map, sheet_name = str_sheet_mapping, engine = 'openpyxl', header = [0], index_col = 0)
      .squeeze()
      .astype(str)
      .to_dict()
)
### Full industry list: 'Industry Group' becomes the key; the textual 'None' marker is turned into a real None
### so that the reindexation function can tell which industries have a substitute to copy from:
dict_gics_sub = (
    pd.read_excel(io = str_path_industry_map, sheet_name = str_sheet_full_gics, engine = 'openpyxl', header = [0], index_col = None)
      .astype(str)
      .replace({'None': None})
      .set_index('Industry Group')
      .squeeze()
      .to_dict()
)

In [6]:
### DEFINING COUNTRY CODES EXTRACTOR

def get_country_codes(use_local_copy = False):
    """
    Build a table of two ISO country codes per country.

    Parameters
    ----------
    use_local_copy : bool, default False
        When True, the HTML page is read from 'Data_Files/Source_Files/countrycode.html'
        (for the case when the URL is unavailable); otherwise it is read from https://countrycode.org/.

    Returns
    -------
    pandas.DataFrame
        Columns 'ISO SHORT' and 'ISO LONG', obtained by splitting the 'ISO CODES' column of the first
        table on the page at ' / '. Rows are sorted by the original country name, and the index
        labels are then converted to upper case.
    """
    ### Source selection (local file or online page):
    str_source = 'Data_Files/Source_Files/countrycode.html' if use_local_copy else 'https://countrycode.org/'
    ### First table on the page, indexed by country name:
    df_page_table = pd.read_html(str_source, index_col = 'COUNTRY')[0]
    ### Splitting the combined codes into two separate columns:
    df_iso_codes = df_page_table['ISO CODES'].str.split(' / ', expand = True)
    df_iso_codes.columns = ['ISO SHORT', 'ISO LONG']
    ### Sorting is done before upper-casing, so the order follows the original spelling of the names:
    df_iso_codes = df_iso_codes.sort_index()
    df_iso_codes.index = df_iso_codes.index.str.upper()
    ### Results output:
    return df_iso_codes

### World Country Codes:
df_country_codes = get_country_codes()
### 'ISO LONG' -> 'ISO SHORT' lookup, with the extra WIOT label 'ROW' mapped to the code 'YY':
dict_ison_mapper = {**df_country_codes.set_index('ISO LONG').squeeze().to_dict(), 'ROW': 'YY'}

In [None]:
### DATA TABLES LOADING & CONVERSION

### Per-year aggregated flows, keyed by the year string taken from the file name:
dict_flows = {}
### Country labels accepted on either side of a flow; loop-invariant, so built once instead of twice per file:
list_valid_countries = df_country_codes['ISO LONG'].to_list() + ['ROW']
### os.listdir() returns names in arbitrary order, so sort explicitly (descending) to make the processing order reproducible:
for year_matrix_csv in sorted(os.listdir(str_path_wiot_source), reverse = True):
    gc.collect()
    ### Characters 4..7 of the file name are used as the year label:
    str_year_num = year_matrix_csv[4 : 8]
    print(str_year_num)
    ### The first four rows are skipped, the next two rows form the column header, the first four columns form the row index:
    df_year_raw = pd.read_csv(str_path_wiot_source + '/' + year_matrix_csv, sep = ';', index_col = [0, 1, 2, 3], skiprows = [0, 1, 2, 3], header = [0, 1],
                                        na_values = list_na_excel_values, keep_default_na = False)
    ### Drop the first two row index levels, move both header levels into the index and cast the values to int32:
    ser_year_raw = df_year_raw.droplevel([0, 1]).stack([0, 1]).astype('int32')
    ser_year_raw.name = 'Value'
    ser_year_raw.index.names = ['Supplier_Country_Long', 'Supplier_Industry_r_named', 'User_Country_Long', 'User_Industry_c_named']
    df_year_typed = ser_year_raw.reset_index()
    ### Release the wide table and the stacked series before the heavy filtering steps:
    del df_year_raw
    del ser_year_raw
    gc.collect()
    ### Keep only flows whose both ends carry a known country label:
    df_year_typed = df_year_typed[df_year_typed['Supplier_Country_Long'].isin(list_valid_countries) &
                                  df_year_typed['User_Country_Long'].isin(list_valid_countries)]
    ### Long country codes to short ones:
    df_year_typed['Supplier_Country'] = df_year_typed['Supplier_Country_Long'].replace(dict_ison_mapper)
    df_year_typed['User_Country'] = df_year_typed['User_Country_Long'].replace(dict_ison_mapper)
    ### Column-side industry labels are rewritten ('c' -> 'r') so that they can be matched against the same mapper keys as the row-side labels:
    df_year_typed['User_Industry_r_named'] = df_year_typed['User_Industry_c_named'].str.replace('c', 'r')
    ### Keep only flows whose both industries are present in the industry mapper:
    df_year_typed = df_year_typed[df_year_typed['Supplier_Industry_r_named'].isin(dict_industry_mapper.keys()) &
                                  df_year_typed['User_Industry_r_named'].isin(dict_industry_mapper.keys())]
    ### Source industry labels to mapped industry codes:
    df_year_typed['User_Industry'] = df_year_typed['User_Industry_r_named'].replace(dict_industry_mapper)
    df_year_typed['Supplier_Industry'] = df_year_typed['Supplier_Industry_r_named'].replace(dict_industry_mapper)
    ### Categorical dtypes for all four grouping columns:
    df_year_typed['Supplier_Country'] = df_year_typed['Supplier_Country'].astype('category')
    df_year_typed['User_Country'] = df_year_typed['User_Country'].astype('category')
    df_year_typed['User_Industry'] = df_year_typed['User_Industry'].astype('category')
    df_year_typed['Supplier_Industry'] = df_year_typed['Supplier_Industry'].astype('category')
    df_year_typed = df_year_typed[['Supplier_Country', 'User_Country', 'Supplier_Industry', 'User_Industry', 'Value']]
    ### Sum the values of all source rows that fall into the same (supplier country, user country, supplier industry, user industry) cell:
    ser_year_gics = df_year_typed.groupby(['Supplier_Country', 'User_Country', 'Supplier_Industry', 'User_Industry']).sum().squeeze().sort_index()
    dict_flows[str_year_num] = ser_year_gics

In [9]:
### CONVERTED DATA AGGREGATION & SAVING

gc.collect()

### Join the per-year series into one; the dict keys (year strings) become the outermost index level named 'Date':
ser_flows = pd.concat(dict_flows, names = ['Date'])
### Overwrite the values file (mode 'w'), using the 'table' format and compression level 9:
ser_flows.to_hdf(str_path_gics_hdf, key = str_gics_key, mode = 'w', format = 'table', complevel = 9)

In [15]:
### TEST

### Read back only the rows with Date '2014' from the saved values file and drop the 'Date' level:
ser_test = pd.read_hdf(str_path_gics_hdf, key = str_gics_key, where = "Date='2014'").droplevel('Date')
gc.collect()
### NOTE(review): df_iter_flows is also assigned inside the yearly loop of the shares construction cell, which overwrites this value:
df_iter_flows = ser_test.reset_index()

In [10]:
### TOTAL VALUES & SHARES CONSTRUCTION

gc.collect()

### Full values dataset loading:
ser_flows = pd.read_hdf(str_path_gics_hdf, key = str_gics_key)
### Container for the per-year share series, keyed by the year label:
dict_flows_share = {}
### Looping over years (first index level, in reversed order):
for str_iter_year in ser_flows.index.levels[0][::-1]:
    print(str_iter_year)
    gc.collect()
    df_iter_flows = ser_flows[str_iter_year].reset_index()
    ### 'WW' is added as a category to both country columns; it is used below as the label of the world aggregate:
    df_iter_flows['Supplier_Country'] = df_iter_flows['Supplier_Country'].cat.add_categories(['WW'])
    df_iter_flows['User_Country'] = df_iter_flows['User_Country'].cat.add_categories(['WW'])
    ### Inner flows filtering (supplier and user country coincide; industry '9999' excluded on both sides):
    df_inner_flows = df_iter_flows[(df_iter_flows['Supplier_Country'] == df_iter_flows['User_Country']) &
                                  (df_iter_flows['Supplier_Industry'] != '9999') & (df_iter_flows['User_Industry'] != '9999')]
    ### Outer flows filtering (supplier and user country differ; industry '9999' excluded on both sides):
    df_outer_flows = df_iter_flows[(df_iter_flows['Supplier_Country'] != df_iter_flows['User_Country']) &
                                  (df_iter_flows['Supplier_Industry'] != '9999') & (df_iter_flows['User_Industry'] != '9999')]
    ### Inner values to series:
    ser_country_inner_values = df_inner_flows.set_index(['Supplier_Country', 'User_Country', 'Supplier_Industry', 'User_Industry']).astype('float32').squeeze()
    ### Outer values to series:
    ser_country_to_country_values = df_outer_flows.set_index(['Supplier_Country', 'User_Country', 'Supplier_Industry', 'User_Industry']).astype('float32').squeeze()
    ### Country to World trade (outer flows summed over user countries):
    ser_country_to_world_values = df_outer_flows.groupby(['Supplier_Country', 'Supplier_Industry', 'User_Industry']).sum().squeeze().sort_index()
    ### World to World trade (Country to World values summed over supplier countries):
    ser_world_to_world_values = ser_country_to_world_values.groupby(['Supplier_Industry', 'User_Industry']).sum().astype('float32').sort_index()
    ### World to Country trade (outer flows summed over supplier countries):
    ser_world_to_country_values = df_outer_flows.groupby(['User_Country', 'Supplier_Industry', 'User_Industry']).sum().squeeze().sort_index()
    ### Inner shares (each value divided by the total of its supplier country / user country / supplier industry group, i.e. shares across user industries):
    ser_country_inner_shares = ser_country_inner_values.groupby(['Supplier_Country', 'User_Country', 'Supplier_Industry'])\
                                                       .apply(lambda ser_group: ser_group / ser_group.sum()).astype('float32').sort_index()
    ### Outer shares (same normalisation for every aggregation level):
    ser_country_to_country_shares = ser_country_to_country_values.groupby(['Supplier_Country', 'User_Country', 'Supplier_Industry'])\
                                                                 .apply(lambda ser_group: ser_group / ser_group.sum()).astype('float32').sort_index()
    ser_country_to_world_shares = ser_country_to_world_values.groupby(['Supplier_Country', 'Supplier_Industry'])\
                                                                 .apply(lambda ser_group: ser_group / ser_group.sum()).astype('float32').sort_index()
    ### 'WW' is prepended as the User_Country level, then the levels are swapped so that Supplier_Country comes first:
    ser_country_to_world_shares = pd.concat({'WW': ser_country_to_world_shares}, names = ['User_Country']).swaplevel(0, 1).sort_index()
    ### NOTE(review): a ('WW', 'WW') entry presumably appears here because 'WW' was added as a country category above - confirm:
    ser_country_to_world_shares.drop(('WW', 'WW'), inplace = True)
    ser_world_to_country_shares = ser_world_to_country_values.groupby(['User_Country', 'Supplier_Industry'])\
                                                             .apply(lambda ser_group: ser_group / ser_group.sum()).astype('float32').sort_index()
    ### 'WW' is prepended as the Supplier_Country level:
    ser_world_to_country_shares = pd.concat({'WW': ser_world_to_country_shares}, names = ['Supplier_Country'])
    ser_world_to_country_shares.drop(('WW', 'WW'), inplace = True)
    ser_world_to_world_shares = ser_world_to_world_values.groupby(['Supplier_Industry'])\
                                    .apply(lambda ser_group: ser_group / ser_group.sum()).astype('float32').sort_index()
    ### 'WW' is prepended twice: as User_Country and then as Supplier_Country:
    ser_world_to_world_shares = pd.concat({'WW': pd.concat({'WW': ser_world_to_world_shares}, names = ['User_Country'])}, names = ['Supplier_Country'])
    ### Shares of all levels are combined into one series:
    ser_year_shares = pd.concat([ser_country_inner_shares, ser_country_to_country_shares, ser_country_to_world_shares, ser_world_to_country_shares,
                                 ser_world_to_world_shares]).sort_index()
    del ser_country_to_country_shares
    gc.collect()
    ### Industry '9999' is removed from both industry levels of the combined series:
    ser_year_shares.drop('9999', axis = 0, level = 'Supplier_Industry', inplace =  True)
    ser_year_shares.drop('9999', axis = 0, level = 'User_Industry', inplace =  True)
    dict_flows_share[str_iter_year] = ser_year_shares
    
    ### NOTE(review): this break stops the loop after the first processed year, so only that year gets into dict_flows_share - confirm it is intended:
    break

2014


In [7]:
### BILATERAL SHARES SAVING

### Join the per-year share series; the dict keys (year labels) become the outermost index level:
ser_all_shares = pd.concat(dict_flows_share)
### The container is deleted to free memory, so this cell can not be re-run without re-running the shares construction cell:
del dict_flows_share
gc.collect()
### All five index levels are named explicitly, including the new outermost one:
ser_all_shares.index.names = ['Date', 'Supplier_Country', 'User_Country', 'Supplier_Industry', 'User_Industry']
ser_all_shares.name = 'Share'

### Overwrite the shares file (mode 'w'), using the 'table' format and compression level 9:
ser_all_shares.to_hdf(str_path_flows_hdf, key = str_gics_key, mode = 'w', format = 'table', complevel = 9)

In [11]:
### INDUSTRIES REINDEXATION FUNCTION

def industry_reindexation(ser_matrix):
    """
    Expand an industry-by-industry series to the full industry list taken from dict_gics_sub.

    Parameters
    ----------
    ser_matrix : pandas.Series
        Series whose index has the levels 'Supplier_Industry' and 'User_Industry'.

    Returns
    -------
    pandas.Series
        Series indexed by (Supplier_Industry, User_Industry), where both levels are reindexed to the
        keys of dict_gics_sub. For every key that has a substitute (a value that is not None), the
        supplier rows of that key are overwritten with the values of the substitute's supplier rows.
        Remaining missing values are replaced with 0.0.
    """
    industry_keys = dict_gics_sub.keys()
    ### User industries go to rows and are extended to the full list:
    df_user_rows = ser_matrix.unstack('Supplier_Industry').reindex(industry_keys)
    ser_user_expanded = df_user_rows.stack(dropna = False)
    ### Supplier industries go to rows and are extended to the full list:
    df_supplier_rows = ser_user_expanded.unstack('User_Industry').reindex(industry_keys)
    ser_reindexed = df_supplier_rows.stack(dropna = False).sort_index()
    ### Copying values from the substitute industry where one is defined:
    for str_industry, str_substitute in dict_gics_sub.items():
        if str_substitute is None:
            continue
        ser_reindexed.loc[str_industry] = ser_reindexed[str_substitute].values
    ### Whatever is still missing is treated as zero share:
    return ser_reindexed.fillna(0.0)

In [12]:
### BILATERAL LEVEL INDUSTRIES REINDEXATION

gc.collect()
### Load only the shares with Date '2014' and drop the 'Date' level. The key is passed explicitly:
### it is the key the shares file is written with, and the reads of the values file pass it the same way.
ser_flows_share = pd.read_hdf(str_path_flows_hdf, key = str_gics_key, where = "Date='2014'").droplevel('Date')
### Each (supplier country, user country) pair gets its industry matrix expanded to the full industry list:
ser_flows_last = ser_flows_share.groupby(['Supplier_Country', 'User_Country'])\
                                .apply(lambda ser_group: industry_reindexation(ser_group.droplevel(['Supplier_Country', 'User_Country'])))

In [13]:
### FILLING EMPTY BILATERAL SHARES

gc.collect()
### One counter per data level that ended up describing a (supplier, user, supplier industry) slice:
dict_counter = dict.fromkeys(['country_to_country', 'country_to_world', 'world_to_country', 'world_to_world', 'empty'], 0)

for iter_supplier in ser_flows_last.index.levels[0]:
    for iter_user in ser_flows_last.index.levels[1]:
        for iter_industry in ser_flows_last.index.levels[2]:
            tup_target = (iter_supplier, iter_user, iter_industry)
            ### All four candidate slices are read before anything is written:
            ser_country_to_country = ser_flows_last.loc[tup_target]
            ser_country_to_world = ser_flows_last.loc[iter_supplier, 'WW', iter_industry]
            ser_world_to_country = ser_flows_last.loc['WW', iter_user, iter_industry]
            ser_world_to_world = ser_flows_last.loc['WW', 'WW', iter_industry]
            ### The bilateral slice already has data, nothing to fill:
            if (sum(ser_country_to_country) != 0.0):
                dict_counter['country_to_country'] += 1
                continue
            ### Otherwise the first non-zero fallback level is used, from the most to the least specific one:
            list_fallbacks = [('country_to_world', ser_country_to_world),
                              ('world_to_country', ser_world_to_country),
                              ('world_to_world', ser_world_to_world)]
            for str_level, ser_fallback in list_fallbacks:
                if (sum(ser_fallback) != 0.0):
                    ser_flows_last.loc[tup_target] = ser_fallback.values
                    dict_counter[str_level] += 1
                    break
            else:
                ### No level has data to fill this slice with:
                dict_counter['empty'] += 1
print(dict_counter)

{'country_to_country': 27491, 'country_to_world': 18890, 'world_to_country': 2010, 'world_to_world': 209, 'empty': 0}


In [14]:
### TEMP

### Number of (supplier country, user country, supplier industry) groups. It does not depend on the loop variable,
### so the groupby is evaluated once instead of once per counter:
int_groups_total = len(ser_flows_last.groupby(['Supplier_Country', 'User_Country', 'Supplier_Industry']).count())
### Portion of groups described by each data level:
for iter_option in dict_counter:
    print(iter_option, ':', round(dict_counter[iter_option] / int_groups_total, 4))

country_to_country : 0.5657
country_to_world : 0.3887
world_to_country : 0.0414
world_to_world : 0.0043
empty : 0.0


In [15]:
### REINDEXED AND FILLED FLOW SHARES SAVING

gc.collect()

### Overwrite the filled shares file (mode 'w'), using the 'table' format and compression level 9:
ser_flows_last.to_hdf(str_path_flows_filled_hdf, key = str_gics_key, mode = 'w', format = 'table', complevel = 9)

In [7]:
### TEMP

### Reload the filled shares; the key is passed explicitly because it is the key the file is written with,
### so the read does not rely on the file holding exactly one object:
ser_flows_last = pd.read_hdf(str_path_flows_filled_hdf, key = str_gics_key)

In [9]:
### TEMP

### Export the ('WW', 'WW') slice as a table with 'User_Industry' in columns
### (rows are presumably supplier industries - the only other level left after the selection; confirm):
ser_flows_last['WW', 'WW'].unstack('User_Industry').to_csv('total_shares_v2.csv', sep = ',')