In [1]:
### WIOT MATRICES CONVERSION

In [2]:
### INITIALIZATION

import pandas as pd
import numpy as np
import os
import gc

In [3]:
### RUN EVERY TIME: VERSION CONTROL

from platform import python_version

### Report the interpreter and core library versions used by this run:
for str_package_name, str_package_version in [('python', python_version()), ('numpy', np.__version__), ('pandas', pd.__version__)]:
    print(str_package_name + ' version: ', str_package_version)

python version:  3.7.4
numpy version:  1.17.2
pandas version:  0.25.3


In [4]:
### PARAMETERS

### Full slice, used to select everything on one MultiIndex level:
All = slice(None)
### Strings to be treated as missing values when reading source tables:
list_na_excel_values = [
    '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
    '1.#IND', '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null',
    '#N/A Requesting Data...', '#N/A Invalid Security', '#N/A Field Not Applicable',
]
### Industry mapping workbook and the two sheets read from it:
str_path_industry_map = 'Data_Files/Source_Files/WIOT_to_GICS_mapping.xlsx'
str_sheet_mapping = 'WIOT Mapping'
str_sheet_full_gics = 'GICS List'
### Folder with the original WIOT tables (one file per year):
str_path_wiot_source = 'Data_Files/Source_Files/WIOT'
### Result files: aggregated flow values, then bilateral / country level / world level shares:
str_path_gics_hdf = 'Data_Files/Result_Files/gics_flows_value.h5'
str_path_flows_hdf = 'Data_Files/Result_Files/gics_flows_share.h5'
str_path_country_hdf = 'Data_Files/Result_Files/gics_country_share.h5'
str_path_world_hdf = 'Data_Files/Result_Files/gics_world_share.h5'
### Key under which every result series is stored inside its HDF file:
str_gics_key = 'gics_io'

In [5]:
### WIOD TO GICS INDUSTRY MAPPING

### Mapping sheet: first column is the lookup key, the remaining single column (cast to text) is the mapped industry:
df_wiot_mapping = pd.read_excel(io = str_path_industry_map, sheet_name = str_sheet_mapping, engine = 'openpyxl', header = [0], index_col = 0)
dict_industry_mapper = df_wiot_mapping.squeeze().astype(str).to_dict()
### GICS list sheet: everything is cast to text and the text 'None' is turned back into a real None before keying by 'Industry Group':
df_gics_list = pd.read_excel(io = str_path_industry_map, sheet_name = str_sheet_full_gics, engine = 'openpyxl', header = [0], index_col = None)
df_gics_list = df_gics_list.astype(str).replace({'None': None})
dict_gics_sub = df_gics_list.set_index('Industry Group').squeeze().to_dict()

In [6]:
### DEFINING COUNTRY CODES EXTRACTOR

def get_country_codes(use_local_copy = False):
    """Build a table of two- and three-letter ISO country codes.

    Parameters
    ----------
    use_local_copy : bool, default False
        When truthy, the table is read from the saved page
        'Data_Files/Source_Files/countrycode.html' instead of https://countrycode.org/.

    Returns
    -------
    pandas.DataFrame
        Columns 'ISO SHORT' and 'ISO LONG', indexed by the upper-cased 'COUNTRY'
        names and sorted by the original country name.
    """
    ### The saved page is the fallback for when the URL is unavailable:
    str_codes_source = 'Data_Files/Source_Files/countrycode.html' if use_local_copy else 'https://countrycode.org/'
    ### First table found in the page, keyed by country name:
    df_page_table = pd.read_html(str_codes_source, index_col = 'COUNTRY')[0]
    ### 'ISO CODES' cells hold both codes separated by ' / ':
    df_page_table[['ISO SHORT', 'ISO LONG']] = df_page_table['ISO CODES'].str.split(' / ', expand = True)
    df_iso_codes = df_page_table.loc[:, ['ISO SHORT', 'ISO LONG']].sort_index()
    df_iso_codes.index = df_iso_codes.index.str.upper()
    ### Results output:
    return df_iso_codes

### World Country Codes:
df_country_codes = get_country_codes()
### Lookup from the long ISO code to the short one:
dict_ison_mapper = df_country_codes.set_index('ISO LONG')['ISO SHORT'].to_dict()
### Extra entry for the 'ROW' code, which is not an ISO code (presumably the WIOT rest-of-world aggregate - confirm):
dict_ison_mapper['ROW'] = 'YY'

In [None]:
### DATA TABLES LOADING & CONVERSION

### Aggregated flows of every processed file, keyed by the year string cut from the file name:
dict_flows = {}
### Loop-invariant lookups, built once instead of on every iteration:
list_known_countries = df_country_codes['ISO LONG'].to_list() + ['ROW']
list_known_industries = list(dict_industry_mapper.keys())
list_flow_levels = ['Supplier_Country', 'User_Country', 'Supplier_Industry', 'User_Industry']
for year_matrix_csv in os.listdir(str_path_wiot_source)[::-1]:
    gc.collect()
    ### Characters 4 to 7 of the file name are used as the year label:
    str_year_num = year_matrix_csv[4 : 8]
    print(str_year_num)
    df_year_raw = pd.read_csv(os.path.join(str_path_wiot_source, year_matrix_csv), sep = ';', index_col = [0, 1, 2, 3], skiprows = [0, 1, 2, 3], header = [0, 1],
                              na_values = list_na_excel_values, keep_default_na = False)
    ### Long format: the two column header levels are stacked next to the two remaining row index levels:
    ser_year_raw = df_year_raw.droplevel([0, 1]).stack([0, 1]).astype('int32')
    ser_year_raw.name = 'Value'
    ser_year_raw.index.names = ['Supplier_Country_Long', 'Supplier_Industry_r_named', 'User_Country_Long', 'User_Industry_c_named']
    df_year_typed = ser_year_raw.reset_index()
    del df_year_raw
    del ser_year_raw
    gc.collect()
    ### User industry labels are rewritten with 'r' in place of 'c' so they can be looked up in the same mapper as the supplier ones:
    df_year_typed['User_Industry_r_named'] = df_year_typed['User_Industry_c_named'].str.replace('c', 'r', regex = False)
    ### Only rows whose both countries and both industries are known are kept:
    ser_known_rows = (df_year_typed['Supplier_Country_Long'].isin(list_known_countries) &
                      df_year_typed['User_Country_Long'].isin(list_known_countries) &
                      df_year_typed['Supplier_Industry_r_named'].isin(list_known_industries) &
                      df_year_typed['User_Industry_r_named'].isin(list_known_industries))
    df_year_typed = df_year_typed[ser_known_rows]
    ### The typed table is built as a new frame, so no column is ever assigned onto a filtered slice.
    ### After the filter above every label is a key of its mapper, so .map() gives the same result as a dictionary .replace():
    df_year_typed = pd.DataFrame({'Supplier_Country': df_year_typed['Supplier_Country_Long'].map(dict_ison_mapper).astype('category'),
                                  'User_Country': df_year_typed['User_Country_Long'].map(dict_ison_mapper).astype('category'),
                                  'Supplier_Industry': df_year_typed['Supplier_Industry_r_named'].map(dict_industry_mapper).astype('category'),
                                  'User_Industry': df_year_typed['User_Industry_r_named'].map(dict_industry_mapper).astype('category'),
                                  'Value': df_year_typed['Value']})
    ### observed = False is spelled out so that every combination of categories stays in the result whatever the pandas default is:
    ser_year_gics = df_year_typed.groupby(list_flow_levels, observed = False)['Value'].sum().sort_index()
    dict_flows[str_year_num] = ser_year_gics

In [9]:
### CONVERTED DATA AGGREGATION & SAVING

gc.collect()

### Per-year series are stacked into one series; the dictionary keys (year strings) become the outermost index level, named 'Date':
ser_flows = pd.concat(dict_flows, names = ['Date'])
### mode = 'w' rewrites the file from scratch, so earlier content of this HDF file is discarded:
ser_flows.to_hdf(str_path_gics_hdf, key = str_gics_key, mode = 'w', format = 'table', complevel = 9)

In [None]:
### TOTAL VALUES CONSTRUCTION

gc.collect()

ser_flows = pd.read_hdf(str_path_gics_hdf, key = str_gics_key)

### Shares by year: bilateral (supplier country to user country), supplier country to the world, and world to world:
dict_flows_share = {}
dict_total_share = {}
dict_world_share = {}

for str_iter_year in ser_flows.index.levels[0]:
    print(str_iter_year)
    gc.collect()
    df_iter_flows = ser_flows[str_iter_year].reset_index()
    ### Domestic flows and flows touching the '9999' industry are left out:
    df_iter_flows = df_iter_flows[(df_iter_flows['Supplier_Country'] != df_iter_flows['User_Country']) &
                                  (df_iter_flows['Supplier_Industry'] != '9999') & (df_iter_flows['User_Industry'] != '9999')]
    ### The single value column is taken explicitly, so later sums never depend on how non-numeric columns are treated:
    ser_iter_values = df_iter_flows.set_index(['Supplier_Country', 'User_Country', 'Supplier_Industry', 'User_Industry']).iloc[:, 0]
    ser_flows_share = ser_iter_values.astype('float32')
    ### group_keys = False keeps the original index on the normalised result in every pandas version
    ### (newer versions would otherwise put the group keys in front of it as extra levels):
    dict_flows_share[str_iter_year] = ser_flows_share.groupby(['Supplier_Country', 'User_Country', 'Supplier_Industry'], group_keys = False, observed = False)\
                                                     .apply(lambda ser_group: ser_group / ser_group.sum()).astype('float32').sort_index()
    ### Flows of each supplier country summed over all its user countries:
    ser_country_total = ser_iter_values.groupby(['Supplier_Country', 'Supplier_Industry', 'User_Industry'], observed = False).sum().sort_index()
    dict_total_share[str_iter_year] = ser_country_total.groupby(['Supplier_Country', 'Supplier_Industry'], group_keys = False, observed = False)\
                                                       .apply(lambda ser_group: ser_group / ser_group.sum()).astype('float32').sort_index()
    ### Flows summed over all supplier countries as well:
    ser_world_total = ser_country_total.groupby(['Supplier_Industry', 'User_Industry'], observed = False).sum().astype('float32').sort_index()
    dict_world_share[str_iter_year] = ser_world_total.groupby('Supplier_Industry', group_keys = False, observed = False)\
                                                     .apply(lambda ser_group: ser_group / ser_group.sum()).astype('float32').sort_index()

In [15]:
### WORLD LEVEL AGGREGATED DATA SAVING

### Dictionary keys (years) become the outermost index level; all levels are then labelled in one step:
list_world_levels = ['Date', 'Supplier_Industry', 'User_Industry']
ser_world_share = pd.concat(dict_world_share).rename_axis(list_world_levels)
ser_world_share.to_hdf(str_path_world_hdf, key = str_gics_key, mode = 'w', format = 'table', complevel = 9)

In [16]:
### COUNTRY LEVEL AGGREGATED DATA SAVING

### Dictionary keys (years) become the outermost index level; all levels are then labelled in one step:
list_country_levels = ['Date', 'Supplier_Country', 'Supplier_Industry', 'User_Industry']
ser_total_share = pd.concat(dict_total_share).rename_axis(list_country_levels)
ser_total_share.to_hdf(str_path_country_hdf, key = str_gics_key, mode = 'w', format = 'table', complevel = 9)

In [17]:
### BILATERAL SHARES SAVING

### Dictionary keys (years) become the outermost index level; all levels are then labelled in one step:
list_bilateral_levels = ['Date', 'Supplier_Country', 'User_Country', 'Supplier_Industry', 'User_Industry']
ser_flows_share = pd.concat(dict_flows_share).rename_axis(list_bilateral_levels)
ser_flows_share.to_hdf(str_path_flows_hdf, key = str_gics_key, mode = 'w', format = 'table', complevel = 9)

In [35]:
### LOADING WORLD TO WORLD SHARES

### World level shares are read back from the result file and the '2014' part is taken:
ser_world_share = pd.read_hdf(str_path_world_hdf)
ser_world_last = ser_world_share['2014']
### Both industry levels are expanded to every key of dict_gics_sub (first the user side, then the supplier side);
### stack(dropna = False) keeps the combinations that have no data as NaN rows:
ser_world_last = ser_world_last.unstack('Supplier_Industry').reindex(dict_gics_sub.keys()).stack(dropna = False)\
                               .unstack('User_Industry').reindex(dict_gics_sub.keys()).stack(dropna = False).sort_index()
### A supplier industry that has a substitute in dict_gics_sub receives a copy of the substitute's shares:
for iter_industry in dict_gics_sub:
    if (dict_gics_sub[iter_industry] is not None):
        ser_world_last.loc[iter_industry] = ser_world_last[dict_gics_sub[iter_industry]].values
### Whatever is still missing is written out as a zero share:
ser_world_last.fillna(0.0, inplace = True)
### Exported as a matrix: one row per supplier industry, one column per user industry:
ser_world_last.unstack('User_Industry').to_csv('Data_Files/Test_Files/total_shares.csv', sep = ',')

In [17]:
### LOADING COUNTRY TO WORLD SHARES

ser_total_share = pd.read_hdf(str_path_country_hdf)
### Spot check: '2014' shares of supplier 'CN', supplier industry '2520', over all user industries
### (the selection used to be written twice; only the last expression of a cell is displayed, so one copy is enough):
ser_total_share.loc[['2014'], ['CN'], ['2520']]

Date  Supplier_Country  Supplier_Industry  User_Industry
2014  CN                2520               1010             0.001204
                                           1510             0.044341
                                           2010             0.165106
                                           2020             0.048786
                                           2030             0.009868
                                           2510             0.034135
                                           2520             0.493182
                                           2530             0.035423
                                           2550             0.027016
                                           3020             0.007242
                                           3510             0.038905
                                           3520             0.002340
                                           4010             0.005388
                                           402

In [40]:
### LOADING BILATERAL SHARES

### NOTE(review): the reload below is commented out, so this cell uses whatever ser_flows_share is currently in memory
### (built earlier in the notebook) - confirm that this is intended before running the cell on a fresh kernel.
#ser_flows_share = pd.read_hdf(str_path_flows_hdf)
### Spot check: '2014' shares of supplier 'BR', user 'US', supplier industry '3020', over all user industries:
ser_flows_share.loc[['2014'], ['BR'], ['US'], ['3020'], :]

Date  Supplier_Country  User_Country  Supplier_Industry  User_Industry
2014  BR                US            3020               1010             0.004303
                                                         1510             0.085198
                                                         2010             0.027969
                                                         2020             0.009036
                                                         2030             0.000000
                                                         2510             0.001721
                                                         2520             0.010757
                                                         2530             0.040878
                                                         2550             0.010757
                                                         3020             0.767642
                                                         3510             0.017212
                

In [7]:
### EXTRACTION OF NEEDED DATA (TEST VERSION)

### Bilateral case:
str_year = '2014'
str_supplier = 'BR'
str_user = 'US'
str_industry = '2020'
### Country case:
#str_year = '2014'
#str_supplier = 'AU'
#str_user = 'RU'
#str_industry = '2020'
#### World case:
#str_year = '2014'
#str_supplier = 'MU'
#str_user = 'US'
#str_industry = '2020'
### Not found case:
#str_year = '2014'
#str_supplier = 'CN'
#str_user = 'US'
#str_industry = '1050'

gc.collect()
### Year control (requests outside 2000 - 2014 are moved to the nearest bound):
if int(str_year) < 2000:
    str_year = '2000'
    print('Year changed to:', str_year)
elif int(str_year) > 2014:
    str_year = '2014'
    print('Year changed to:', str_year)
### Industry control:
if str_industry in dict_gics_sub:
    if dict_gics_sub[str_industry] is not None:
        str_industry = dict_gics_sub[str_industry]
        print('Industry changed to:', str_industry)
    ### Datasets are tried from the most to the least specific one; each entry is
    ### (message, file, labels expected on the leading index levels of that file):
    list_lookup_chain = [('Bilateral dataset found:', str_path_flows_hdf, [str_year, str_supplier, str_user, str_industry]),
                         ('Common supplier dataset found:', str_path_country_hdf, [str_year, str_supplier, str_industry]),
                         ('World industry dataset found:', str_path_world_hdf, [str_year, str_industry])]
    for str_iter_message, str_iter_path, list_iter_labels in list_lookup_chain:
        ### The next file is only read when the previous dataset gave nothing:
        ser_iter_year = pd.read_hdf(str_iter_path, where = "Date=str_year")
        ### Rows are picked with a boolean mask rather than .loc[[...], [...]]: a label that is absent from the index
        ### then simply gives an empty selection, while .loc raises KeyError for it in newer pandas versions:
        arr_iter_mask = np.ones(len(ser_iter_year), dtype = bool)
        for int_iter_level, str_iter_label in enumerate(list_iter_labels):
            arr_iter_mask &= np.asarray(ser_iter_year.index.get_level_values(int_iter_level) == str_iter_label, dtype = bool)
        ser_iter_requested = ser_iter_year[arr_iter_mask].dropna()
        if (len(ser_iter_requested) > 0):
            print(str_iter_message)
            display(ser_iter_requested)
            break
    else:
        ### None of the three datasets has data for the request:
        print('Something goes wrong...')
else:
    print('Wrong Industry identifier')

Bilateral dataset found:


Date  Supplier_Country  User_Country  Supplier_Industry  User_Industry
2014  BR                US            2020               1010             0.005626
                                                         1510             0.047820
                                                         2010             0.142053
                                                         2020             0.213783
                                                         2030             0.035162
                                                         2510             0.008439
                                                         2520             0.018284
                                                         2530             0.064698
                                                         2550             0.139241
                                                         3020             0.018284
                                                         3510             0.071730
                

In [8]:
### EXTRACTION OF NEEDED DATA (PRODUCT VERSION)

def _select_by_leading_levels(ser_source, list_labels):
    """Return the non-NaN rows of ser_source whose leading index levels equal list_labels (empty when nothing matches)."""
    arr_mask = np.ones(len(ser_source), dtype = bool)
    for int_level, str_label in enumerate(list_labels):
        arr_mask &= np.asarray(ser_source.index.get_level_values(int_level) == str_label, dtype = bool)
    return ser_source[arr_mask].dropna()

def shares_extract(str_year, str_supplier, str_user, str_industry):
    """Return the user industry shares for a supplier industry, from the most specific dataset that has them.

    The bilateral dataset (supplier country to user country) is tried first, then the
    supplier country dataset, then the world dataset.

    Parameters
    ----------
    str_year : str
        Year label; values below 2000 or above 2014 are replaced by the nearest bound.
    str_supplier : str
        Supplier country code as used in the stored index.
    str_user : str
        User country code as used in the stored index.
    str_industry : str
        Supplier industry code; must be a key of dict_gics_sub. When dict_gics_sub
        holds a substitute for it, the substitute is used instead.

    Returns
    -------
    pandas.Series or None
        The matching rows with all index levels kept and NaN values dropped, or None
        when the industry is unknown or no dataset has data for the request.

    Raises
    ------
    ValueError
        If str_year cannot be converted to an integer.
    """
    gc.collect()
    ### Year control:
    if int(str_year) < 2000:
        str_year = '2000'
        print('Year changed to:', str_year)
    elif int(str_year) > 2014:
        str_year = '2014'
        print('Year changed to:', str_year)
    ### Industry control:
    if str_industry not in dict_gics_sub:
        return None
    if dict_gics_sub[str_industry] is not None:
        str_industry = dict_gics_sub[str_industry]
        print('Industry changed to:', str_industry)
    ### Rows are picked with a boolean mask rather than .loc[[...], [...]]: a label that is absent from the index
    ### then gives an empty selection (and the next dataset is tried), while .loc raises KeyError for it in newer pandas versions.
    ### Checking the bilateral dataset:
    ser_flows_year = pd.read_hdf(str_path_flows_hdf, where = "Date=str_year")
    ser_bilateral_requested = _select_by_leading_levels(ser_flows_year, [str_year, str_supplier, str_user, str_industry])
    if (len(ser_bilateral_requested) > 0):
        return ser_bilateral_requested
    ### Checking common supplier dataset:
    ser_country_year = pd.read_hdf(str_path_country_hdf, where = "Date=str_year")
    ser_country_requested = _select_by_leading_levels(ser_country_year, [str_year, str_supplier, str_industry])
    if (len(ser_country_requested) > 0):
        return ser_country_requested
    ### Checking world dataset for industry:
    ser_world_year = pd.read_hdf(str_path_world_hdf, where = "Date=str_year")
    ser_world_requested = _select_by_leading_levels(ser_world_year, [str_year, str_industry])
    if (len(ser_world_requested) > 0):
        return ser_world_requested
    ### Nothing found on any level:
    return None

In [9]:
### TEST

### Bilateral case (set here, so the call does not depend on variables left behind by another cell):
str_year = '2014'
str_supplier = 'BR'
str_user = 'US'
str_industry = '2020'
#### Country case:
#str_year = '2014'
#str_supplier = 'AU'
#str_user = 'RU'
#str_industry = '2020'
#### World case:
#str_year = '2014'
#str_supplier = 'MU'
#str_user = 'US'
#str_industry = '2020'
#### Not found case:
#str_year = '2014'
#str_supplier = 'CN'
#str_user = 'US'
#str_industry = '1050'

shares_extract(str_year, str_supplier, str_user, str_industry)

Date  Supplier_Country  User_Country  Supplier_Industry  User_Industry
2014  BR                US            2020               1010             0.005626
                                                         1510             0.047820
                                                         2010             0.142053
                                                         2020             0.213783
                                                         2030             0.035162
                                                         2510             0.008439
                                                         2520             0.018284
                                                         2530             0.064698
                                                         2550             0.139241
                                                         3020             0.018284
                                                         3510             0.071730
                