In [1]:
### WIOT MATRICES CONVERTATION

In [2]:
### INITIALIZATION

import pandas as pd
import numpy as np
import os
import gc

In [3]:
### RUN EVERY TIME: VERSION CONTROL

from platform import python_version
print('python version: ', python_version())
print('numpy version: ', np.__version__)
print('pandas version: ', pd.__version__)

python version:  3.7.4
numpy version:  1.17.2
pandas version:  0.25.3


In [4]:
### PARAMETERS

### MultiIndex level slice constant:
All = slice(None)
### NA for MS Excel files:
list_na_excel_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null',
                        '#N/A Requesting Data...', '#N/A Invalid Security', '#N/A Field Not Applicable']
### Industry mapping path:
str_path_industry_map = 'Data_Files/Source_Files/WIOT_to_GICS_mapping.xlsx'
str_sheet_mapping = 'WIOT Mapping'
str_sheet_full_gics = 'GICS List'
### Path to original WIOT Tables:
str_path_wiot_source = 'Data_Files/Source_Files/WIOT'
### Downloaded and aggregated shares:
str_path_totals_hdf = 'Data_Files/Result_Files/gics_totals_share.h5'
str_path_flows_hdf = 'Data_Files/Result_Files/gics_flows_share.h5'
str_gics_key = 'gics_share'

In [5]:
### WIOD TO GICS INDUSTRY MAPPING

dict_industry_mapper = pd.read_excel(engine = 'openpyxl', io = str_path_industry_map, sheet_name = str_sheet_mapping, header = [0], index_col = 0)\
                         .squeeze().astype(str).to_dict()
dict_gics_sub = pd.read_excel(engine = 'openpyxl', io = str_path_industry_map, sheet_name = str_sheet_full_gics, header = [0], index_col = None).astype(str)\
                  .replace({'None': None}).set_index('Industry Group').squeeze().to_dict()

In [6]:
### DEFINING COUNTRY CODES EXTRACTOR

def get_country_codes(use_local_copy = False):  
    ### In case if URL is unavailable:
    if (use_local_copy):
        url_country_code = 'Data_Files/Source_Files/countrycode.html'
    ### Online extraction:
    else:
        url_country_code = 'https://countrycode.org/'
    df_full_codes = pd.read_html(url_country_code, index_col = 'COUNTRY')[0]
    df_full_codes[['ISO SHORT', 'ISO LONG']] = df_full_codes['ISO CODES'].str.split(' / ', expand = True)
    df_result = df_full_codes[['ISO SHORT', 'ISO LONG']].sort_index()    
    df_result.index = df_result.index.str.upper()
    ### Results output:
    return df_result

### World Country Codes:
df_country_codes = get_country_codes()
dict_ison_mapper = df_country_codes.set_index('ISO LONG').squeeze().to_dict()
dict_ison_mapper['ROW'] = 'YY'

In [80]:
### DATA TABLES LOADING

dict_flows = {}
dict_totals = {}
for year_matrix_csv in os.listdir(str_path_wiot_source):
#for year_matrix_csv in os.listdir(str_path_wiot_source)[::-1]:    
#for year_matrix_csv in [os.listdir(str_path_wiot_source)[0]] + [os.listdir(str_path_wiot_source)[-1]]:
    gc.collect()
#    dt_year_end = (pd.to_datetime(year_matrix_csv[4 : 8]) + pd.offsets.BYearEnd()).date()
    str_year_num = year_matrix_csv[4 : 8]
    print(str_year_num)
    df_year_raw = pd.read_csv(str_path_wiot_source + '/' + year_matrix_csv, sep = ';', index_col = [0, 1, 2, 3], skiprows = [0, 1, 2, 3], header = [0, 1], 
                                        na_values = list_na_excel_values, keep_default_na = False)
    ser_year_raw = df_year_raw.droplevel([0, 1]).stack([0, 1]).astype('int32')
    ser_year_raw.name = 'Value'
    ser_year_raw.index.names = ['Supplier_Country_Long', 'Supplier_Industry_r_named', 'User_Country_Long', 'User_Industry_c_named']
    df_year_typed = ser_year_raw.reset_index()
    del df_year_raw
    del ser_year_raw
    gc.collect()
    df_year_typed = df_year_typed[df_year_typed['Supplier_Country_Long'].isin(df_country_codes['ISO LONG'].to_list() + ['ROW']) & 
                                  df_year_typed['User_Country_Long'].isin(df_country_codes['ISO LONG'].to_list() + ['ROW'])]
    df_year_typed['Supplier_Country'] = df_year_typed['Supplier_Country_Long'].replace(dict_ison_mapper)
    df_year_typed['User_Country'] = df_year_typed['User_Country_Long'].replace(dict_ison_mapper)    
    df_year_typed['User_Industry_r_named'] = df_year_typed['User_Industry_c_named'].str.replace('c', 'r')
    df_year_typed = df_year_typed[df_year_typed['Supplier_Industry_r_named'].isin(dict_industry_mapper.keys()) & 
                                  df_year_typed['User_Industry_r_named'].isin(dict_industry_mapper.keys())]    
    df_year_typed['User_Industry'] = df_year_typed['User_Industry_r_named'].replace(dict_industry_mapper)
    df_year_typed['Supplier_Industry'] = df_year_typed['Supplier_Industry_r_named'].replace(dict_industry_mapper)
    df_year_typed['Supplier_Country'] = df_year_typed['Supplier_Country'].astype('category')
    df_year_typed['User_Country'] = df_year_typed['User_Country'].astype('category')
    df_year_typed['User_Industry'] = df_year_typed['User_Industry'].astype('category')
    df_year_typed['Supplier_Industry'] = df_year_typed['Supplier_Industry'].astype('category')
    df_year_typed = df_year_typed[['Supplier_Country', 'User_Country', 'Supplier_Industry', 'User_Industry', 'Value']]
    ser_year_gics = df_year_typed.groupby(['Supplier_Country', 'User_Country', 'Supplier_Industry', 'User_Industry']).sum().squeeze().sort_index()
    ser_flow_share = ser_year_gics.groupby(['Supplier_Country', 'User_Country', 'Supplier_Industry'])\
                                  .apply(lambda ser_group: ser_group / ser_group.sum()).sort_index()
    ser_year_total = ser_year_gics.groupby(['Supplier_Industry', 'User_Industry']).sum()
    ser_total_share = ser_year_total.groupby('Supplier_Industry').apply(lambda ser_group: ser_group / ser_group.sum()).sort_index()
    dict_flows[str_year_num] = ser_flow_share
    dict_totals[str_year_num] = ser_total_share    
    break

2000


In [11]:
### DATA AGGREGATION & SAVING

gc.collect()

ser_totals = pd.concat(dict_totals, names = ['Date'])
ser_flows = pd.concat(dict_flows, names = ['Date'])

#ser_totals.to_hdf(str_path_totals_hdf, key = str_gics_key, mode = 'w', format = 'table', complevel = 9)
#ser_flows.to_hdf(str_path_flows_hdf, key = str_gics_key, mode = 'w', format = 'table', complevel = 9)

In [7]:
### DATA LOADING

gc.collect()

ser_totals = pd.read_hdf(str_path_totals_hdf, key = str_gics_key)
ser_flows = pd.read_hdf(str_path_flows_hdf, key = str_gics_key, where = "Date in ['2000', '2014']")

Date  Supplier_Country  User_Country  Supplier_Industry  User_Industry
2014  AT                AT            1010               1010             0.021919
                                                         1510             0.202607
                                                         2010             0.123223
                                                         2020             0.045616
                                                         2030             0.267773
                                                                            ...   
      YY                YY            6010               5010             0.024462
                                                         5020             0.001618
                                                         5510             0.003282
                                                         6010             0.053618
                                                         9999             0.000000
Name: Value, Len

In [10]:
### TEST: COMPARISION WITH ORIGINAL RESULTS

for year_matrix_csv in os.listdir(str_path_wiot_source)[::-1]:
    gc.collect()
#    dt_year_end = (pd.to_datetime(year_matrix_csv[4 : 8]) + pd.offsets.BYearEnd()).date()
    str_year_num = year_matrix_csv[4 : 8]
    print(str_year_num)
    df_year_raw = pd.read_csv(str_path_wiot_source + '/' + year_matrix_csv, sep = ';', index_col = [0, 1, 2, 3], skiprows = [0, 1, 2, 3], header = [0, 1], 
                                        na_values = list_na_excel_values, keep_default_na = False)
    ser_year_raw = df_year_raw.droplevel([0, 1]).stack([0, 1]).astype('int32')
    ser_year_raw.name = 'Value'
    ser_year_raw.index.names = ['Supplier_Country_Long', 'Supplier_Industry_r_named', 'User_Country_Long', 'User_Industry_c_named']
    df_year_typed = ser_year_raw.reset_index()
    del df_year_raw
    del ser_year_raw
    gc.collect()
    df_year_typed = df_year_typed[df_year_typed['Supplier_Country_Long'].isin(df_country_codes['ISO LONG'].to_list() + ['ROW']) & 
                                  df_year_typed['User_Country_Long'].isin(df_country_codes['ISO LONG'].to_list() + ['ROW'])]
    df_year_typed['Supplier_Country'] = df_year_typed['Supplier_Country_Long'].replace(dict_ison_mapper)
    df_year_typed['User_Country'] = df_year_typed['User_Country_Long'].replace(dict_ison_mapper)    
    df_year_typed['User_Industry_r_named'] = df_year_typed['User_Industry_c_named'].str.replace('c', 'r')
    df_year_typed = df_year_typed[df_year_typed['Supplier_Industry_r_named'].isin(dict_industry_mapper.keys()) & 
                                  df_year_typed['User_Industry_r_named'].isin(dict_industry_mapper.keys())]     
    break

2014


In [11]:
### TEST: COMPARISION WITH ORIGINAL RESULTS

print(ser_totals['2014'])

df_test = df_year_typed[df_year_typed['Supplier_Industry_r_named'] == 'r10']\
                .drop(['Supplier_Country_Long', 'User_Country_Long', 'Supplier_Industry_r_named'], axis = 1)
ser_test = df_test.groupby(['User_Industry_c_named'])['Value'].sum()
ser_test = ser_test / ser_test.sum()
ser_test[['c2', 'c4', 'c7', 'c8', 'c11', 'c13', 'c14', 'c15', 'c16']].sum()

Supplier_Industry  User_Industry
1010               1010             0.106129
                   1510             0.266885
                   2010             0.142933
                   2020             0.038144
                   2030             0.234523
                                      ...   
9999               5010             0.044650
                   5020             0.041054
                   5510             0.013160
                   6010             0.051007
                   9999             0.001555
Name: Value, Length: 441, dtype: float64


0.2668847964453599

In [13]:
### TEMP

gc.collect()

ser_totals.unstack('Date').groupby('Supplier_Industry').corr().loc[(All, '2000'), '2014'].droplevel('Date').sort_values()

Supplier_Industry
9999    0.875926
5510    0.888830
4010    0.899005
4510    0.924310
1010    0.951680
5010    0.974731
6010    0.977956
2530    0.978421
5020    0.981946
3520    0.983347
4020    0.984673
2020    0.985048
4030    0.991998
1510    0.994211
2030    0.994339
2550    0.995379
2510    0.995824
2010    0.996508
3510    0.997390
2520    0.998495
3020    0.998793
Name: 2014, dtype: float64

In [26]:
### TEST: USAGE BY COUNTRY

gc.collect()

#ser_us_usage = ser_flows.loc[:, :, 'US', :, :]
#ser_us_usage.unstack('Date').groupby([''])
ser_flows_corr = ser_flows.unstack('Date').groupby(['Supplier_Country', 'Supplier_Industry', 'User_Country']).corr()\
                         .loc[(All, All, All, '2000'), '2014'].droplevel(-1).dropna().squeeze()

In [32]:
### TEST: USAGE BY COUNTRY

gc.collect()

display(ser_flows_corr[:, :, 'US'].groupby('Supplier_Industry').mean())
display(ser_flows_corr[:, :, 'CN'].groupby('Supplier_Industry').mean())

Supplier_Industry
1010    0.863275
1510    0.943401
2010    0.975733
2020    0.837579
2030    0.930328
2510    0.973768
2520    0.718107
2530    0.741221
2550    0.856330
3020    0.956410
3510    0.774251
3520    0.957932
4010    0.751328
4020    0.791415
4030    0.958393
4510    0.768470
5010    0.756744
5020    0.664567
5510    0.801528
6010    0.925030
9999    0.643010
Name: 2014, dtype: float64

Supplier_Industry
1010    0.854463
1510    0.935912
2010    0.892659
2020    0.795444
2030    0.793187
2510    0.889623
2520    0.857885
2530    0.449694
2550    0.588704
3020    0.877410
3510    0.618306
3520    0.805080
4010    0.504852
4020    0.143882
4030    0.538165
4510    0.423300
5010    0.718640
5020    0.607631
5510    0.831504
6010    0.256072
Name: 2014, dtype: float64

In [97]:
### SUPPLY DISTRIBUTION:

str_supplier = 'BY'
str_user = 'KR'
str_export_industry = '9999'
str_year = '2010'
#def supply_distrib(str_supplier, str_user, str_export_industry, str_year = '2014'):
if True:
    if str_export_industry not in dict_industry_mapper.values():
        str_export_industry = dict_gics_sub[str_export_industry]
    ser_flow = pd.read_hdf(str_path_flows_hdf, key = str_gics_key, where = "(Date in [str_year])").droplevel('Date')

In [99]:
### TEMP

ser_flow[str_supplier, str_user, str_export_industry]

IndexingError: Too many indexers