In [1]:
### INTERACTION VARIABLE COMPONENTS

In [2]:
### RUN EVERY TIME: INITIALIZATION

import pandas as pd
import numpy as np
import gc
import os
import datetime
import time
import networkx as nx

In [3]:
### RUN EVERY TIME: VERSION CONTROL

from platform import python_version
print('python version: ', python_version())
print('numpy version: ', np.__version__)
print('pandas version: ', pd.__version__)

python version:  3.7.4
numpy version:  1.17.2
pandas version:  0.25.3


In [9]:
### RUN EVERY TIME: MAIN CONSTANTS

### MultiIndex level slice constant:
All = slice(None)
### Universe path:
str_path_universe = 'Data_Files/Source_Files/acadian_universe.xlsx'
### Dates:
str_date_end = '2022-12-31'
date_start = pd.Timestamp('1989-12-29')
date_end = pd.Timestamp(str_date_end)
date_ison = pd.Timestamp('1994-12-31')
### NA for MS Excel files:
list_na_excel_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null',
                        '#N/A Requesting Data...', '#N/A Invalid Security', '#N/A Field Not Applicable', '---']
### Checked EBOPS service IDs list (df_serv_to_gics['GICS Group Code']):
list_services = ['206', '210', '214', '218', '219', '223', '227', '231', '232', '237', '240', '246', '247', '250', '251', '254', '255', '256', '257', '258', '263',
                 '264', '269', '272', '273', '288', '289', '292', '293', '294', '310', '391', '431', '500', '888', '891', '892', '894', '950']
### Augmented bilateral export:
str_path_export_bilateral = 'Data_Files/Source_Files/comtrade_export_bilateral.h5'
### Export key:
str_key_unc_export = 'export_augmented'
### Export Quality Index:
str_path_imf_quality = 'Data_Files/Source_Files/imf_export_quality.h5'
### Export key:
str_key_imf_eq = 'export_quality'
### Augmented bilateral import:
str_path_import_bilateral = 'Data_Files/Source_Files/comtrade_import_bilateral.h5'
### Import key:
str_key_unc_import = 'import_augmented'
### Trade Value Index:
str_path_imf_trade = 'Data_Files/Source_Files/imf_trade_value.h5'
### Export key:
str_key_imf_trade = 'trade_value'
### HS to SITC Conversion Map:
str_path_commodity_map_xlsx = 'Data_Files/Source_Files/hs_to_sitc.xlsx'
str_page_map = 'HS to SITC'
### Augmented bilateral import:
str_path_gdp = 'Data_Files/Source_Files/gdp_dataset.h5'
### Interaction variable file:
str_path_interact_hdf = 'Data_Files/Source_Files/comtrade_interact_exp.h5'
str_path_interact_csv = 'Data_Files/Source_Files/comtrade_interact_exp.csv'
str_key_interact_var = 'interact_var'

In [5]:
### DEFINING WEIGHTED AVERAGE CALCULATOR

def weighted_average(ser_data, ser_weight = None, int_min_count = 0):
    ### Default output:
    num_result = np.NaN
    ### Checking for data presence:
    if (ser_data.count() > int_min_count):       
        ### Checking for weights dataset:
        if ser_weight is None:
            ### Calculating of simple average:
            num_result = np.nanmean(ser_data.values)
        else:
            ### Weights filtering:
            list_weight = ser_weight[ser_data.dropna().index].values
            ### Checking for weights presence:
            if np.nansum(list_weight):
                ### Data filtering:
                list_data = ser_data.dropna().values
                ### Weighted average calculating:
                num_result = np.nansum(list_data * list_weight) / np.nansum(list_weight)
    ### Results output:
    return num_result

In [6]:
### DEFINING EXTRACTION UNIVERSE DATA FROM MS EXCEL SOURCE (TO BE IGNORED IN PRODUCT CODE)

def ison_membership_converting(str_path_universe, date_end, bool_daily = False, int_backfill_months = 0):
    ### Defining business-month-end reindexation on country level:
    def country_modify(ser_raw_country, date_end):
        ser_res_country = ser_raw_country.droplevel(0).resample('MS').last().resample('BM').last()
        range_country = pd.date_range(ser_res_country.index[0], date_end, freq = 'BM')
        return ser_res_country.reindex(range_country).ffill()
    ### Markets encoding table:
    dict_markets = {50 : 'DM', 57 : 'EM', 504 : 'FM', 0: np.NaN}     
    ### Loading source file:
    df_raw_universe = pd.read_excel(engine = 'openpyxl', io = str_path_universe, sheet_name = 'Switchers', header = 0, parse_dates = True, index_col = [0, 1],
                                 na_values = list_na_excel_values, keep_default_na = False)
    ### Converting source file:
    df_raw_universe.index.names = ['Country', 'Date']
    ser_raw_universe = df_raw_universe['Region']
    ser_raw_universe.fillna(0, inplace = True)
    ser_raw_universe.name = 'Market'
    ### By country reindexation and translation:
    ser_res_universe = ser_raw_universe.groupby('Country').apply(country_modify, date_end)
    ser_res_universe.index.names = ['Country', 'Date']
    ser_res_universe = ser_res_universe.replace(dict_markets).reorder_levels([1, 0]).sort_index() 
    ### Expanding membership for primary regions members by backfilling:
    if int_backfill_months:
        ### List of regions:
        list_region = list(ser_res_universe.dropna().unique())
        ### Initialising of collection of series with backfilled data for each region:
        list_ison_backfill = []
        ### Regions looping:
        for iter_region in list_region:
            ### Defining start of region date:
            date_first_valid = ser_res_universe.loc[ser_res_universe == iter_region].first_valid_index()[0]
            ### Creating dates index to backfilling:
            idx_date_backfill = pd.date_range(end = date_first_valid, periods = int_backfill_months + 1, freq = 'BM')[: -1]
            ### Creating primary countries index to backfilling:            
            idx_region_backfill = ser_res_universe.loc[ser_res_universe == iter_region].loc[date_first_valid, All].index.get_level_values('Country')
            ### Creating full index:
            idx_ison_backfill = pd.MultiIndex.from_product([idx_date_backfill, idx_region_backfill])
            ### Series with backfilled data:
            list_ison_backfill.append(pd.Series(iter_region, index = idx_ison_backfill))
        ### Combination of backfilled series and original ISON data:    
        ser_res_universe = ser_res_universe.combine_first(pd.concat(list_ison_backfill, axis = 0)).sort_index()  
        ser_res_universe.index.names = ['Date', 'Country']
    ### Converting to daily frequency:
    if bool_daily:
        ser_res_universe = ser_res_universe.reset_index('Country').groupby('Country').resample('B').ffill()['Market'].swaplevel().sort_index()    
    ### Results output:
    ser_res_universe.name = 'Market'
    return ser_res_universe

In [7]:
### RUN EVERY TIME: COMMON DATA EXTRACTION STEPS

### ISON membership history:
ser_ison_membership = ison_membership_converting(str_path_universe, pd.to_datetime(str_date_end))
ser_ison_membership.index.names = ['Date', 'Country']
### ISON Members:
list_ison_countries = sorted(ser_ison_membership.index.get_level_values('Country').unique())
### ISON status for the last available date:
ser_ison_status = ser_ison_membership.loc[ser_ison_membership.index[-1][0]]

In [8]:
### BILATERAL EXPORT DATA LOADING TO PERFORM FACTOR CALCULATION

gc.collect()
list_export_chunks = []
for num_iter_number, ser_iter_chunk in enumerate(pd.read_hdf(str_path_export_bilateral, key = str_key_unc_export, chunksize = 1000000)):
    gc.collect()
    print(num_iter_number, ': Extraction started')
    ser_iter_chunk = ser_iter_chunk[ser_iter_chunk > 0.0].astype('int32')
    df_iter_chunk = ser_iter_chunk.reset_index()
    df_iter_chunk = df_iter_chunk[(df_iter_chunk['Reporter'] != df_iter_chunk['Partner']) & \
                                  ((df_iter_chunk['Type'] == 'Goods') | df_iter_chunk['Commodity_ID'].isin(list_services)) & \
                                  (df_iter_chunk['Reporter'] != 'World') & \
                                  (df_iter_chunk['Partner'] != 'World')]\
                               .drop('Type', axis = 1)    
    print(num_iter_number, ': Filtering performed')    
    ser_iter_chunk = df_iter_chunk.set_index(['Date', 'Reporter', 'Partner', 'Commodity_ID']).squeeze().sort_index()
    del df_iter_chunk
    gc.collect()
    list_export_chunks.append(ser_iter_chunk)
    print(num_iter_number, ': Chunk added to container')    
ser_bilateral_export = pd.concat(list_export_chunks, axis = 0, sort = False).sort_index()
ser_bilateral_export.name = 'Export'
del list_export_chunks
gc.collect()

76

In [None]:
### BILATERAL IMPORT DATA LOADING TO PERFORM FACTOR CALCULATION

gc.collect()
list_import_chunks = []
for num_iter_number, ser_iter_chunk in enumerate(pd.read_hdf(str_path_import_bilateral, key = str_key_unc_import, chunksize = 1000000)):
    gc.collect()
    print(num_iter_number, ': Extraction started')
    ser_iter_chunk = ser_iter_chunk[ser_iter_chunk > 0.0].astype('int32')
    df_iter_chunk = ser_iter_chunk.reset_index()
    df_iter_chunk = df_iter_chunk[(df_iter_chunk['Reporter'] != df_iter_chunk['Partner']) & \
                                  ((df_iter_chunk['Type'] == 'Goods') | df_iter_chunk['Commodity_ID'].isin(list_services)) & (df_iter_chunk['Reporter'] != 'World') & \
                                  (df_iter_chunk['Partner'] != 'World')]\
                               .drop('Type', axis = 1)     
    print(num_iter_number, ': Filtering performed')    
    ser_iter_chunk = df_iter_chunk.set_index(['Date', 'Reporter', 'Partner', 'Commodity_ID']).squeeze().sort_index()
    del df_iter_chunk
    gc.collect()
    list_import_chunks.append(ser_iter_chunk)
    print(num_iter_number, ': Chunk added to container')    
ser_bilateral_import = pd.concat(list_import_chunks, axis = 0, sort = False).sort_index()
ser_bilateral_import.name = 'Import'
del list_import_chunks
gc.collect()

In [14]:
### REPORTER / COMMODITY BY DATE TOTAL EXPORT & IMPORT & TRADE

gc.collect()
### Export totals:
ser_country_comm_export = ser_bilateral_export.groupby(['Date', 'Reporter', 'Commodity_ID']).sum().dropna()
ser_country_comm_export.name = 'Export'
### Import totals:
ser_country_comm_import = ser_bilateral_import.groupby(['Date', 'Reporter', 'Commodity_ID']).sum().dropna()
ser_country_comm_import.name = 'Import'

In [15]:
### IMF EXPORT TRADE DATA LOADING

gc.collect()

ser_trade_data = pd.read_hdf(path_or_buf = str_path_imf_trade, key = str_key_imf_trade).reorder_levels(['Date', 'Country', 'SITC_ID']).sort_index()
#ser_trade_data = ser_trade_data.to_frame().join(ser_ison_status).set_index('Market', append = True).squeeze()
ser_trade_data = ser_trade_data.drop('ag', axis = 0, level = 'SITC_ID') / 1000
ser_trade_data.name = 'Trade_Value'

In [16]:
### IMF EXPORT QUALITY DATA LOADING

gc.collect()
ser_quality_data = pd.read_hdf(path_or_buf = str_path_imf_quality, key = str_key_imf_eq).reorder_levels(['Date', 'Country', 'SITC_ID']).sort_index()
#ser_quality_data = ser_quality_data.to_frame().join(ser_ison_status).set_index('Market', append = True).squeeze()
ser_quality_data = ser_quality_data.drop('ag', axis = 0, level = 'SITC_ID')

In [17]:
### IMF DATA CONNECTION:

df_quality_data = ser_quality_data.to_frame().join(ser_trade_data)

In [18]:
### HS TO SITC CONVERSION MAP LOADING

df_raw_conversion = pd.read_excel(engine = 'openpyxl', io = str_path_commodity_map_xlsx, sheet_name = str_page_map, header = 0, index_col = None,
                                 na_values = list_na_excel_values, keep_default_na = False, dtype = str)
df_comm_conversion = df_raw_conversion.set_index('Commodity ID')[['SITC Correspondent 1', 'SITC Correspondent 2', 'SITC Correspondent 3']]
ser_comm_conversion = df_comm_conversion.stack().droplevel(-1)
ser_comm_conversion.index.names = ['Commodity_ID']
ser_comm_conversion.name = 'SITC_ID'

In [19]:
### CONVERTING FROM SITC CLASSIFICATION TO HS CLASSIFICATION AND ADDING COMTRADE EXPORT FLOWS

gc.collect()

def conditional_average(df_group):
    if (len(df_group) > 1):
        flo_result = weighted_average(df_group['Quality'], df_group['Trade_Value'])
    else:
        flo_result = df_group['Quality'].values[0]
    return flo_result

### Adding mapping table:
df_quality_comtrade = df_quality_data.join(ser_comm_conversion.to_frame().set_index('SITC_ID', append = True)).reset_index('Commodity_ID')\
                                     .dropna(subset = ['Commodity_ID']).set_index(['Commodity_ID'], append = True)\
                                     .reorder_levels(['Date', 'Country', 'Commodity_ID', 'SITC_ID']).sort_index()
### Performoing weighted average for "n -> 1" conversions:
df_quality_comtrade = df_quality_comtrade.groupby(['Date', 'Country', 'Commodity_ID']).apply(conditional_average).to_frame()
df_quality_comtrade.index.set_names('Reporter', level = 'Country', inplace = True)
df_quality_comtrade.columns = ['Quality']
### Adding ComTrade Export flows:
df_quality_comtrade = df_quality_comtrade.join(ser_country_comm_export, how = 'right').dropna(subset = ['Export'])
### Adding regional connections:
ser_ison_status.index.name = 'Reporter'
df_quality_comtrade = df_quality_comtrade.join(ser_ison_status).dropna(subset = ['Market']).set_index('Market', append = True)
ser_ison_status.index.name = 'Country'

In [None]:
### INTERACTION VARIABLE PREPARATION

gc.collect()
### GDP annualization:
ser_gdp = pd.read_hdf(str_path_gdp)
ser_gdp_ann = (ser_gdp.unstack('Country').resample('BM').bfill() / 12).rolling(12).sum().loc[date_start: date_end].stack('Country')
ser_gdp_ann.index.names = ['Date', 'Reporter']
### Total export annualization:
ser_country_comm_export_add = ser_country_comm_export.groupby(['Reporter', 'Commodity_ID'], group_keys = False)\
                                    .apply(lambda ser_group: ser_group.append(pd.Series(ser_group[0], 
                                    index = [(ser_group.index[0][0] - pd.offsets.BYearEnd(), ser_group.index[0][1], ser_group.index[0][2])])))\
                                    .sort_index()
ser_country_comm_exp_ann = ser_country_comm_export_add.groupby(['Date', 'Reporter']).sum()
ser_country_comm_exp_ann = (ser_country_comm_exp_ann.unstack('Reporter').resample('BM').bfill() / 12).rolling(12).sum().loc[date_start: date_end].stack('Reporter')
ser_country_comm_exp_ann.index.names = ['Date', 'Reporter']
gc.collect()
### Total qualified export annualization:
list_qualified_comm = sorted(df_quality_comtrade.dropna().index.get_level_values('Commodity_ID').unique())
ser_country_comm_export_qual = ser_country_comm_export_add.loc[:, :, list_qualified_comm]
ser_country_comm_exp_ann_qual = ser_country_comm_export_qual.groupby(['Date', 'Reporter']).sum()
ser_country_comm_exp_ann_qual = (ser_country_comm_exp_ann_qual.unstack('Reporter').resample('BM').bfill() / 12).rolling(12).sum()\
                                                              .loc[date_start: date_end].stack('Reporter')
ser_country_comm_exp_ann_qual.index.names = ['Date', 'Reporter']
gc.collect()
### Total import annualization:
ser_country_comm_import_add = ser_country_comm_import.groupby(['Reporter', 'Commodity_ID'], group_keys = False)\
                                    .apply(lambda ser_group: ser_group.append(pd.Series(ser_group[0], 
                                    index = [(ser_group.index[0][0] - pd.offsets.BYearEnd(), ser_group.index[0][1], ser_group.index[0][2])])))\
                                    .sort_index()
ser_country_comm_imp_ann = ser_country_comm_import_add.groupby(['Date', 'Reporter']).sum()
ser_country_comm_imp_ann = (ser_country_comm_imp_ann.unstack('Reporter').resample('BM').bfill() / 12).rolling(12).sum().loc[date_start: date_end].stack('Reporter')
ser_country_comm_imp_ann.index.names = ['Date', 'Reporter']
gc.collect()
### Saving results:
df_interaction = pd.concat([ser_gdp_ann, ser_country_comm_exp_ann, ser_country_comm_exp_ann_qual, ser_country_comm_imp_ann], axis = 1)
df_interaction.columns = ['GDP_annualized', 'Export_annualized', 'Qualified_Export_annualized', 'Import_annualized']
df_interaction = df_interaction.loc[(All, ser_ison_status.index), :]
df_interaction.to_hdf(str_path_interact_hdf, str_key_interact_var)
df_interaction.to_csv(str_path_interact_csv, sep = ';', header = True)

In [52]:
### TEMP

pd.read_csv(str_path_interact_var, sep = ';').dropna()

Unnamed: 0,Date,Reporter,GDP_annualized,Export_annualized,Qualified_Export_annualized
1,1989-12-29,AE,4.146500e+10,8.875251e+06,8.852485e+06
8,1989-12-29,AR,7.663690e+10,2.664753e+06,2.653852e+06
9,1989-12-29,AT,1.331058e+11,4.901008e+06,4.887679e+06
10,1989-12-29,AU,2.998779e+11,2.922174e+07,2.908642e+07
13,1989-12-29,BD,2.878171e+10,1.126162e+06,1.124362e+06
...,...,...,...,...,...
83673,2021-12-31,UG,4.052979e+10,2.494386e+06,2.487020e+06
83674,2021-12-31,US,2.331508e+13,2.141827e+09,1.701382e+09
83681,2021-12-31,VN,3.661376e+11,3.331721e+08,3.326549e+08
83686,2021-12-31,ZA,4.190150e+11,1.139845e+08,1.121712e+08
