In [1]:
### RUN EVERY TIME: GRAVITY DATASETS EXPLORING

In [2]:
### RUN EVERY TIME: INITIALIZATION

import pandas as pd
import numpy as np
import math
import gc
import datetime
import os

In [3]:
### RUN EVERY TIME: VERSION CONTROL

### Printing the pandas and interpreter versions so that saved outputs can be tied to an environment:
from platform import python_version
print('pandas version: ', pd.__version__)
print('python version: ', python_version())

pandas version:  0.25.3
python version:  3.7.4


In [4]:
### RUN EVERY TIME: MAIN CONSTANTS

### Selector that takes a whole MultiIndex level:
All = slice(None)
### Market regions:
list_regions = ['DM', 'EM', 'FM']
### Universe workbook:
str_path_universe = 'Data_Files/Source_Files/acadian_universe.xlsx'
### Display names of activities keyed by dataset code:
dict_activity = {
    'imf_dots': 'Trade Export',
    'imf_cpis': 'Portfolio Investment',
    'oecd_fdi': 'Direct Investment',
    'bis_lbs': 'Bank Lending',
    'gravity': 'Gravity',
}
### Source stores, written as (file path, key inside the file) pairs:
### CEPII distances:
str_path_cepii_dataset, str_distance_dataset = 'Data_Files/Source_Files/cepii_dataset.h5', 'distance_dataset'
### WB WDI GDP:
str_path_wb_gdp_dataset, str_wb_gdp_dataset = 'Data_Files/Source_Files/gdp_dataset.h5', 'gdp_dataset'
### BIS Loans:
str_path_bis_lbs_combined, str_full_bis_lbs_combined = 'Data_Files/Source_Files/bis_combined.h5', 'bis_full_combined'
### IMF CPIS (full and filtered):
str_path_imf_cpis_combined, str_full_imf_cpis_combined = 'Data_Files/Source_Files/cpis_combined.h5', 'cpis_full_combined'
str_path_imf_cpis_filtered, str_key_imf_cpis_filtered = 'Data_Files/Source_Files/cpis_filtered.h5', 'cpis_filtered'
### IMF DOTS (bilateral and world export):
str_path_imf_dots_combined, str_full_imf_dots_combined = 'Data_Files/Source_Files/dots_combined.h5', 'dots_full_combined'
str_path_imf_dots_world, str_full_imf_dots_world = 'Data_Files/Source_Files/dots_world_export.h5', 'dots_world_export'
### OECD FDI (combined and net outward):
str_path_oecd_fdi_combined, str_full_oecd_fdi_combined = 'Data_Files/Source_Files/oecd_combined.h5', 'oecd_full_combined'
str_path_direct_out_net, str_key_direct_out_net = 'Data_Files/Source_Files/direct_outward_net.h5', 'direct_outward'
### Date boundaries:
str_date_end = '2021-12-31'
date_start, date_end, date_ison = pd.Timestamp('1989-12-29'), pd.Timestamp(str_date_end), pd.Timestamp('1994-12-31')
### Exponent applied to distance in the gravity formula:
flo_dist_power = 0.5
### Bloomberg returns (HDF store, its key and the CSV alternative):
str_path_bb_hdf, str_key_ret_daily = 'Data_Files/Source_Files/Bloomberg_prepared.h5', 'bb_ret_daily'
str_ret_daily_csv_path = 'Data_Files/Source_Files/ret_daily.csv'
### Stores of annualized activities and of their weights:
str_path_act_annualized = 'Data_Files/Source_Files/datasets_annualized.h5'
str_path_act_weights = 'Data_Files/Source_Files/datasets_weights.h5'
str_path_world_export_annualized, str_key_world_export_annualized = 'Data_Files/Source_Files/world_export_annualized.h5', 'world_export_ann'
### Herfindahl index threshold (1.1 was the previously tried value) and store of saved indices:
flo_hi_threshold = 0.0
str_path_herfindahl = 'Data_Files/Source_Files/herfindahl_indices.h5'
### Returns averaging window and half-life, in months:
int_ave_months, int_halflife_months = 6, 2
### Thresholds for weighted cross-sectional average returns calculation:
int_select_top, flo_select_share = 3, 0.05
### Store of weighted returns:
str_path_ret_weighted, str_key_weighted = 'Data_Files/Source_Files/returns_weighted.h5', 'ret_weighted'
### Export store and the keys used inside it:
str_path_gravity_results = 'Data_Files/Source_Files/gravity_export.h5'
str_key_activity_sum, str_key_activity_share = 'activity_sum', 'activity_share'
str_key_gdp_total, str_key_herfindahl = 'gdp_total', 'herfindahl_index'
str_key_openess, str_key_ret_weighted = 'openess_measure', 'ret_weighted'
### CSV files for exported data:
str_activity_sum_csv_path = 'Data_Files/Test_Files/activity_sum.csv'
str_activity_share_csv_path = 'Data_Files/Test_Files/activity_share.csv'
str_gdp_total_csv_path = 'Data_Files/Test_Files/gdp_total.csv'
str_herfindahl_csv_path = 'Data_Files/Test_Files/herfindahl.csv'
str_openess_csv_path = 'Data_Files/Test_Files/openess.csv'
str_ret_weighted_csv_path = 'Data_Files/Test_Files/ret_weighted.csv'

In [5]:
### DEFINING EXPONENTIAL WEIGHT

def exp_weight_single(halflife_len = 3, num_element = 0):
    """
    Exponential decay weight of the element that is num_element steps away.

    The weight is 1 at step 0 and halves every round(halflife_len) steps.
    """
    ### Decay factor of a single step:
    flo_step_factor = math.exp(math.log(0.5) / round(halflife_len))
    ### Factor compounded over the requested number of steps:
    return np.exp(num_element * math.log(flo_step_factor))

In [6]:
### DEFINING WEIGHTED AVERAGE

def weighted_average(ser_data, ser_weight = False, int_min_count = 0):
    """
    Average of the non-missing values of ser_data, optionally weighted.

    Parameters:
        ser_data: pd.Series of observations; NaN values are ignored.
        ser_weight: pd.Series of weights, selected by the labels of the non-missing observations.
            False (default), any other bool, or None requests a simple mean.
        int_min_count: the average is calculated only when the number of non-missing
            observations is strictly greater than this value.
    Returns:
        The (weighted) mean, or NaN when there are too few observations or when
        the selected weights sum to zero (NaN weights are ignored in that sum).
    """
    ### Default output (np.nan: the np.NaN alias does not exist in NumPy 2.0+):
    num_result = np.nan
    ### Checking for data presence:
    if (ser_data.count() > int_min_count):
        ### Checking for weights dataset:
        if (ser_weight is None) or isinstance(ser_weight, bool):
            ### Calculating of simple average:
            num_result = np.nanmean(ser_data.values)
        else:
            ### Data filtering (done once, reused for both values and weights):
            ser_valid = ser_data.dropna()
            ### Weights filtering:
            list_weight = ser_weight[ser_valid.index].values
            ### Checking for weights presence:
            if np.nansum(list_weight):
                ### Weighted average calculating (pairs with a NaN weight drop out of both sums):
                num_result = np.nansum(ser_valid.values * list_weight) / np.nansum(list_weight)
    ### Results output:
    return num_result

In [7]:
### DEFINING COUNTRY CODES EXTRACTOR

def get_country_codes(use_local_copy = False):
    """
    Load the country codes table and return the short and long ISO codes by country.

    Parameters:
        use_local_copy: read the saved HTML page instead of https://countrycode.org/.
    Returns:
        pd.DataFrame with columns 'ISO SHORT' and 'ISO LONG', sorted by country name
        and indexed by the upper-cased country name.
    """
    ### Source selection (saved page for the case when the URL is unavailable):
    url_country_code = 'Data_Files/Source_Files/countrycode.html' if use_local_copy else 'https://countrycode.org/'
    ### First table of the page, indexed by country name:
    df_full_codes = pd.read_html(url_country_code, index_col = 'COUNTRY')[0]
    ### Splitting the combined codes column into its two parts:
    list_iso_columns = ['ISO SHORT', 'ISO LONG']
    df_full_codes[list_iso_columns] = df_full_codes['ISO CODES'].str.split(' / ', expand = True)
    ### Keeping ISO columns only, sorted by the original country names:
    df_result = df_full_codes[list_iso_columns].sort_index()
    df_result.index = df_result.index.str.upper()
    return df_result

In [8]:
### DEFINING EXTRACTION UNIVERSE DATA FROM MS EXCEL SOURCE (TO BE IGNORED IN PRODUCT CODE)

def ison_membership_converting(str_path_universe, date_end, bool_daily = False, int_backfill_months = 0):
    """
    Build a market membership series from the 'Switchers' sheet of the universe workbook.

    Parameters:
        str_path_universe: path to the MS Excel workbook (read with the openpyxl engine).
        date_end: last date of the business-month-end grid every country is extended to.
        bool_daily: when True, the monthly result is forward filled to business-day frequency.
        int_backfill_months: when non-zero, the number of business month ends by which the
            first members of each region are extended backwards in time.
    Returns:
        pd.Series named 'Market' with values 'DM' / 'EM' / 'FM' (or NaN), indexed by Date and Country.
    """
    ### Defining business-month-end reindexation on country level:
    def country_modify(ser_raw_country, date_end):
        """Put one country's records on a business-month-end grid up to date_end, forward filling the gaps."""
        ### Two-step resampling: last value within a calendar month, then labelled by business month end:
        ser_res_country = ser_raw_country.droplevel(0).resample('MS').last().resample('BM').last()
        range_country = pd.date_range(ser_res_country.index[0], date_end, freq = 'BM')
        return ser_res_country.reindex(range_country).ffill()
    ### Markets encoding table:
    dict_markets = {50 : 'DM', 57 : 'EM', 504 : 'FM', 0: np.NaN}     
    ### Loading source file:
    df_raw_universe = pd.read_excel(engine = 'openpyxl', io = str_path_universe, sheet_name = 'Switchers', header = 0, parse_dates = True, index_col = [0, 1],
                                 na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', 
                                             '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null'], keep_default_na = False)
    ### Converting source file:
    df_raw_universe.index.names = ['Country', 'Date']
    ser_raw_universe = df_raw_universe['Region']
    ### Missing region codes become 0, which dict_markets translates back to NaN after the forward fill:
    ser_raw_universe.fillna(0, inplace = True)
    ser_raw_universe.name = 'Market'
    ### By country reindexation and translation:
    ser_res_universe = ser_raw_universe.groupby('Country').apply(country_modify, date_end)
    ser_res_universe.index.names = ['Country', 'Date']
    ser_res_universe = ser_res_universe.replace(dict_markets).reorder_levels([1, 0]).sort_index() 
    ### Expanding membership for primary regions members by backfilling:
    if int_backfill_months:
        ### List of regions:
        list_region = list(ser_res_universe.dropna().unique())
        ### Initialising of collection of series with backfilled data for each region:
        list_ison_backfill = []
        ### Regions looping:
        for iter_region in list_region:
            ### Defining start of region date:
            date_first_valid = ser_res_universe.loc[ser_res_universe == iter_region].first_valid_index()[0]
            ### Creating dates index to backfilling (the first valid date itself is excluded by [: -1]):
            idx_date_backfill = pd.date_range(end = date_first_valid, periods = int_backfill_months + 1, freq = 'BM')[: -1]
            ### Creating primary countries index to backfilling:            
            idx_region_backfill = ser_res_universe.loc[ser_res_universe == iter_region].loc[date_first_valid, All].index.get_level_values('Country')
            ### Creating full index:
            idx_ison_backfill = pd.MultiIndex.from_product([idx_date_backfill, idx_region_backfill])
            ### Series with backfilled data:
            list_ison_backfill.append(pd.Series(iter_region, index = idx_ison_backfill))
        ### Combination of backfilled series and original ISON data (original values take precedence):    
        ser_res_universe = ser_res_universe.combine_first(pd.concat(list_ison_backfill, axis = 0)).sort_index()  
        ser_res_universe.index.names = ['Date', 'Country']
    ### Converting to daily frequency:
    if bool_daily:
        ser_res_universe = ser_res_universe.reset_index('Country').groupby('Country').resample('B').ffill()['Market'].swaplevel().sort_index()    
    ### Results output:
    ser_res_universe.name = 'Market'
    return ser_res_universe

In [12]:
### RUN EVERY TIME: DATASETS LOADING

### Container of raw bilateral activity series keyed by dataset code:
dict_dataset = {}
### Each stored frame has its 'Market' level dropped and is reduced to a single column:
dict_dataset['imf_dots'] = pd.read_hdf(path_or_buf = str_path_imf_dots_combined, key = str_full_imf_dots_combined).droplevel('Market').sort_index()['Export_Augmented']
dict_dataset['imf_cpis'] = pd.read_hdf(path_or_buf = str_path_imf_cpis_combined, key = str_full_imf_cpis_combined).droplevel('Market').sort_index()['Asset_Augmented']
### NOTE(review): the loaders below are disabled, so 'oecd_fdi' and 'bis_lbs' are absent from dict_dataset after this cell,
### while later cells read dict_dataset['bis_lbs'] and the 'oecd_fdi' key of the annualized store - confirm which variant is intended.
### Alternative source for 'imf_cpis' (filtered dataset, negative values removed):
#dict_dataset['imf_cpis'] = pd.read_hdf(path_or_buf = str_path_imf_cpis_filtered, key = str_key_imf_cpis_filtered)
#dict_dataset['imf_cpis'].loc[dict_dataset['imf_cpis'] < 0.0] = np.NaN
### Two alternative sources for 'oecd_fdi' (combined assets or net outward investment):
#dict_dataset['oecd_fdi'] = pd.read_hdf(path_or_buf = str_path_oecd_fdi_combined, key = str_full_oecd_fdi_combined).droplevel('Market').sort_index()['Asset']
#dict_dataset['oecd_fdi'].loc[dict_dataset['oecd_fdi'] < 0.0] = np.NaN
#dict_dataset['oecd_fdi'] = pd.read_hdf(path_or_buf = str_path_direct_out_net, key = str_key_direct_out_net)
#dict_dataset['oecd_fdi'].loc[dict_dataset['oecd_fdi'] < 0.0] = np.NaN
### Source for 'bis_lbs':
#dict_dataset['bis_lbs'] = pd.read_hdf(path_or_buf = str_path_bis_lbs_combined, key = str_full_bis_lbs_combined)\
#    .set_index(['Date', 'Reporter', 'Partner']).sort_index()['Claim_Augmented']

gc.collect()

133

In [14]:
### RUN EVERY TIME: GRAVITY DATASET CONSTRUCTION

### Builds a synthetic 'gravity' activity for every Reporter / Partner pair and date:
### (GDP_Reporter / 1e9) * (GDP_Partner / 1e9) / Distance ** flo_dist_power
### GDP loading:
ser_gdp = pd.read_hdf(path_or_buf = str_path_wb_gdp_dataset, key = str_wb_gdp_dataset)
### Distances loading:
ser_dist = pd.read_hdf(path_or_buf = str_path_cepii_dataset, key = str_distance_dataset)['distw']
### Distances naming:
ser_dist.index.names = ['Reporter', 'Partner']
ser_dist.name = 'Distance'
### Dropping internal distances:
df_dist = ser_dist.reset_index()
df_dist.drop(df_dist[df_dist['Reporter'] == df_dist['Partner']].index, inplace = True)
ser_dist = df_dist.set_index(['Reporter', 'Partner']).squeeze().sort_index()
### GDP duplicating:
### NOTE(review): the two slices are given different index names, so this relies on ser_gdp[:] returning
### an object with its own index; an explicit copy would make that independent of the pandas version - confirm.
ser_gdp_reporter = ser_gdp[:]
ser_gdp_reporter.index.names = ['Date', 'Reporter']
ser_gdp_reporter.name = 'GDP_Reporter'
ser_gdp_partner = ser_gdp[:]
ser_gdp_partner.index.names = ['Date', 'Partner']
ser_gdp_partner.name = 'GDP_Partner'
### Reporters data connecting:
df_reporter = ser_dist.to_frame().join(ser_gdp_reporter).sort_index()
### Partners data connecting:
df_partner = ser_dist.to_frame().join(ser_gdp_partner).drop('Distance', axis = 1).sort_index()
df_partner = df_partner.reorder_levels([1, 0, 2])
### Joining data and Gravity calculation:
df_gravity = pd.concat([df_reporter, df_partner], axis = 1)
### Rows without a date are dropped and the index is put in Date / Reporter / Partner order:
df_gravity = df_gravity.reset_index('Date').dropna(subset = ['Date']).set_index('Date', append = True).reorder_levels([2, 0, 1]).sort_index()
### NOTE(review): leftover visual check of a single pair (US / IL); it has no effect on the result.
display(df_gravity.loc[('2020-12-31', ['US', 'IL'], ['US', 'IL']), :])
ser_gravity = (df_gravity['GDP_Reporter'] / 10 ** 9) * (df_gravity['GDP_Partner'] / 10 ** 9) / (df_gravity['Distance'] ** flo_dist_power)
### Adding gravity to activities:
dict_dataset['gravity'] = ser_gravity.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Distance,GDP_Reporter,GDP_Partner
Date,Reporter,Partner,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-12-31,IL,US,10516.0,413267700000.0,21060470000000.0
2020-12-31,US,IL,10516.0,21060470000000.0,413267700000.0


In [11]:
### TEMP

gc.collect()
dict_annualized = {}
def annualize(ser_group):
    """
    Convert the series of one Reporter / Partner pair to trailing 12-month sums.

    Parameters:
        ser_group: pd.Series indexed by Date, Reporter and Partner (a single pair).
    Returns:
        pd.Series indexed by date. For data observed less often than monthly, each observation
        is spread evenly over the business month ends of its period before the 12-month rolling
        sum is taken. A group without observations is returned with the pair levels dropped.
    """
    ser_ann = ser_group.droplevel(['Reporter', 'Partner'])
    ### Filter not empty pairs:
    if (ser_ann.count() > 0):
        ### Data frequency calculation (median number of observations per year that has any):
        ### (offset objects are used instead of the 'BY' / 'BM' aliases deprecated in recent pandas)
        ser_freq = ser_ann.dropna().resample(pd.offsets.BYearEnd()).count()
        int_freq = int(ser_freq[ser_freq > 0].median())
        ### Data periodicity definition (months number covered per one observation):
        int_period = 12 // int_freq
        ### Convert data to monthly frequency:
        if (int_period > 1):
            ### Prepending one more period to backfill first valid observation
            ### (pd.concat replaces Series.append, which was removed in pandas 2.0):
            ser_dummy = pd.Series(np.nan, index = [ser_ann.index[0] - pd.offsets.BMonthEnd(int_period)])
            ser_ann = pd.concat([ser_ann, ser_dummy]).sort_index()
            ### Replace period value to monthly value:
            ser_ann = (ser_ann / int_period)
            ### Backfill monthly value for a whole period: the observation month itself plus
            ### (int_period - 1) preceding months; a larger limit would spill into a gap in the data:
            ser_ann = ser_ann.resample(pd.offsets.BMonthEnd()).bfill(limit = int_period - 1)
            ### Drop dummy observation:
            ser_ann = ser_ann.drop(ser_ann.index[0])
        ### Annualize data:
        ser_ann = ser_ann.rolling(window = 12, min_periods = 12).sum()
    ### Results output:
    return ser_ann

### Scratch check: the annualized 'bis_lbs' series is only displayed, not stored.
### NOTE(review): the 'bis_lbs' loader in the datasets loading cell is commented out, so on a fresh kernel this key is not in dict_dataset.
dict_dataset['bis_lbs'].groupby(['Reporter', 'Partner']).apply(annualize).astype('float32').reorder_levels([2, 0, 1]).sort_index()

            Reporter  Partner
1989-04-28  CI        FI              NaN
1989-05-31  CI        FI              NaN
1989-06-30  CI        FI              NaN
1989-07-31  CI        FI              NaN
1989-08-31  CI        FI              NaN
                                   ...   
2022-03-31  ZM        PH            0.307
                      SE            4.323
                      TW            0.559
                      US         3721.000
                      ZA          960.000
Name: Claim_Augmented, Length: 886292, dtype: float32

In [11]:
### RUN EVERY TIME: ACTIVITIES ANNUALIZATION

gc.collect()
dict_annualized = {}
def annualize(ser_group):
    """
    Convert the series of one Reporter / Partner pair to trailing 12-month sums.

    Parameters:
        ser_group: pd.Series indexed by Date, Reporter and Partner (a single pair).
    Returns:
        pd.Series indexed by date. For data observed less often than monthly, each observation
        is spread evenly over the business month ends of its period before the 12-month rolling
        sum is taken. A group without observations is returned with the pair levels dropped.
    """
    ser_ann = ser_group.droplevel(['Reporter', 'Partner'])
    ### Filter not empty pairs:
    if (ser_ann.count() > 0):
        ### Data frequency calculation (median number of observations per year that has any):
        ### (offset objects are used instead of the 'BY' / 'BM' aliases deprecated in recent pandas)
        ser_freq = ser_ann.dropna().resample(pd.offsets.BYearEnd()).count()
        int_freq = int(ser_freq[ser_freq > 0].median())
        ### Data periodicity definition (months number covered per one observation):
        int_period = 12 // int_freq
        ### Convert data to monthly frequency:
        if (int_period > 1):
            ### Prepending one more period to backfill first valid observation
            ### (pd.concat replaces Series.append, which was removed in pandas 2.0):
            ser_dummy = pd.Series(np.nan, index = [ser_ann.index[0] - pd.offsets.BMonthEnd(int_period)])
            ser_ann = pd.concat([ser_ann, ser_dummy]).sort_index()
            ### Replace period value to monthly value:
            ser_ann = (ser_ann / int_period)
            ### Backfill monthly value for a whole period (observation month plus int_period - 1 preceding months):
            ser_ann = ser_ann.resample(pd.offsets.BMonthEnd()).bfill(limit = int_period - 1)
            ### Drop dummy observation:
            ser_ann = ser_ann.drop(ser_ann.index[0])
        ### Annualize data:
        ser_ann = ser_ann.rolling(window = 12, min_periods = 12).sum()
    ### Results output:
    return ser_ann
### Deleting existing file with annualized data (the loop below appends one key per dataset to a fresh file):
if os.path.exists(str_path_act_annualized):
    os.remove(str_path_act_annualized)
    print(str_path_act_annualized, 'File removed')
### Looping over activities:
### NOTE(review): only the keys currently present in dict_dataset are written; later cells read fixed keys from this store.
for iter_dataset in dict_dataset:
#for iter_dataset in ['imf_cpis', 'oecd_fdi']:
    gc.collect()
    print(iter_dataset, ': annualizing started')    
    ser_iter = dict_dataset[iter_dataset]
    ### Pairwise annualization; the result is stored as float32 and indexed by Date / Reporter / Partner:
    ser_iter_ann = ser_iter.groupby(['Reporter', 'Partner']).apply(annualize).astype('float32').reorder_levels([2, 0, 1]).sort_index()
    ser_iter_ann.index.names = ['Date', 'Reporter', 'Partner']
    ser_iter_ann.name = iter_dataset + '_ann'
    ser_iter_ann.to_hdf(str_path_act_annualized, iter_dataset, mode = 'a', format = 'table')
    print(iter_dataset, ': annualizing done')    
#    break

Data_Files/Source_Files/datasets_annualized.h5 File removed
imf_dots : annualizing started
imf_dots : annualizing done
imf_cpis : annualizing started
imf_cpis : annualizing done
oecd_fdi : annualizing started
oecd_fdi : annualizing done
gravity : annualizing started
gravity : annualizing done


In [9]:
### RUN EVERY TIME: WEIGHTED AVERAGE OF RETURNS CALCULATION: DATA PREPARATION

gc.collect()
### Lags initialization (number of business month ends by which each dataset is cut off before the return date):
dict_lag = {}
dict_lag['imf_dots'] = 3
dict_lag['imf_cpis'] = 6
dict_lag['oecd_fdi'] = 24
#dict_lag['bis_lbs'] = 6
dict_lag['gravity'] = 9
### Periods to fill initialization (forward fill limit applied to each annualized dataset):
dict_ffill = {}
dict_ffill['imf_dots'] = 1
dict_ffill['imf_cpis'] = 12
dict_ffill['oecd_fdi'] = 12
#dict_ffill['bis_lbs'] = 3
dict_ffill['gravity'] = 12
### Annualized data loading:
### NOTE(review): the keys are fixed here, so each of them must have been written by the annualization cell.
dict_annual = {}
dict_annual['imf_dots'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'imf_dots')
dict_annual['imf_cpis'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'imf_cpis')
dict_annual['oecd_fdi'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'oecd_fdi')
#dict_annual['bis_lbs'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'bis_lbs')
dict_annual['gravity'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'gravity')
### Returns loading & shifting:
### HDF version:
#ser_ret_usd = pd.read_hdf(str_path_bb_hdf, str_key_ret_daily).loc['USD']
### CSV version (semicolon separated, no header row, columns: Date, Country, Value):
df_ret_usd = pd.read_csv(str_ret_daily_csv_path, header = None, sep = ';', parse_dates = [0])
df_ret_usd.columns = ['Date', 'Country', 'Value']
ser_ret_usd = df_ret_usd.set_index(['Date', 'Country']).squeeze()
### Shifting each country's returns by one observation and renaming the country level to 'Partner':
ser_ret_shifted = ser_ret_usd.groupby('Country').shift()
ser_ret_shifted.index.names = ['Date', 'Partner']
ser_ret_shifted.name = 'ret_usd'

In [10]:
### RUN EVERY TIME: WEIGHTED AVERAGE OF RETURNS CALCULATION: AVERAGE RETURNS PREPARATION

gc.collect()
### Weighted mean for end-of-bmonth dates defining:
def get_weighted_mean(ser_group):
    """
    Weighted mean of one rolling window, calculated only for windows that end on a business month end.

    Parameters:
        ser_group: pd.Series of one window, indexed by Date and Partner.
    Returns:
        Result of weighted_average() with the last len(window) elements of the module-level
        list_weight as weights, or NaN when the last date of the window is not a business month end.
    """
    ser_values = ser_group.droplevel('Partner')
    ### A date is a business month end when rolling it forward to the nearest one leaves it unchanged:
    if (ser_values.index[-1] == ser_values.index[-1] + pd.offsets.BMonthEnd(0)):
        ### Shorter (incomplete) windows take the tail of the weights list:
        ser_weights = pd.Series(list_weight[-len(ser_values.index) : ], ser_values.index)
        flo_result = weighted_average(ser_values, ser_weights)
    else:
        ### np.nan is used because the np.NaN alias does not exist in NumPy 2.0+:
        flo_result = np.nan
    return flo_result
### Exponential weights list, built so that its last element belongs to step 0:
list_weight = [exp_weight_single(int_halflife_months * 22, iter_num) for iter_num in reversed(range(int_ave_months * 22))]
### Rolling weighted mean of shifted returns by partner (rows without a result are dropped):
#ser_test = ser_ret_shifted.loc[:, ['US', 'IL']]#[-132 : ]
ser_ret_ave = (
    ser_ret_shifted
    .groupby('Partner', group_keys = False)
    .rolling(window = int_ave_months * 22, min_periods = int_ave_months * 22 // 2)
    .apply(get_weighted_mean, raw = False)
    .dropna()
)

In [36]:
### RUN EVERY TIME: GDP & OPENESS MEASURE ANNUALIZATION

### Annualization of different frequency data definition:
def annualize(ser_group):
    """
    Convert the series of one Country to trailing 12-month sums.

    Parameters:
        ser_group: pd.Series indexed by Date and Country (a single country).
    Returns:
        pd.Series indexed by date. For data observed less often than monthly, each observation
        is spread evenly over the business month ends of its period before the 12-month rolling
        sum is taken. A group without observations is returned with the Country level dropped.
    """
    ser_ann = ser_group.droplevel(['Country'])
    ### Filter not empty pairs:
    if (ser_ann.count() > 0):
        ### Data frequency calculation (median number of observations per year that has any):
        ### (offset objects are used instead of the 'BY' / 'BM' aliases deprecated in recent pandas)
        ser_freq = ser_ann.dropna().resample(pd.offsets.BYearEnd()).count()
        int_freq = int(ser_freq[ser_freq > 0].median())
        ### Data periodicity definition (months number covered per one observation):
        int_period = 12 // int_freq
        ### Convert data to monthly frequency:
        if (int_period > 1):
            ### Prepending one more period to backfill first valid observation
            ### (pd.concat replaces Series.append, which was removed in pandas 2.0):
            ser_dummy = pd.Series(np.nan, index = [ser_ann.index[0] - pd.offsets.BMonthEnd(int_period)])
            ser_ann = pd.concat([ser_ann, ser_dummy]).sort_index()
            ### Replace period value to monthly value:
            ser_ann = (ser_ann / int_period)
            ### Backfill monthly value for a whole period (observation month plus int_period - 1 preceding months):
            ser_ann = ser_ann.resample(pd.offsets.BMonthEnd()).bfill(limit = int_period - 1)
            ### Drop dummy observation:
            ser_ann = ser_ann.drop(ser_ann.index[0])
        ### Annualize data:
        ser_ann = ser_ann.rolling(window = 12, min_periods = 12).sum()
    ### Results output:
    return ser_ann
### GDP Loading:
ser_gdp = pd.read_hdf(path_or_buf = str_path_wb_gdp_dataset, key = str_wb_gdp_dataset)
### GDP annualization (by country, stored as float32 and divided by 1000000):
ser_gdp_ann = (ser_gdp.groupby(['Country']).apply(annualize).astype('float32') / 1000000).reorder_levels([1, 0]).sort_index()
ser_gdp_ann.index.names = ['Date', 'Country']
ser_gdp_ann.to_hdf(str_path_gravity_results, str_key_gdp_total, mode = 'a')
### Export data loading & converting:
ser_export_ann = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'imf_dots')
### Added to exclude trade bilaterals for partners that don't have returns data:
### NOTE(review): ser_ret_ave is produced by the average returns preparation cell, which has to be run before this one.
idx_ret_exist = ser_ret_ave.dropna().index
### Each reporter's series is reindexed to the (Date, Partner) combinations that have an average return:
ser_export_ann = ser_export_ann.groupby('Reporter').apply(lambda ser_country: ser_country.droplevel('Reporter').reindex(idx_ret_exist))\
                               .reorder_levels(['Date', 'Reporter', 'Partner']).sort_index()
### Export sum calculation:
ser_export_sum = ser_export_ann.groupby(['Date', 'Reporter']).sum()
ser_export_sum.index.names = ['Date', 'Country']
### Openess measure saving (total export of a country divided by its annualized GDP):
(ser_export_sum / ser_gdp_ann).to_hdf(str_path_gravity_results, str_key_openess, mode = 'a')

In [39]:
### RUN EVERY TIME: WORLD EXPORT ANNUALIZATION

### Annualization of different frequency data definition:
def annualize(ser_group):
    """
    Convert the series of one Reporter to trailing 12-month sums.

    Parameters:
        ser_group: pd.Series indexed by Date and Reporter (a single reporter).
    Returns:
        pd.Series indexed by date. For data observed less often than monthly, each observation
        is spread evenly over the business month ends of its period before the 12-month rolling
        sum is taken. A group without observations is returned with the Reporter level dropped.
    """
    ser_ann = ser_group.droplevel(['Reporter'])
    ### Filter not empty pairs:
    if (ser_ann.count() > 0):
        ### Data frequency calculation (median number of observations per year that has any):
        ### (offset objects are used instead of the 'BY' / 'BM' aliases deprecated in recent pandas)
        ser_freq = ser_ann.dropna().resample(pd.offsets.BYearEnd()).count()
        int_freq = int(ser_freq[ser_freq > 0].median())
        ### Data periodicity definition (months number covered per one observation):
        int_period = 12 // int_freq
        ### Convert data to monthly frequency:
        if (int_period > 1):
            ### Prepending one more period to backfill first valid observation
            ### (pd.concat replaces Series.append, which was removed in pandas 2.0):
            ser_dummy = pd.Series(np.nan, index = [ser_ann.index[0] - pd.offsets.BMonthEnd(int_period)])
            ser_ann = pd.concat([ser_ann, ser_dummy]).sort_index()
            ### Replace period value to monthly value:
            ser_ann = (ser_ann / int_period)
            ### Backfill monthly value for a whole period (observation month plus int_period - 1 preceding months):
            ser_ann = ser_ann.resample(pd.offsets.BMonthEnd()).bfill(limit = int_period - 1)
            ### Drop dummy observation:
            ser_ann = ser_ann.drop(ser_ann.index[0])
        ### Annualize data:
        ser_ann = ser_ann.rolling(window = 12, min_periods = 12).sum()
    ### Results output:
    return ser_ann
### World Export Loading:
ser_world_export = pd.read_hdf(path_or_buf = str_path_imf_dots_world, key = str_full_imf_dots_world)
### World Export annualization (by reporter, stored as float32 and indexed by Date / Reporter):
ser_world_export_ann = (ser_world_export.groupby(['Reporter']).apply(annualize).astype('float32')).reorder_levels([1, 0]).sort_index()
ser_world_export_ann.index.names = ['Date', 'Reporter']
### mode = 'w' rewrites the whole file on every run:
ser_world_export_ann.to_hdf(str_path_world_export_annualized, str_key_world_export_annualized, mode = 'w')

In [64]:
### RUN EVERY TIME: CROSS SECTIONAL ACTIVITY TOTALS & WEIGHTS BY COUNTRY SAVING

gc.collect()
### Annualized data loading:
dict_annual = {}
dict_annual['imf_dots'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'imf_dots')
dict_annual['imf_cpis'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'imf_cpis')
dict_annual['oecd_fdi'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'oecd_fdi')
#dict_annual['bis_lbs'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'bis_lbs')
#dict_annual['gravity'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'gravity')
### X-sectional sum by Country:
dict_country_sum = {}
for iter_activity in dict_annual:
    ### Modified to exclude partners that don't have returns:
    ### NOTE(review): this index does not depend on the loop variable, so it is recalculated identically on every pass.
    idx_ret_exist = ser_ret_ave.dropna().index
    ser_iter_activity = dict_annual[iter_activity]
    ser_iter_activity = ser_iter_activity.groupby('Reporter').apply(lambda ser_country: ser_country.droplevel('Reporter').reindex(idx_ret_exist))\
                                         .reorder_levels(['Date', 'Reporter', 'Partner']).sort_index()    
    df_iter_sum = ser_iter_activity.groupby(['Date', 'Reporter']).sum().reset_index('Reporter')
#    df_iter_sum = dict_annual[iter_activity].groupby(['Date', 'Reporter']).sum().reset_index('Reporter')
    df_iter_sum['Reporter'] = df_iter_sum['Reporter'].astype(str)
    ser_iter_sum = df_iter_sum.set_index('Reporter', append = True).squeeze().sort_index()
    ser_iter_sum.name = 'Volume'
    dict_country_sum[iter_activity] = ser_iter_sum
#    break
### Reporter data aggregation:
ser_country_sum = pd.concat(dict_country_sum, axis = 0, sort = False, names = ['Activity'])
ser_country_sum.reorder_levels([1, 2, 0]).to_hdf(str_path_gravity_results, key = str_key_activity_sum, mode = 'a')
### Shares of each activity within a (Date, Reporter) total; the last loaded activity is left out by position.
### NOTE(review): [:-1] presumably was meant to leave out 'gravity'; with the 'gravity' load commented out above
### it leaves out 'oecd_fdi' instead - confirm the intended set of activities.
ser_country_share = ser_country_sum.loc[list(dict_country_sum.keys())[:-1]].groupby(['Date', 'Reporter']).apply(lambda ser_group: ser_group / ser_group.sum())
ser_country_share.reorder_levels([1, 2, 0]).to_hdf(str_path_gravity_results, key = str_key_activity_share, mode = 'a')

Closing remaining open files:Data_Files/Source_Files/datasets_annualized.h5...done


In [12]:
### RUN EVERY TIME: HERFINDAHL INDEX CALCULATION

gc.collect()
### Annualized data loading (the 'bis_lbs' key is not loaded here, so no index is calculated for it below):
dict_annual = {}
dict_annual['imf_dots'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'imf_dots')
dict_annual['imf_cpis'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'imf_cpis')
dict_annual['oecd_fdi'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'oecd_fdi')
#dict_annual['bis_lbs'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'bis_lbs')
dict_annual['gravity'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'gravity')

### Defining Herfindahl index calculation:
def get_herfindahl(ser_group):
    """
    Concentration measure of one group of volumes: 1 / sqrt(sum of squared shares).

    The value is 1.0 when a single element holds the whole total and sqrt(N) for N equal shares.

    Parameters:
        ser_group: pd.Series of volumes; NaN values are ignored.
    Returns:
        The measure, or NaN when the group has no observations or its total is zero
        (shares are undefined then; without this check the 0 / 0 shares produced an infinite value).
    """
    ### Default output (np.nan: the np.NaN alias does not exist in NumPy 2.0+):
    flo_herfindahl = np.nan
    flo_total = ser_group.sum()
    if (ser_group.count() > 0) and (flo_total != 0):
        ### Shares of the group total:
        ser_norm = ser_group / flo_total
        flo_herfindahl = 1 / ((ser_norm ** 2).sum() ** (1 / 2))
    return flo_herfindahl
### Deleting existing file with Herfindahl indices (the loop below appends one key per dataset to a fresh file):
if os.path.exists(str_path_herfindahl):
    os.remove(str_path_herfindahl)
    print(str_path_herfindahl, 'File removed')
### Looping over activities datasets:
for iter_dataset in dict_annual:
#for iter_dataset in ['imf_cpis']:    
    ### Herfindahl index calculation (one value per Date and Reporter, across partners):
    ser_herfindahl_full = dict_annual[iter_dataset].groupby(['Date', 'Reporter']).apply(get_herfindahl)
    ser_herfindahl_full.to_hdf(str_path_herfindahl, iter_dataset, mode = 'a', format = 'table')

Data_Files/Source_Files/herfindahl_indices.h5 File removed


In [16]:
### RUN EVERY TIME: ADDING HERFINDAHL INDICES TO DATA COLLECTION

### Collecting saved Herfindahl indices of all activities into one series indexed by Date / Country / Activity:
dict_herfindahl = {}
### The store holds only the activities that were loaded when the indices were calculated, while dict_activity
### lists every known activity; missing ones are reported and skipped instead of failing with a KeyError:
with pd.HDFStore(str_path_herfindahl, mode = 'r') as hdf_herfindahl:
    for iter_activity in dict_activity:
        if iter_activity not in hdf_herfindahl:
            print(iter_activity, ': Herfindahl index not found in', str_path_herfindahl, '- skipped')
            continue
        ser_iter_herfindahl = hdf_herfindahl.get(iter_activity)
        ### Renaming the 'Reporter' level to 'Country' with plain string labels:
        df_iter_herfindahl = ser_iter_herfindahl.reset_index('Reporter')
        df_iter_herfindahl['Country'] = df_iter_herfindahl['Reporter'].astype(str)
        dict_herfindahl[iter_activity] = df_iter_herfindahl.drop('Reporter', axis = 1).set_index('Country', append = True).squeeze()
### Activity becomes the last index level:
ser_full_herfindahl = pd.concat(dict_herfindahl, axis = 0).reorder_levels([1, 2, 0])
ser_full_herfindahl.index.names = ['Date', 'Country', 'Activity']
ser_full_herfindahl.to_hdf(str_path_gravity_results, str_key_herfindahl, mode = 'a')

In [17]:
### RUN EVERY TIME: REPLACING VALUES WITH WEIGHTS WHILE CONTROLLING HERFINDAHL INDEX VALUE

gc.collect()
### Annualized data loading:
### NOTE(review): 'bis_lbs' is read here, while the datasets loading cell has its loader commented out,
### so this key exists in the annualized store only if it was written in an earlier configuration - confirm.
dict_annual = {}
dict_annual['imf_dots'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'imf_dots')
dict_annual['imf_cpis'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'imf_cpis')
dict_annual['oecd_fdi'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'oecd_fdi')
dict_annual['bis_lbs'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'bis_lbs')
dict_annual['gravity'] = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'gravity')
### Weights container:
dict_weights = {}
### Defining conversion of volumes to weights with Herfindahl index control:
def set_weights(ser_group, flo_limit = 1.0):
    """
    Replace the volumes of one group with their shares of the group total.

    Parameters:
        ser_group: pd.Series of volumes; NaN values are ignored in the total.
        flo_limit: minimal value of 1 / sqrt(sum of squared shares) required to keep the weights.
    Returns:
        pd.Series on the index of ser_group: the shares, or all NaN when the concentration
        measure is below flo_limit. A group without observations is returned unchanged.
    """
    if (ser_group.count() > 0):
        ser_norm = ser_group / ser_group.sum()
        flo_herfindahl = 1 / ((ser_norm ** 2).sum() ** (1 / 2))
        if (flo_herfindahl < flo_limit):
            ### np.nan is used because the np.NaN alias does not exist in NumPy 2.0+:
            ser_weights = pd.Series(np.nan, index = ser_group.index)
        else:
            ser_weights = ser_norm
    else:
        ser_weights = ser_group
    return ser_weights
### Deleting existing file with weights (the loop below appends one key per dataset to a fresh file):
if os.path.exists(str_path_act_weights):
    os.remove(str_path_act_weights)
    print(str_path_act_weights, 'File removed')
### Looping over activities datasets:
for iter_dataset in dict_annual:
#for iter_dataset in ['imf_cpis']:    
    ### Weights calculation (shares within each Date / Reporter group, filtered by flo_hi_threshold):
    gc.collect()
    print(iter_dataset, ': weights calculation started')    
    ser_iter_weights = dict_annual[iter_dataset].groupby(['Date', 'Reporter']).apply(set_weights, flo_hi_threshold)
    ser_iter_weights.to_hdf(str_path_act_weights, iter_dataset, mode = 'a', format = 'table')
    print(iter_dataset, ': weights calculation done')
#    break

Data_Files/Source_Files/datasets_weights.h5 File removed
imf_dots : weights calculation started
imf_dots : weights calculation done
imf_cpis : weights calculation started
imf_cpis : weights calculation done
oecd_fdi : weights calculation started
oecd_fdi : weights calculation done
bis_lbs : weights calculation started
bis_lbs : weights calculation done
gravity : weights calculation started
gravity : weights calculation done


In [13]:
### RUN EVERY TIME: WEIGHTED AVERAGE OF RETURNS CALCULATION

gc.collect()
### Weighted returns calculator:
def get_weighted_ret(df_group, int_select_top = None, flo_select_share = None):
    """
    Weighted average of returns over the rows of one group.

    Parameters:
        df_group: pd.DataFrame with columns 'Weight' and 'Ret_USD'; rows with any NaN are dropped.
        int_select_top: when given, only this number of rows with the largest weights is kept.
        flo_select_share: when given, only rows with a weight strictly above this value are kept
            (applied after int_select_top).
    Returns:
        sum(Weight * Ret_USD) / sum(Weight) over the kept rows, or NaN when no rows are left.
    """
    df_group = df_group.dropna()
    if (int_select_top is not None):
        df_group = df_group.sort_values('Weight', ascending = False)[ : int_select_top]
    if (flo_select_share is not None):
        df_group = df_group[df_group['Weight'] > flo_select_share]
    ### Default output (np.nan: the np.NaN alias does not exist in NumPy 2.0+):
    flo_weighted = np.nan
    if (len(df_group.index) > 0):
        ### Dividing by the sum of the kept weights renormalizes them after the selection:
        flo_weighted = (df_group['Weight'] * df_group['Ret_USD']).sum() / df_group['Weight'].sum()
    return flo_weighted
### Global container (returns date -> weighted returns of the processed datasets):
dict_all_weighted = {}
### Number of datasets active (kept only to report when it changes between dates):
int_len = -1
### Looping over returns dates (limited to two test dates; the commented line iterates over all dates with returns):
#for iter_date in sorted(ser_ret_ave.dropna().index.get_level_values('Date').unique()):
for iter_date in [pd.to_datetime('2020-03-31'), pd.to_datetime('2020-04-30')]:
    ### Dates defining (the latest business month end that is not later than the returns date):
    date_bm_end = iter_date + pd.offsets.BMonthEnd(0)
    if (date_bm_end > iter_date):
        date_bm_end = date_bm_end - pd.offsets.BMonthEnd(1)
    ### Daily returns extraction:
    ser_iter_ret = ser_ret_ave.loc[iter_date]
    ### Daily container:
    dict_iter_weighted = {}
    ### Looping over datasets (limited to IMF DOTS; the commented line iterates over all datasets):
#    for iter_dataset in dict_dataset:
    for iter_dataset in ['imf_dots']:
        ### Loading raw dataset up to the business month end lagged by the dataset specific number of months:
        ser_iter_raw = dict_dataset[iter_dataset].loc[: (date_bm_end - pd.offsets.BMonthEnd(dict_lag[iter_dataset]))]
        if (ser_iter_raw.count() > 0):
            ### Defining last date by lagging original dataset (takes the last index entry, so assumes an ascending 'Date' level - TODO confirm):
            date_last = ser_iter_raw.index.get_level_values('Date')[-1]
            date_prev = date_last - pd.offsets.BMonthEnd(dict_ffill[iter_dataset])
            ### Perform lagging & forward filling on annualized dataset:
            ser_iter_ann = dict_annual[iter_dataset].loc[date_prev : date_last].groupby(['Reporter', 'Partner']).ffill(limit = dict_ffill[iter_dataset])
            ### Taking last date value:
            ser_iter_last = ser_iter_ann[date_last]
            ### Calculating of weighted average of returns:
            df_to_weight = ser_iter_last.to_frame().join(ser_iter_ret)
            df_to_weight.columns = ['Weight', 'Ret_USD']
            ### Rescaling weights to shares of the reporter total. A like-indexed transform is used instead of groupby().apply(),
            ### because apply() prepends the group key to the result index in pandas 2.x and the aligned assignment fails there:
            df_to_weight['Weight'] = df_to_weight['Weight'] / df_to_weight['Weight'].groupby('Reporter').transform('sum')
            ### Simple weighted average:
#            dict_iter_weighted[iter_dataset] = df_to_weight.groupby('Reporter').apply(get_weighted_ret)
            ### Weighted average with threshold by partners number:
#            dict_iter_weighted[iter_dataset] = df_to_weight.groupby('Reporter').apply(get_weighted_ret, int_select_top)
            ### Weighted average with threshold by share:
            dict_iter_weighted[iter_dataset] = df_to_weight.groupby('Reporter').apply(get_weighted_ret, None, flo_select_share)
            ### Weighted average with both thresholds:
#            dict_iter_weighted[iter_dataset] = df_to_weight.groupby('Reporter').apply(get_weighted_ret, int_select_top, flo_select_share)
    if (len(dict_iter_weighted) > 0):
        ### Daily results aggregation (one column per dataset):
        dict_all_weighted[iter_date] = pd.concat(dict_iter_weighted, axis = 1, sort = False)
    ### Reporting only when the number of contributing datasets changes:
    if (int_len != len(dict_iter_weighted)):
        int_len = len(dict_iter_weighted)
        print(iter_date, ':', str(int_len))
### Global results aggregation:
df_all_weighted = pd.concat(dict_all_weighted, axis = 0, sort = False)
df_all_weighted.index.names = ['Date', 'Country']
### Global results saving:
#df_all_weighted.to_hdf(str_path_ret_weighted, str_key_weighted, mode = 'w', format = 'table')

2020-03-31 00:00:00 : 1


In [107]:
### RUN EVERY TIME: ALTERNATIVE WEIGHTED AVERAGE OF RETURNS CALCULATION FOR IMF DOTS

gc.collect()
### Weighted returns calculator:
def get_weighted_ret(df_group, int_select_top = None, flo_select_share = None):
    """
    Weighted average of the 'Ret_USD' column of a group, using the 'Weight' column as weights.

    Parameters:
        df_group : DataFrame containing at least the 'Weight' and 'Ret_USD' columns;
        int_select_top : optional number of rows with the largest weights to keep;
        flo_select_share : optional level, only rows with a weight strictly above it are kept.
    Returns:
        sum(Weight * Ret_USD) / sum(Weight) over the retained rows, or NaN when no rows are left.
    """
    ### Rows with a missing value in any column can not take part in the average:
    df_group = df_group.dropna()
    ### Keeping only the required number of the heaviest rows:
    if (int_select_top is not None):
        df_group = df_group.sort_values('Weight', ascending = False).iloc[ : int_select_top]
    ### Keeping only the rows with a sufficient weight:
    if (flo_select_share is not None):
        df_group = df_group[df_group['Weight'] > flo_select_share]
    ### np.nan is used, since the np.NaN alias was removed in NumPy 2.0:
    flo_weighted = np.nan
    if (len(df_group.index) > 0):
        ### Dividing by the retained weights sum, so the retained weights do not have to add up to one:
        flo_weighted = (df_group['Weight'] * df_group['Ret_USD']).sum() / df_group['Weight'].sum()
    return flo_weighted
### Global container (returns date -> weighted returns of the processed datasets):
dict_all_weighted = {}
### Loading World Export Annualized:
ser_world_export_ann = pd.read_hdf(str_path_world_export_annualized, str_key_world_export_annualized)
### Number of datasets active (kept only to report when it changes between dates):
int_len = -1
### Looping over returns dates (the commented line limits the loop to two test dates):
for iter_date in sorted(ser_ret_ave.dropna().index.get_level_values('Date').unique()):
#for iter_date in [pd.to_datetime('2020-03-31'), pd.to_datetime('2020-04-30')]:
    ### Dates defining (the latest business month end that is not later than the returns date):
    date_bm_end = iter_date + pd.offsets.BMonthEnd(0)
    if (date_bm_end > iter_date):
        date_bm_end = date_bm_end - pd.offsets.BMonthEnd(1)
    ### Daily returns extraction:
    ser_iter_ret = ser_ret_ave.loc[iter_date]
    ### Daily container:
    dict_iter_weighted = {}
    ### Looping over datasets (this alternative calculation is performed for IMF DOTS only):
#    for iter_dataset in dict_dataset:
    for iter_dataset in ['imf_dots']:
        ### Loading raw dataset up to the business month end lagged by the dataset specific number of months:
        ser_iter_raw = dict_dataset[iter_dataset].loc[: (date_bm_end - pd.offsets.BMonthEnd(dict_lag[iter_dataset]))]
        if (ser_iter_raw.count() > 0):
            ### Defining last date by lagging original dataset (takes the last index entry, so assumes an ascending 'Date' level - TODO confirm):
            date_last = ser_iter_raw.index.get_level_values('Date')[-1]
            date_prev = date_last - pd.offsets.BMonthEnd(dict_ffill[iter_dataset])
            ### Perform lagging & forward filling on annualized datasets:
            ser_iter_ann = dict_annual[iter_dataset].loc[date_prev : date_last].groupby(['Reporter', 'Partner']).ffill(limit = dict_ffill[iter_dataset])
            ser_iter_world_ann = ser_world_export_ann.loc[date_prev : date_last].groupby(['Reporter']).ffill(limit = dict_ffill[iter_dataset])
            ### Taking last date value:
            ser_iter_last = ser_iter_ann[date_last]
            ser_iter_world_last = ser_iter_world_ann[date_last]
            ### Calculating of weighted average of returns:
            df_to_weight = ser_iter_last.to_frame().join(ser_iter_ret).join(ser_iter_world_last)
            df_to_weight.columns = ['Country_Export', 'Ret_USD', 'World_Export']
            ### Weight is the ratio of the two export columns. It is a row-wise operation, so no grouping is needed
            ### (a per-group apply() gives the same values, but is slower and changes the result index in pandas 2.x):
            df_to_weight['Weight'] = df_to_weight['Country_Export'] / df_to_weight['World_Export']
            ### Simple weighted average:
#            dict_iter_weighted[iter_dataset] = df_to_weight.groupby('Reporter').apply(get_weighted_ret)
            ### Weighted average with threshold by partners number:
#            dict_iter_weighted[iter_dataset] = df_to_weight.groupby('Reporter').apply(get_weighted_ret, int_select_top)
            ### Weighted average with threshold by share:
            dict_iter_weighted[iter_dataset] = df_to_weight.groupby('Reporter').apply(get_weighted_ret, None, flo_select_share)
            ### Weighted average with both thresholds:
#            dict_iter_weighted[iter_dataset] = df_to_weight.groupby('Reporter').apply(get_weighted_ret, int_select_top, flo_select_share)
    if (len(dict_iter_weighted) > 0):
        ### Daily results aggregation (one column per dataset):
        dict_all_weighted[iter_date] = pd.concat(dict_iter_weighted, axis = 1, sort = False)
    ### Reporting only when the number of contributing datasets changes:
    if (int_len != len(dict_iter_weighted)):
        int_len = len(dict_iter_weighted)
        print(iter_date, ':', str(int_len))
### Global results aggregation (only the columns axis is squeezed, so a single-row result stays a series instead of collapsing to a scalar):
ser_all_weighted = pd.concat(dict_all_weighted, axis = 0, sort = False).squeeze(axis = 1)
ser_all_weighted.index.names = ['Date', 'Country']
### Global results saving:
### NOTE(review): df_all_weighted is not created in this cell - it has to exist in the kernel already and is presumably expected
### to carry the same (Date, Country) index as ser_all_weighted; verify it before running, since the file is overwritten below.
df_all_weighted['imf_dots'] = ser_all_weighted
df_all_weighted.to_hdf(str_path_ret_weighted, str_key_weighted, mode = 'w', format = 'table')

2020-03-31 00:00:00 : 1


In [105]:
### RUN EVERY TIME: ADDING WEIGHTED RETURNS TO DATA COLLECTION

### Loading saved weighted returns and moving the 'Country' index level to a column:
df_all_weighted = pd.read_hdf(str_path_ret_weighted, key = str_key_weighted)
df_all_weighted = df_all_weighted.reset_index('Country')
### Converting country codes to plain strings:
df_all_weighted['Country'] = df_all_weighted['Country'].astype(str)
### Returning 'Country' to the index and stacking the columns into a single series:
df_weighted_indexed = df_all_weighted.set_index('Country', append = True)
ser_weighted = df_weighted_indexed.stack()
ser_weighted.index.names = ['Date', 'Country', 'Activity']
### Adding the series to the results collection file:
ser_weighted.to_hdf(str_path_gravity_results, key = str_key_ret_weighted, mode = 'a')

In [32]:
### RUN EVERY TIME: HDF TO CSV

### Options shared by all the CSV exports below (semicolon separated, no header row):
dict_csv_options = {'sep': ';', 'header': False}
### Activity sums (gaps are exported as zeros):
ser_activity_sum = pd.read_hdf(str_path_gravity_results, key = str_key_activity_sum)
ser_activity_sum.fillna(0.0).to_csv(str_activity_sum_csv_path, **dict_csv_options)
### Activity shares (zeros are exported as gaps):
ser_activity_share = pd.read_hdf(str_path_gravity_results, key = str_key_activity_share)
ser_activity_share.replace(0.0, np.NaN).to_csv(str_activity_share_csv_path, **dict_csv_options)
### GDP totals:
ser_gdp_total = pd.read_hdf(str_path_gravity_results, key = str_key_gdp_total)
ser_gdp_total.to_csv(str_gdp_total_csv_path, **dict_csv_options)
### Herfindahl values:
ser_herfindahl = pd.read_hdf(str_path_gravity_results, key = str_key_herfindahl)
ser_herfindahl.to_csv(str_herfindahl_csv_path, **dict_csv_options)
### Openness values:
ser_openess = pd.read_hdf(str_path_gravity_results, key = str_key_openess)
ser_openess.to_csv(str_openess_csv_path, **dict_csv_options)
### Weighted returns:
ser_ret_weighted = pd.read_hdf(str_path_gravity_results, key = str_key_ret_weighted)
ser_ret_weighted.to_csv(str_ret_weighted_csv_path, **dict_csv_options)

In [None]:
### TEMP

### Reading the exported weighted returns back from CSV (first column parsed as dates) and showing the rows without gaps:
df_ret_weighted_csv = pd.read_csv(str_ret_weighted_csv_path, sep = ';', header = None, parse_dates = [0])
df_ret_weighted_csv.dropna()

In [108]:
### TEMP

### Re-exporting the stored weighted returns to CSV (semicolon separated, no header row):
ser_ret_weighted = pd.read_hdf(str_path_gravity_results, key = str_key_ret_weighted)
dict_ret_csv_options = {'sep': ';', 'header': False}
ser_ret_weighted.to_csv(str_ret_weighted_csv_path, **dict_ret_csv_options)
### Spot check of a single value:
#ser_ret_weighted['2000-12-29', 'US']

In [None]:
### TEMP

### RUN EVERY TIME: WEIGHTED AVERAGE OF RETURNS CALCULATION

gc.collect()
### Weighted returns calculator:
def get_weighted_ret(df_group, int_select_top = None, flo_select_share = None):
    """
    Weighted average of the 'Ret_USD' column of a group, using the 'Weight' column as weights.

    Parameters:
        df_group : DataFrame containing at least the 'Weight' and 'Ret_USD' columns;
        int_select_top : optional number of rows with the largest weights to keep;
        flo_select_share : optional level, only rows with a weight strictly above it are kept.
    Returns:
        sum(Weight * Ret_USD) / sum(Weight) over the retained rows, or NaN when no rows are left.
    """
    ### Rows with a missing value in any column can not take part in the average:
    df_group = df_group.dropna()
    ### Keeping only the required number of the heaviest rows:
    if (int_select_top is not None):
        df_group = df_group.sort_values('Weight', ascending = False).iloc[ : int_select_top]
    ### Keeping only the rows with a sufficient weight:
    if (flo_select_share is not None):
        df_group = df_group[df_group['Weight'] > flo_select_share]
    ### np.nan is used, since the np.NaN alias was removed in NumPy 2.0:
    flo_weighted = np.nan
    if (len(df_group.index) > 0):
        ### Dividing by the retained weights sum, so the retained weights do not have to add up to one:
        flo_weighted = (df_group['Weight'] * df_group['Ret_USD']).sum() / df_group['Weight'].sum()
    return flo_weighted
### Global container (not filled in this debugging cell):
dict_all_weighted = {}
### Number of datasets active:
int_len = -1
### Looping over returns dates (a single test date; the commented lines hold the wider alternatives):
#for iter_date in sorted(ser_ret_ave.dropna().index.get_level_values('Date').unique()):
#for iter_date in [pd.to_datetime('2020-03-31'), pd.to_datetime('2020-04-30')]:
for iter_date in [pd.to_datetime('2020-03-18')]:
    ### Dates defining (the latest business month end that is not later than the returns date):
    date_bm_end = iter_date + pd.offsets.BMonthEnd(0)
    if (date_bm_end > iter_date):
        date_bm_end = date_bm_end - pd.offsets.BMonthEnd(1)
    print(iter_date, '/', date_bm_end)
    ### Daily returns extraction:
    ser_iter_ret = ser_ret_ave.loc[iter_date]
    ### Daily container:
    dict_iter_weighted = {}
    ### Looping over datasets (limited to IMF CPIS; the commented line iterates over all datasets):
#    for iter_dataset in dict_dataset:
    for iter_dataset in ['imf_cpis']:
        ### Loading raw dataset up to the business month end lagged by the dataset specific number of months:
        ser_iter_raw = dict_dataset[iter_dataset].loc[: (date_bm_end - pd.offsets.BMonthEnd(dict_lag[iter_dataset]))]
        if (ser_iter_raw.count() > 0):
            ### Defining last date by lagging original dataset (takes the last index entry, so assumes an ascending 'Date' level - TODO confirm):
            date_last = ser_iter_raw.index.get_level_values('Date')[-1]
            date_prev = date_last - pd.offsets.BMonthEnd(dict_ffill[iter_dataset])
            print(iter_date, '/', date_bm_end, '/', date_prev, ':', date_last)
            ### Perform lagging & forward filling on annualized dataset:
            ser_iter_ann = dict_annual[iter_dataset].loc[date_prev : date_last].groupby(['Reporter', 'Partner']).ffill(limit = dict_ffill[iter_dataset])
            ### Taking last date value:
            ser_iter_last = ser_iter_ann[date_last]
            ### Calculating of weighted average of returns:
            df_to_weight = ser_iter_last.to_frame().join(ser_iter_ret)
            df_to_weight.columns = ['Weight', 'Ret_USD']
            ### Rescaling weights to shares of the reporter total. A like-indexed transform is used instead of groupby().apply(),
            ### because apply() prepends the group key to the result index in pandas 2.x and the aligned assignment fails there:
            df_to_weight['Weight'] = df_to_weight['Weight'] / df_to_weight['Weight'].groupby('Reporter').transform('sum')
            ### Weighted average with threshold by share:
            dict_iter_weighted[iter_dataset] = df_to_weight.groupby('Reporter').apply(get_weighted_ret, None, flo_select_share)


In [None]:
### TEMP

### Distinct dates that have a non-missing return, shown in ascending order:
idx_ret_dates = ser_ret_ave.dropna().index.get_level_values('Date').unique()
sorted(idx_ret_dates)