In [1]:
### RUN EVERY TIME: GRAVITY DATASETS EXPLORING

In [2]:
### RUN EVERY TIME: INITIALIZATION

import pandas as pd
import numpy as np
import math
import gc
import datetime
import os

In [3]:
### RUN EVERY TIME: VERSION CONTROL

from platform import python_version
print('pandas version: ', pd.__version__)
print('python version: ', python_version())

pandas version:  0.25.3
python version:  3.7.4


In [4]:
### RUN EVERY TIME: MAIN CONSTANTS

### MultiIndex level slice constant:
All = slice(None)
### Regions list:
list_regions = ['DM', 'EM', 'FM']
### Universe path:
str_path_universe = 'Data_Files/Source_Files/acadian_universe.xlsx'
### Activities naming:
dict_activity = {}
dict_activity['imf_dots'] = 'Trade Export'
dict_activity['imf_cpis'] = 'Portfolio Investment'
dict_activity['oecd_fdi'] = 'Direct Investment'
dict_activity['bis_lbs'] = 'Bank Lending'
dict_activity['gravity'] = 'Gravity'
### CEPII dataset:
str_path_cepii_dataset = 'Data_Files/Source_Files/cepii_dataset.h5'
str_distance_dataset = 'distance_dataset'
### WB WDI GDP dataset:
str_path_wb_gdp_dataset = 'Data_Files/Source_Files/gdp_dataset.h5'
str_wb_gdp_dataset = 'gdp_dataset'
### BIS Loans dataset:
str_path_bis_lbs_combined = 'Data_Files/Source_Files/bis_combined.h5'
str_full_bis_lbs_combined = 'bis_full_combined'
### IMF CPIS dataset:
str_path_imf_cpis_combined = 'Data_Files/Source_Files/cpis_combined.h5'
str_full_imf_cpis_combined = 'cpis_full_combined'
### Filtered IMF CPIS dataset:
str_path_imf_cpis_filtered = 'Data_Files/Source_Files/cpis_filtered.h5'
str_key_imf_cpis_filtered = 'cpis_filtered'
### IMF DOTS datasets:
str_path_imf_dots_combined = 'Data_Files/Source_Files/dots_combined.h5'
str_full_imf_dots_combined = 'dots_full_combined'
str_path_imf_dots_world = 'Data_Files/Source_Files/dots_world_export.h5'
str_full_imf_dots_world = 'dots_world_export'
### OECD FDI dataset:
str_path_oecd_fdi_combined = 'Data_Files/Source_Files/oecd_combined.h5'
str_full_oecd_fdi_combined = 'oecd_full_combined'
str_path_direct_out_net = 'Data_Files/Source_Files/direct_outward_net.h5'
str_key_direct_out_net = 'direct_outward'
### Technical Constants:
date_start = pd.Timestamp('1989-12-29')
str_date_end = '2022-12-31'
date_end = pd.Timestamp(str_date_end)
date_ison = pd.Timestamp('1994-12-31')
### Distance power for gravity calculation:
flo_dist_power = 1 / 2
### Bloomberg structured data extraction parameters:
str_path_bb_hdf = 'Data_Files/Source_Files/Bloomberg_prepared.h5'
str_key_ret_daily = 'bb_ret_daily'
str_ret_daily_csv_path = 'Data_Files/Source_Files/ret_daily.csv'
### Saved annualized activities:
str_path_act_annualized = 'Data_Files/Source_Files/datasets_annualized.h5'
str_path_act_weights = 'Data_Files/Source_Files/datasets_weights.h5'
str_path_world_export_annualized = 'Data_Files/Source_Files/world_export_annualized.h5'
str_key_world_export_annualized = 'world_export_ann'
### Herfindahl index threshold:
flo_hi_threshold = 0.0 # 1.1
### Saved herfindahl values:
str_path_herfindahl = 'Data_Files/Source_Files/herfindahl_indices.h5'
### Returns average parameters:
int_ave_months = 6
int_halflife_months = 2
### Thresholds for weighted cross-sectional average returns calculation:
int_select_top = 3
flo_select_share = 0.05
### Saved weighted returns:
str_path_ret_weighted = 'Data_Files/Source_Files/returns_weighted.h5'
str_key_weighted = 'ret_weighted'
### Saved export data:
str_path_gravity_results = 'Data_Files/Source_Files/gravity_export.h5'
str_key_activity_sum = 'activity_sum'
str_key_activity_share = 'activity_share'
str_key_gdp_total = 'gdp_total'
str_key_herfindahl = 'herfindahl_index'
str_key_openess = 'openess_measure'
str_key_ret_weighted = 'ret_weighted'
### CSV to Export data:
str_activity_sum_csv_path = 'Data_Files/Test_Files/activity_sum.csv'
str_activity_share_csv_path = 'Data_Files/Test_Files/activity_share.csv'
str_gdp_total_csv_path = 'Data_Files/Test_Files/gdp_total.csv'
str_herfindahl_csv_path = 'Data_Files/Test_Files/herfindahl.csv'
str_openess_csv_path = 'Data_Files/Test_Files/openess.csv'
str_ret_weighted_csv_path = 'Data_Files/Test_Files/ret_weighted.csv'

In [5]:
### RUN EVERY TIME: DATASETS LOADING

dict_dataset = {}
dict_dataset['imf_dots'] = pd.read_hdf(path_or_buf = str_path_imf_dots_combined, key = str_full_imf_dots_combined).droplevel('Market').sort_index()['Export_Augmented']
#dict_dataset['imf_cpis'] = pd.read_hdf(path_or_buf = str_path_imf_cpis_combined, key = str_full_imf_cpis_combined).droplevel('Market').sort_index()['Asset_Augmented']
#dict_dataset['imf_cpis'] = pd.read_hdf(path_or_buf = str_path_imf_cpis_filtered, key = str_key_imf_cpis_filtered)
#dict_dataset['imf_cpis'].loc[dict_dataset['imf_cpis'] < 0.0] = np.NaN
#dict_dataset['oecd_fdi'] = pd.read_hdf(path_or_buf = str_path_oecd_fdi_combined, key = str_full_oecd_fdi_combined).droplevel('Market').sort_index()['Asset']
#dict_dataset['oecd_fdi'].loc[dict_dataset['oecd_fdi'] < 0.0] = np.NaN
#dict_dataset['oecd_fdi'] = pd.read_hdf(path_or_buf = str_path_direct_out_net, key = str_key_direct_out_net)
#dict_dataset['oecd_fdi'].loc[dict_dataset['oecd_fdi'] < 0.0] = np.NaN
#dict_dataset['bis_lbs'] = pd.read_hdf(path_or_buf = str_path_bis_lbs_combined, key = str_full_bis_lbs_combined)\
#                            .set_index(['Date', 'Reporter', 'Partner']).sort_index()['Claim_Augmented']

gc.collect()

0

In [6]:
### RUN EVERY TIME: GRAVITY DATASET CONSTRUCTION

### GDP loading:
ser_gdp = pd.read_hdf(path_or_buf = str_path_wb_gdp_dataset, key = str_wb_gdp_dataset)
### Distances loading:
ser_dist = pd.read_hdf(path_or_buf = str_path_cepii_dataset, key = str_distance_dataset)['distw']
### Distances naming:
ser_dist.index.names = ['Reporter', 'Partner']
ser_dist.name = 'Distance'
### Dropping internal distances:
df_dist = ser_dist.reset_index()
df_dist.drop(df_dist[df_dist['Reporter'] == df_dist['Partner']].index, inplace = True)
ser_dist = df_dist.set_index(['Reporter', 'Partner']).squeeze().sort_index()
### GDP duplicating:
ser_gdp_reporter = ser_gdp[:]
ser_gdp_reporter.index.names = ['Date', 'Reporter']
ser_gdp_reporter.name = 'GDP_Reporter'
ser_gdp_partner = ser_gdp[:]
ser_gdp_partner.index.names = ['Date', 'Partner']
ser_gdp_partner.name = 'GDP_Partner'
### Reporters data connecting:
df_reporter = ser_dist.to_frame().join(ser_gdp_reporter).sort_index()
### Partners data connecting:
df_partner = ser_dist.to_frame().join(ser_gdp_partner).drop('Distance', axis = 1).sort_index()
df_partner = df_partner.reorder_levels([1, 0, 2])
### Joining data and Gravity calculation:
df_gravity = pd.concat([df_reporter, df_partner], axis = 1)
df_gravity = df_gravity.reset_index('Date').dropna(subset = ['Date']).set_index('Date', append = True).reorder_levels([2, 0, 1]).sort_index()
#display(df_gravity.loc[('2020-12-31', ['US', 'IL'], ['US', 'IL']), :])
ser_gravity = (df_gravity['GDP_Reporter'] / 10 ** 9) * (df_gravity['GDP_Partner'] / 10 ** 9) / (df_gravity['Distance'] ** flo_dist_power)
### Adding gravity to activities:
dict_dataset['gravity'] = ser_gravity.sort_index()

In [8]:
### ANNUALIZATION CONSTANTS INITIALIZATION

gc.collect()
### Lags initialization:
dict_lag = {}
dict_lag['imf_dots'] = 3
#dict_lag['imf_cpis'] = 6
#dict_lag['oecd_fdi'] = 24
#dict_lag['bis_lbs'] = 6
dict_lag['gravity'] = 9
### Periods to fill initialization:
dict_ffill = {}
dict_ffill['imf_dots'] = 1
#dict_ffill['imf_cpis'] = 12
#dict_ffill['oecd_fdi'] = 12
#dict_ffill['bis_lbs'] = 3
dict_ffill['gravity'] = 12

In [208]:
### TEMP

gc.collect()

iter_activity = 'imf_dots'
#iter_activity = 'bis_lbs'
#iter_activity = 'gravity'
int_lag = dict_lag[iter_activity]
int_period = dict_ffill[iter_activity]

ser_group = dict_dataset[iter_activity].loc[:, ['US'], ['CN']]

if True:
    ser_group = ser_group.droplevel(['Reporter', 'Partner'])
    ser_group = ser_group.append(pd.Series(np.NaN, index = [ser_group.index[0] - pd.offsets.BMonthEnd(int_period)])).sort_index()
    idx_dates = pd.date_range(ser_group.index[0], date_end, freq = 'BM')
    ser_result = pd.Series(np.NaN, index = idx_dates)    
    for iter_date in idx_dates:
#    for iter_date in idx_dates[-5 :]:        
#    for iter_date in [pd.to_datetime('1973-07-31')]:
        ### Dates defining:
        date_bm_end = iter_date + pd.offsets.BMonthEnd(0)
        if (date_bm_end > iter_date):
            date_bm_end = date_bm_end - pd.offsets.BMonthEnd(1)      
        date_known_end = date_bm_end - pd.offsets.BMonthEnd(int_lag)
        ser_to_date = ser_group.loc[date_known_end - pd.offsets.BMonthEnd(12 + int_period): date_known_end]
        if len(ser_to_date >= 12):
            ser_resampled = (ser_to_date / int_period).resample('BM').bfill()
            ser_prolonged = ser_resampled[:-1].append(pd.Series(ser_resampled[-1], index = pd.date_range(ser_resampled.index[-1], date_bm_end, freq = 'BM')))
            ser_annualized = ser_prolonged.rolling(12, 12).sum()
            flo_result = ser_annualized[-1]
            ser_result[iter_date] = ser_annualized[-1]      
#        break

In [10]:
### TEMP

#print(iter_date.date())
#print(ser_to_date)
#print(ser_resampled)
#print(ser_prolonged)
#print(flo_result)
#print(ser_prolonged[-12 :].sum())
#ser_result.dropna()[-20 :]

In [11]:
### TEMP

gc.collect()

def annualize_with_lag(ser_group, int_lag, int_period):
    ser_group = ser_group.droplevel(['Reporter', 'Partner'])
    ser_group = ser_group.append(pd.Series(np.NaN, index = [ser_group.index[0] - pd.offsets.BMonthEnd(int_period)])).sort_index()
    idx_dates = pd.date_range(ser_group.index[0], date_end, freq = 'BM')
    ser_result = pd.Series(np.NaN, index = idx_dates)    
    for iter_date in idx_dates:
#    for iter_date in idx_dates[-5 :]:        
#    for iter_date in [pd.to_datetime('1973-07-31')]:
        ### Dates defining:
        date_bm_end = iter_date + pd.offsets.BMonthEnd(0)
        if (date_bm_end > iter_date):
            date_bm_end = date_bm_end - pd.offsets.BMonthEnd(1)      
        date_known_end = date_bm_end - pd.offsets.BMonthEnd(int_lag)
        ser_to_date = ser_group.loc[date_known_end - pd.offsets.BMonthEnd(12 + int_period): date_known_end]
        if len(ser_to_date >= 12):
            ser_resampled = (ser_to_date / int_period).resample('BM').bfill()
            ser_prolonged = ser_resampled[:-1].append(pd.Series(ser_resampled[-1], index = pd.date_range(ser_resampled.index[-1], date_bm_end, freq = 'BM')))
            ser_annualized = ser_prolonged.rolling(12, 12).sum()
#            flo_result = ser_annualized[-1]
            ser_result[iter_date] = ser_annualized[-1]      
#        break
    ### Results output:
    return ser_result
### Deleting existing file with annualized data:
if os.path.exists(str_path_act_annualized):
    os.remove(str_path_act_annualized)
    print(str_path_act_annualized, 'File removed')
### Looping over activities:    
for iter_activity in dict_lag:
    ser_iter_raw = dict_dataset[iter_activity]#.loc[:, ['US'], ['CN']]
    ser_iter_ann = ser_iter_raw.groupby(['Reporter', 'Partner']).apply(annualize_with_lag, dict_lag[iter_activity], dict_ffill[iter_activity])\
                                                                .astype('float32').reorder_levels([2, 0, 1]).sort_index()
#    break
    ser_iter_ann.index.names = ['Date', 'Reporter', 'Partner']
    ser_iter_ann.name = iter_dataset + '_ann'
    ser_iter_ann.to_hdf(str_path_act_annualized, iter_activity, mode = 'a', format = 'table')
    print(iter_dataset, ': annualizing done')    

Data_Files/Source_Files/datasets_annualized.h5 File removed


NameError: name 'iter_dataset' is not defined

In [15]:
### TEMP

ser_iter_ann.to_hdf(str_path_act_annualized, iter_activity, mode = 'a', format = 'table')

In [None]:
### RUN EVERY TIME: GDP & OPENESS MEASURE ANNUALIZATION

### Annualization of differewnt frequency data definition:
def annualize_with_lag(ser_group, int_lag, int_period):
    ser_group = ser_group.droplevel(['Country'])
    ser_group = ser_group.append(pd.Series(np.NaN, index = [ser_group.index[0] - pd.offsets.BMonthEnd(int_period)])).sort_index()
    idx_dates = pd.date_range(ser_group.index[0], date_end, freq = 'BM')
    ser_result = pd.Series(np.NaN, index = idx_dates)    
    for iter_date in idx_dates:
#    for iter_date in idx_dates[-5 :]:        
#    for iter_date in [pd.to_datetime('1973-07-31')]:
        ### Dates defining:
        date_bm_end = iter_date + pd.offsets.BMonthEnd(0)
        if (date_bm_end > iter_date):
            date_bm_end = date_bm_end - pd.offsets.BMonthEnd(1)      
        date_known_end = date_bm_end - pd.offsets.BMonthEnd(int_lag)
        ser_to_date = ser_group.loc[date_known_end - pd.offsets.BMonthEnd(12 + int_period): date_known_end]
        if len(ser_to_date >= 12):
            ser_resampled = (ser_to_date / int_period).resample('BM').bfill()
            ser_prolonged = ser_resampled[:-1].append(pd.Series(ser_resampled[-1], index = pd.date_range(ser_resampled.index[-1], date_bm_end, freq = 'BM')))
            ser_annualized = ser_prolonged.rolling(12, 12).sum()
#            flo_result = ser_annualized[-1]
            ser_result[iter_date] = ser_annualized[-1]      
#        break
    ### Results output:
    return ser_result
### GDP Loading:
ser_gdp = pd.read_hdf(path_or_buf = str_path_wb_gdp_dataset, key = str_wb_gdp_dataset)
### GDP annualization:
ser_gdp_ann = (ser_gdp.groupby(['Country']).apply(annualize_with_lag, dict_lag['gravity'], dict_ffill['gravity'])\
                                           .astype('float32') / 1000000).reorder_levels([1, 0]).sort_index()
ser_gdp_ann.index.names = ['Date', 'Country']
ser_gdp_ann.to_hdf(str_path_gravity_results, str_key_gdp_total, mode = 'a')
### Export data loading & converting:
ser_export_ann = pd.read_hdf(path_or_buf = str_path_act_annualized, key = 'imf_dots')
### Export sum calculation:
ser_export_sum = ser_export_ann.groupby(['Date', 'Reporter']).sum()
ser_export_sum.index.names = ['Date', 'Country']
### Openess measure saving:
(ser_export_sum / ser_gdp_ann).to_hdf(str_path_gravity_results, str_key_openess, mode = 'a')

In [None]:
### RUN EVERY TIME: WORLD EXPORT ANNUALIZATION

### Annualization of different frequency data definition:
def annualize_with_lag(ser_group, int_lag, int_period):
    ser_group = ser_group.droplevel(['Reporter'])
    ser_group = ser_group.append(pd.Series(np.NaN, index = [ser_group.index[0] - pd.offsets.BMonthEnd(int_period)])).sort_index()
    idx_dates = pd.date_range(ser_group.index[0], date_end, freq = 'BM')
    ser_result = pd.Series(np.NaN, index = idx_dates)    
    for iter_date in idx_dates:
#    for iter_date in idx_dates[-5 :]:        
#    for iter_date in [pd.to_datetime('1973-07-31')]:
        ### Dates defining:
        date_bm_end = iter_date + pd.offsets.BMonthEnd(0)
        if (date_bm_end > iter_date):
            date_bm_end = date_bm_end - pd.offsets.BMonthEnd(1)      
        date_known_end = date_bm_end - pd.offsets.BMonthEnd(int_lag)
        ser_to_date = ser_group.loc[date_known_end - pd.offsets.BMonthEnd(12 + int_period): date_known_end]
        if len(ser_to_date >= 12):
            ser_resampled = (ser_to_date / int_period).resample('BM').bfill()
            ser_prolonged = ser_resampled[:-1].append(pd.Series(ser_resampled[-1], index = pd.date_range(ser_resampled.index[-1], date_bm_end, freq = 'BM')))
            ser_annualized = ser_prolonged.rolling(12, 12).sum()
#            flo_result = ser_annualized[-1]
            ser_result[iter_date] = ser_annualized[-1]      
#        break
    ### Results output:
    return ser_result
### World Export Loading:
ser_world_export = pd.read_hdf(path_or_buf = str_path_imf_dots_world, key = str_full_imf_dots_world)
### World Export annualization:
ser_world_export_ann = ser_world_export.groupby(['Reporter']).apply(annualize_with_lag, dict_lag['imf_dots'], dict_ffill['imf_dots'])\
                                                             .astype('float32').reorder_levels([1, 0]).sort_index()
ser_world_export_ann.index.names = ['Date', 'Reporter']
ser_world_export_ann.to_hdf(str_path_world_export_annualized, str_key_world_export_annualized, mode = 'w')