In [1]:
### RUN EVERY TIME: GRAVITY SOURCE DATASETS EXTRACTING

In [2]:
### RUN EVERY TIME: INITIALIZATION

import pandas as pd
import numpy as np
import requests
import json ### To correct JSON structure before unpacking
import gc
import os
import datetime
import time

In [3]:
### NEED TO BE ADOPTED: DEFINING COUNTRY CODES EXTRACTOR

def get_country_codes(use_local_copy = False):
    """
    Load the country code table and return ISO short / long codes per country.

    Parameters
    ----------
    use_local_copy : bool, default False
        When truthy, the table is read from the saved HTML page
        'Data_Files/Source_Files/countrycode.html'; otherwise it is read
        from 'https://countrycode.org/'.

    Returns
    -------
    pandas.DataFrame
        Columns 'ISO SHORT' and 'ISO LONG', indexed by the upper-cased values
        of the 'COUNTRY' column of the first table found in the source.
    """
    ### Source selection (the local copy is a fallback for an unavailable URL):
    str_source = 'Data_Files/Source_Files/countrycode.html' if use_local_copy else 'https://countrycode.org/'
    ### First HTML table of the page, indexed by country name:
    df_codes = pd.read_html(str_source, index_col = 'COUNTRY')[0]
    ### The 'ISO CODES' column keeps both codes in one string separated by ' / ':
    df_iso = df_codes['ISO CODES'].str.split(' / ', expand = True)
    df_iso.columns = ['ISO SHORT', 'ISO LONG']
    ### Sorting by the original country names, then upper-casing them:
    df_iso = df_iso.sort_index()
    df_iso.index = df_iso.index.str.upper()
    ### Results output:
    return df_iso

In [4]:
### NEED TO BE ADOPTED: DEFINING EXTRACTION UNIVERSE DATA FROM MS EXCEL SOURCE (TO BE IGNORED IN PRODUCT CODE)

def ison_membership_converting(str_path_universe, date_end, bool_daily = False, int_backfill_months = 0):
    """
    Convert the universe membership history from the MS Excel source to a market membership series.

    The 'Switchers' sheet is read with its first two columns as a (Country, Date) index. The 'Region'
    column is reindexed per country to business month ends up to date_end (forward filled) and its
    codes are translated: 50 -> 'DM', 57 -> 'EM', 504 -> 'FM', 0 or missing -> NaN.

    Parameters
    ----------
    str_path_universe : str
        Path to the MS Excel workbook.
    date_end : datetime-like
        Last date of the per-country business-month-end range.
    bool_daily : bool, default False
        When truthy, the result is forward filled to business-day frequency.
    int_backfill_months : int, default 0
        When non-zero, the countries that belong to a region on the first date this region appears
        are additionally assigned to it for this number of preceding business month ends
        (existing values are never overwritten).

    Returns
    -------
    pandas.Series
        Series named 'Market' indexed by (Date, Country).
    """
    ### Frequency object instead of the 'BM' string alias (the alias is deprecated in recent pandas versions):
    freq_month_end = pd.offsets.BMonthEnd()
    ### Local full-level slice, so the function does not depend on a global defined in another cell:
    slice_all = slice(None)
    ### Defining business-month-end reindexation on country level:
    def country_modify(ser_raw_country, date_end):
        ser_res_country = ser_raw_country.droplevel(0).resample('MS').last().resample(freq_month_end).last()
        range_country = pd.date_range(ser_res_country.index[0], date_end, freq = freq_month_end)
        return ser_res_country.reindex(range_country).ffill()
    ### Markets encoding table (np.nan instead of the np.NaN alias removed in NumPy 2.0):
    dict_markets = {50 : 'DM', 57 : 'EM', 504 : 'FM', 0: np.nan}
    ### Loading source file:
    df_raw_universe = pd.read_excel(engine = 'openpyxl', io = str_path_universe, sheet_name = 'Switchers', header = 0, parse_dates = True, index_col = [0, 1],
                                 na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND',
                                             '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null'], keep_default_na = False)
    ### Converting source file (missing regions are encoded as 0, which is translated to NaN below):
    df_raw_universe.index.names = ['Country', 'Date']
    ser_raw_universe = df_raw_universe['Region'].fillna(0)
    ser_raw_universe.name = 'Market'
    ### By country reindexation and translation:
    ser_res_universe = ser_raw_universe.groupby('Country').apply(country_modify, date_end)
    ser_res_universe.index.names = ['Country', 'Date']
    ser_res_universe = ser_res_universe.replace(dict_markets).reorder_levels([1, 0]).sort_index()
    ### Expanding membership for primary regions members by backfilling:
    if int_backfill_months:
        ### List of regions:
        list_region = list(ser_res_universe.dropna().unique())
        ### Initialising of collection of series with backfilled data for each region:
        list_ison_backfill = []
        ### Regions looping:
        for iter_region in list_region:
            ser_region = ser_res_universe.loc[ser_res_universe == iter_region]
            ### Defining start of region date:
            date_first_valid = ser_region.first_valid_index()[0]
            ### Creating dates index to backfilling (the start date itself is excluded):
            idx_date_backfill = pd.date_range(end = date_first_valid, periods = int_backfill_months + 1, freq = freq_month_end)[: -1]
            ### Creating primary countries index to backfilling:
            idx_region_backfill = ser_region.loc[date_first_valid, slice_all].index.get_level_values('Country')
            ### Creating full index:
            idx_ison_backfill = pd.MultiIndex.from_product([idx_date_backfill, idx_region_backfill])
            ### Series with backfilled data:
            list_ison_backfill.append(pd.Series(iter_region, index = idx_ison_backfill))
        ### Combination of backfilled series and original ISON data:
        ser_res_universe = ser_res_universe.combine_first(pd.concat(list_ison_backfill, axis = 0)).sort_index()
        ser_res_universe.index.names = ['Date', 'Country']
    ### The name is restored here because combine_first may drop it, while the daily conversion selects the 'Market' column:
    ser_res_universe.name = 'Market'
    ### Converting to daily frequency:
    if bool_daily:
        ser_res_universe = ser_res_universe.reset_index('Country').groupby('Country').resample('B').ffill()['Market'].swaplevel().sort_index()
    ### Results output:
    ser_res_universe.name = 'Market'
    return ser_res_universe

In [5]:
### NEED TO BE ADOPTED: COMMON DATA EXTRACTION STEPS

### Shared inputs for the extraction cells below: country code table, membership history and the
### country lists derived from them.
### MultiIndex level slice constant (selects a whole level inside .loc[] on a MultiIndex):
All = slice(None)
### Universe path:
str_path_universe = 'Data_Files/Source_Files/acadian_universe.xlsx'
### Technical Constants (last date of the membership history):
str_date_end = '2022-10-31'
### World Country Codes (read online, since use_local_copy is left at its default):
df_country_codes = get_country_codes()
### ISON membership history (monthly, no backfilling):
ser_ison_membership = ison_membership_converting(str_path_universe, pd.to_datetime(str_date_end))
### ISON SHORT IDs list (sorted unique values of index level 1, i.e. the 'Country' level):
list_ison_countries = sorted(list(map(str, ser_ison_membership.index.get_level_values(1).unique())))
### ISON LONG IDs list ('ISO LONG' codes of the rows whose 'ISO SHORT' code is a universe member):
list_ison_long = list(df_country_codes.loc[df_country_codes['ISO SHORT'].isin(ser_ison_membership.index.get_level_values('Country').unique()), 'ISO LONG'].values)

In [6]:
### IMF DOTS: BILATERAL EXPORTS & IMPORTS (MILLIONS OF USD)

In [7]:
### NEED TO BE ADOPTED: IMF DOTS: DATA SAVING PARAMETERS

### HDF5 paths and keys used by the IMF DOTS cells.
### Loaded dataset (raw export and inverted import series share one file):
str_path_imf_dots_dataset = 'Data_Files/Source_Files/dots_dataset.h5'
str_key_imf_dots_export = 'dots_export'
str_key_imf_dots_import_inverted = 'dots_import_inverted'
### Resulting dataset (export series augmented with corrected import data):
str_path_imf_dots_augmented = 'Data_Files/Source_Files/dots_augmented_unconditional.h5'
str_key_imf_dots_augmented = 'dots_export_augmented'

In [8]:
### IMF DOTS: LOADING PARAMETERS PREPARATION

### Constants (request URL parts for the IMF SDMX JSON service and DOTS dataset identifiers):
All = slice(None)
str_imf_base_url = 'http://dataservices.imf.org/REST/SDMX_JSON.svc/'
str_imf_dataset_add = 'CompactData/'
str_imf_dots_id = 'DOT'
### Frequency code put into the request key:
str_dots_freq = 'M'
### Maximum number of countries per request (to comply with the API limits);
### NOTE(review): not referenced by the DOTS retrieval cells visible here - confirm whether it is still needed:
int_imf_country_limit = 30

In [None]:
### IMF DOTS: EXPORT DATA EXTRACTION: BILATERAL FLOWS

### Downloading bilateral export flows (indicator TXG_FOB_USD) reported by each universe member
### and collecting them to ser_dots_export indexed by (Date, Reporter, Partner).
### Beginning of request URL:
str_dots_const_url = str_imf_base_url + str_imf_dataset_add + str_imf_dots_id + '/'
str_dots_indicator = 'TXG_FOB_USD'
### Session initializing:
request_session = requests.Session()
### List of bilateral dataframes for future concatenation:
list_dots_bilateral = []
### Looping over reporters:
for str_reporter in list_ison_countries:
    ### Generating complete request URL (key structure: frequency.reporter.indicator):
    str_dots_full_url = str_dots_const_url + '.'.join([str_dots_freq, str_reporter, str_dots_indicator])
    ### Receiving DOTS dataset from IMF API:
    print(str_reporter, ' / ', str_dots_indicator)
    obj_dots_set = request_session.get(str_dots_full_url)
    ### Data reading as JSON:
    dict_dots_set = json.loads(obj_dots_set.text)
    if ('Series' in dict_dots_set['CompactData']['DataSet']):
        ### A response with a single series holds a dict instead of a list of dicts:
        list_series = dict_dots_set['CompactData']['DataSet']['Series']
        if not isinstance(list_series, list):
            list_series = [list_series]
        ### Converting each bilateral dataset to dataframe and its munging:
        for dict_dots_pair in list_series:
            ### Series without observations carry no data:
            if 'Obs' not in dict_dots_pair:
                continue
            ### A single observation is a dict instead of a list of dicts:
            list_obs = dict_dots_pair['Obs']
            if not isinstance(list_obs, list):
                list_obs = [list_obs]
            df_dots_bilateral = pd.DataFrame(list_obs)
            ### Series with no observation value at all are skipped:
            if '@OBS_VALUE' not in df_dots_bilateral.columns:
                continue
            ### Markers checking:
            if '@OBS_STATUS' not in df_dots_bilateral.columns:
                df_dots_bilateral['@OBS_STATUS'] = np.nan
            ### Data extracting and munging:
            df_dots_bilateral = df_dots_bilateral[['@TIME_PERIOD', '@OBS_VALUE', '@OBS_STATUS']]
            df_dots_bilateral.columns = ['Date', 'Value', 'Status']
            df_dots_bilateral = df_dots_bilateral.assign(Reporter_ID = dict_dots_pair['@REF_AREA'],
                                                         Partner_ID = dict_dots_pair['@COUNTERPART_AREA'])
            list_dots_bilateral.append(df_dots_bilateral)
### Flow level data aggregation:
df_dots_indicator = pd.concat(list_dots_bilateral, axis = 0, ignore_index = True)
### Period labels are moved to the last business day of the month:
df_dots_indicator['Date'] = pd.to_datetime(df_dots_indicator['Date']) + pd.offsets.BMonthEnd()
### Only partners with a known ISO short code are kept:
df_dots_indicator = df_dots_indicator[df_dots_indicator['Partner_ID'].isin(df_country_codes['ISO SHORT'].values)].drop('Status', axis = 1)
df_dots_indicator = df_dots_indicator.rename({'Reporter_ID': 'Reporter', 'Partner_ID': 'Partner'}, axis = 1)
### Data saving:
### NOTE(review): float16 keeps about 3 significant digits and overflows to inf above 65504 - confirm this precision is acceptable:
ser_dots_export = df_dots_indicator.set_index(['Date', 'Reporter', 'Partner'])['Value'].sort_index().astype('float16')
del df_dots_indicator
gc.collect()
### Saving is switched off; the aggregation cells read this key from the HDF5 file:
#ser_dots_export.to_hdf(path_or_buf = str_path_imf_dots_dataset, key = str_key_imf_dots_export, mode = 'w', format = 'fixed')

In [None]:
### IMF DOTS: IMPORT DATA EXTRACTION: BILATERAL FLOWS

### Downloading bilateral import flows (indicator TMG_CIF_USD) where each universe member is the counterpart,
### and collecting them inverted to ser_dots_import_inv indexed by (Date, Reporter, Partner):
### the universe member becomes Reporter and the reporting country becomes Partner.
### Beginning of request URL:
str_dots_const_url = str_imf_base_url + str_imf_dataset_add + str_imf_dots_id + '/'
str_dots_indicator = 'TMG_CIF_USD'
### Session initializing:
request_session = requests.Session()
### List of bilateral dataframes for future concatenation:
list_dots_bilateral = []
### Looping over reporters:
for str_reporter in list_ison_countries:
    ### Generating complete request URL (key structure: frequency.[all reporters].indicator.counterpart):
    str_dots_full_url = str_dots_const_url + '.'.join([str_dots_freq, '', str_dots_indicator, str_reporter])
    ### Receiving DOTS dataset from IMF API:
    print(str_reporter, ' / ', str_dots_indicator)
    obj_dots_set = request_session.get(str_dots_full_url)
    ### Data reading as JSON:
    dict_dots_set = json.loads(obj_dots_set.text)
    if ('Series' in dict_dots_set['CompactData']['DataSet']):
        ### A response with a single series holds a dict instead of a list of dicts:
        list_series = dict_dots_set['CompactData']['DataSet']['Series']
        if not isinstance(list_series, list):
            list_series = [list_series]
        ### Converting each bilateral dataset to dataframe and its munging:
        for dict_dots_pair in list_series:
            ### Series without observations carry no data:
            if 'Obs' not in dict_dots_pair:
                continue
            ### A single observation is a dict instead of a list of dicts:
            list_obs = dict_dots_pair['Obs']
            if not isinstance(list_obs, list):
                list_obs = [list_obs]
            df_dots_bilateral = pd.DataFrame(list_obs)
            ### Series with no observation value at all are skipped:
            if '@OBS_VALUE' not in df_dots_bilateral.columns:
                continue
            ### Markers checking:
            if '@OBS_STATUS' not in df_dots_bilateral.columns:
                df_dots_bilateral['@OBS_STATUS'] = np.nan
            ### Data extracting and munging:
            df_dots_bilateral = df_dots_bilateral[['@TIME_PERIOD', '@OBS_VALUE', '@OBS_STATUS']]
            df_dots_bilateral.columns = ['Date', 'Value', 'Status']
            df_dots_bilateral = df_dots_bilateral.assign(Reporter_ID = dict_dots_pair['@REF_AREA'],
                                                         Partner_ID = dict_dots_pair['@COUNTERPART_AREA'])
            list_dots_bilateral.append(df_dots_bilateral)
### Flow level data aggregation:
df_dots_indicator = pd.concat(list_dots_bilateral, axis = 0, ignore_index = True)
### Period labels are moved to the last business day of the month:
df_dots_indicator['Date'] = pd.to_datetime(df_dots_indicator['Date']) + pd.offsets.BMonthEnd()
### Only importing countries with a known ISO short code are kept:
df_dots_indicator = df_dots_indicator[df_dots_indicator['Reporter_ID'].isin(df_country_codes['ISO SHORT'].values)].drop('Status', axis = 1)
### Inversion: the importing reporter becomes Partner and the counterpart becomes Reporter:
df_dots_indicator = df_dots_indicator.rename({'Reporter_ID': 'Partner', 'Partner_ID': 'Reporter'}, axis = 1)
### Data saving:
### NOTE(review): float16 keeps about 3 significant digits and overflows to inf above 65504 - confirm this precision is acceptable:
ser_dots_import_inv = df_dots_indicator.set_index(['Date', 'Reporter', 'Partner'])['Value'].sort_index().astype('float16')
del df_dots_indicator
gc.collect()
### Saving is switched off; the aggregation cells read this key from the HDF5 file:
#ser_dots_import_inv.to_hdf(path_or_buf = str_path_imf_dots_dataset, key = str_key_imf_dots_import_inverted, mode = 'a', format = 'fixed')

In [8]:
### NEED TO BE ADOPTED: EXPORT & IMPORT DATA AGGREGATION: DATASETS LOADING

### Creating a DataFrame that holds two columns: Export and Inverted Import.
### Both series are read back from the HDF5 file rather than taken from the retrieval cells:
gc.collect()
ser_dots_export = pd.read_hdf(path_or_buf = str_path_imf_dots_dataset, key = str_key_imf_dots_export)
ser_dots_import_inv = pd.read_hdf(path_or_buf = str_path_imf_dots_dataset, key = str_key_imf_dots_import_inverted)
### Column-wise concatenation aligned on the series indexes; keys become the column labels:
df_export_aug = pd.concat([ser_dots_export, ser_dots_import_inv], axis = 1, names = 'Source Flow', keys = ['Export', 'Import'])

In [9]:
### IMF DOTS: CIF COEFFICIENTS CALCULATION

### Since CIF Import is loaded, the CIF costs have to be estimated and removed to make it comparable with FOB Export.
gc.collect()
### Bounds to filter bilateral Import to Export ratio before median calculation:
flo_lower_bound = 1.0
flo_upper_bound = 2.0
### Bilateral median calculation procedure:
def get_obs_median(df_comm):
    """
    Return the median Import / Export ratio of one reporter-partner pair, rounded to 2 decimals.

    Only ratios inside [flo_lower_bound, flo_upper_bound] (both ends included) take part in the median;
    the result is NaN when no ratio passes the filter.

    Parameters
    ----------
    df_comm : pandas.DataFrame
        Observations of one pair with 'Import' and 'Export' columns.
    """
    ### Import to Export ratio:
    ser_ratio = df_comm['Import'] / df_comm['Export']
    ### Ratio filtering:
    ser_ratio_valid = ser_ratio[ser_ratio.between(flo_lower_bound, flo_upper_bound)]
    ### Filtered timeseries median as a result:
    return round(ser_ratio_valid.median(), 2)

### Calculation of CIF coefficient for all commodities:
### For each Exporter / Importer pair the following is done:
### Only the dates where the pair's CIF Import / FOB Export ratio is between 1 and 2 are selected.
### The median value of CIF Coefficient = CIF Import / FOB Export is calculated.
ser_cif_median = df_export_aug.groupby(['Reporter', 'Partner']).apply(get_obs_median)
### For the pairs that have no such value, the median of the pair medians is used as CIF Coefficient:
ser_cif_median.fillna(ser_cif_median.median(), inplace = True)
ser_cif_median.name = 'CIF_Coefficient'

In [10]:
### IMF DOTS: IMPORT DATA CORRECTION:

### Adding CIF coefficients to dataset:
### The DataFrame is extended with the CIF Coefficient column (joined on the index;
### presumably matched on the shared 'Reporter' / 'Partner' level names - verify against the merge result):
df_export_cif = df_export_aug.merge(ser_cif_median, left_index = True, right_index = True)
del df_export_aug
gc.collect()
### Restoring the (Date, Reporter, Partner) level order after the merge:
df_export_cif = df_export_cif.reorder_levels(['Date', 'Reporter', 'Partner']).sort_index()
### Import correction:
### Corrected Import is calculated as CIF Import / CIF Coefficient (the coefficient is cast to float16 before division):
df_export_cif['Import_Corrected'] = df_export_cif['Import'] / df_export_cif['CIF_Coefficient'].astype('float16')
### Only 'Export' and 'Import_Corrected' are needed further:
df_export_cif.drop(['Import', 'CIF_Coefficient'], axis = 1, inplace = True)

In [11]:
### IMF DOTS: UNCONDITIONAL COMBINATION

### Combining Export & Import data:
### Zero Export values are replaced with NaN, so they do not prevent taking a non-zero Import
### (np.nan is used instead of the np.NaN alias removed in NumPy 2.0):
df_export_cif.loc[df_export_cif['Export'] == 0.0, 'Export'] = np.nan
### Export is filled with Corrected Import data wherever Export is missing:
ser_export_augmented = df_export_cif['Export'].combine_first(df_export_cif['Import_Corrected'])

In [13]:
### NEED TO BE ADOPTED: IMF DOTS: RESULTS SAVING

### Writing the augmented export series to its own HDF5 file (mode 'w' overwrites an existing file):
ser_export_augmented.to_hdf(path_or_buf = str_path_imf_dots_augmented, key = str_key_imf_dots_augmented, mode = 'w', format = 'fixed')

In [6]:
### IMF CPIS: BILATERAL EQUITY INVESTMENT POSITIONS (MILLIONS OF USD)

In [7]:
### NEED TO BE ADOPTED: IMF CPIS: DATA SAVING PARAMETERS

### HDF5 paths and keys used by the IMF CPIS cells.
### Loaded dataset (raw assets and inverted liabilities share one file):
str_path_imf_cpis_detailed_raw = 'Data_Files/Source_Files/cpis_detailed_raw.h5'
str_key_imf_cpis_assets = 'cpis_detailed_assets'
str_key_imf_cpis_liabilities = 'cpis_detailed_liabilities'
### Filtered dataset (sector-filtered assets and liabilities share one file):
str_path_imf_cpis_filtered = 'Data_Files/Source_Files/cpis_filtered.h5'
str_key_imf_cpis_filtered_asset = 'cpis_filtered_asset'
str_key_imf_cpis_filtered_liability = 'cpis_filtered_liability'
### Resulting dataset (assets augmented with inverted liabilities):
str_path_imf_cpis_augmented = 'Data_Files/Source_Files/cpis_augmented_unconditional.h5'
str_key_imf_cpis_augmented = 'cpis_asset_augmented'

In [8]:
### IMF CPIS: LOADING PARAMETERS PREPARATION

### Constants (request URL parts for the IMF SDMX JSON service and CPIS dataset identifiers):
All = slice(None)
str_imf_base_url = 'http://dataservices.imf.org/REST/SDMX_JSON.svc/'
str_imf_dataset_add = 'CompactData/'
str_imf_cpis_id = 'CPIS'
### Frequency code put into the request key:
str_cpis_freq = 'A'
### Pause after each request, in seconds:
int_seconds_to_sleep = 1
### NOTE(review): not referenced by the CPIS retrieval cells visible here - confirm whether it is still needed:
int_imf_country_limit = 30
### Sector codes requested for both the reporter and the counterpart side:
list_sector_filtered = ['T', 'CB', 'GG', 'HH', 'NP']
### Indicators to download: the third character of the code ('A' / 'L') selects the retrieval cell:
dict_indicator = {'I_A_E_T_T_BP6_USD': 'Assets, Equity, BPM6, US Dollars', 'I_L_E_T_T_BP6_USD': 'Liabilities, Equity, BPM6, US Dollars'}

In [None]:
### IMF CPIS : REPORTED PORTFOLIO INVESTMENT ASSETS DATASET RETRIEVING

### Downloading equity asset positions reported by each universe member and collecting them to ser_cpis_asset
### indexed by (Date, Indicator, Reporter_Sector, Partner_Sector, Reporter, Partner).
gc.collect()
### List of bilateral dataframes for future concatenation:
list_cpis_bilateral = []
### Beginning of request URL:
str_cpis_const_url = str_imf_base_url + str_imf_dataset_add + str_imf_cpis_id + '/'
### Session initializing:
request_session = requests.Session()
### Looping over reporter:
for iter_investor in list_ison_countries:
    ### Looping over indicator (only asset indicators are requested in this cell):
    for iter_indicator in dict_indicator:
        if (iter_indicator[2] == 'A'):
            str_reporter_sector = '+'.join(list_sector_filtered)
            str_partner_sector = '+'.join(list_sector_filtered)
            ### Key structure: frequency.reporter.indicator.reporter sectors.counterpart sectors:
            str_cpis_full_url = str_cpis_const_url + '.'.join([str_cpis_freq, iter_investor, iter_indicator, str_reporter_sector, str_partner_sector])
            obj_cpis_set = request_session.get(str_cpis_full_url)
            ### Data reading as JSON: status keys are renamed to value keys in the raw text,
            ### so status markers land in the value field and are replaced with NaN after aggregation
            ### (NOTE(review): for an observation holding both keys the JSON parser keeps the last one - confirm this is intended):
            dict_cpis_set = json.loads(obj_cpis_set.text.replace('@OBS_STATUS', '@OBS_VALUE'))
            ### Converting each bilateral dataset to dataframe and its munging:
            if ('Series' in dict_cpis_set['CompactData']['DataSet']):
                ### A response with a single series holds a dict instead of a list of dicts:
                list_series = dict_cpis_set['CompactData']['DataSet']['Series']
                if not isinstance(list_series, list):
                    list_series = [list_series]
                for dict_cpis_pair in list_series:
                    ### Series without observations carry no data:
                    if 'Obs' not in dict_cpis_pair:
                        continue
                    ### A single observation is a dict instead of a list of dicts:
                    dict_bilateral = dict_cpis_pair['Obs']
                    if not isinstance(dict_bilateral, list):
                        dict_bilateral = [dict_bilateral]
                    df_cpis_bilateral = pd.DataFrame(dict_bilateral)
                    ### Series with neither values nor statuses are skipped:
                    if '@OBS_VALUE' not in df_cpis_bilateral.columns:
                        continue
                    df_cpis_bilateral = df_cpis_bilateral[['@TIME_PERIOD', '@OBS_VALUE']]
                    df_cpis_bilateral.columns = ['Date', 'Value']
                    df_cpis_bilateral = df_cpis_bilateral.assign(Indicator = dict_cpis_pair['@INDICATOR'],
                                                                 Reporter_Sector = dict_cpis_pair['@REF_SECTOR'],
                                                                 Partner_Sector = dict_cpis_pair['@COUNTERPART_SECTOR'],
                                                                 Reporter_ID = dict_cpis_pair['@REF_AREA'],
                                                                 Partner_ID = dict_cpis_pair['@COUNTERPART_AREA'])
                    list_cpis_bilateral.append(df_cpis_bilateral)
            else:
                print('No data in response of the next request:\n', str_cpis_full_url)
            time.sleep(int_seconds_to_sleep)
    print(iter_investor, ': loading completed')
### Bilateral datasets aggregating:
df_cpis_raw = pd.concat(list_cpis_bilateral, axis = 0, ignore_index = True)
### Period labels are moved to the last business day of the year:
df_cpis_raw['Date'] = pd.to_datetime(df_cpis_raw['Date']) + pd.offsets.BYearEnd()
### Markers 'C' and '-' are not numeric values (np.nan is used instead of the np.NaN alias removed in NumPy 2.0):
df_cpis_raw.loc[df_cpis_raw['Value'].isin(['C', '-']), 'Value'] = np.nan
### Positions of a country in itself are dropped:
df_cpis_raw = df_cpis_raw[df_cpis_raw['Reporter_ID'] != df_cpis_raw['Partner_ID']]
### Only partners with a known ISO short code are kept:
df_cpis_raw = df_cpis_raw[df_cpis_raw['Partner_ID'].isin(df_country_codes['ISO SHORT'].values)]
df_cpis_raw = df_cpis_raw.rename({'Reporter_ID': 'Reporter', 'Partner_ID': 'Partner'}, axis = 1)
df_cpis_raw = df_cpis_raw.astype({'Indicator': 'str', 'Reporter_Sector': 'str', 'Partner_Sector': 'str', 'Reporter': 'str', 'Partner': 'str',
                                  'Value': 'float32'})
### Data saving:
ser_cpis_asset = df_cpis_raw.set_index(['Date', 'Indicator', 'Reporter_Sector', 'Partner_Sector', 'Reporter', 'Partner'])['Value'].sort_index().astype('float32')
del df_cpis_raw
gc.collect()
### Saving is switched off; the filtering cell reads this key from the HDF5 file:
#ser_cpis_asset.to_hdf(path_or_buf = str_path_imf_cpis_detailed_raw, key = str_key_imf_cpis_assets, mode = 'w', format = 'fixed')

In [None]:
### IMF CPIS : REPORTED PORTFOLIO INVESTMENT LIABILITIES DATASET RETRIEVING

### Downloading equity liability positions where each universe member is the counterpart and collecting them
### inverted to ser_cpis_liability_inv indexed by (Date, Indicator, Reporter_Sector, Partner_Sector, Reporter, Partner):
### the universe member becomes Reporter and the reporting country becomes Partner.
gc.collect()
### List of bilateral dataframes for future concatenation:
list_cpis_bilateral = []
### Beginning of request URL:
str_cpis_const_url = str_imf_base_url + str_imf_dataset_add + str_imf_cpis_id + '/'
### Session initializing:
request_session = requests.Session()
### Looping over reporter:
for iter_investor in list_ison_countries:
    ### Looping over indicator (only liability indicators are requested in this cell):
    for iter_indicator in dict_indicator:
        if (iter_indicator[2] == 'L'):
            str_reporter_sector = '+'.join(list_sector_filtered)
            str_partner_sector = '+'.join(list_sector_filtered)
            ### Key structure: frequency.[all reporters].indicator.reporter sectors.counterpart sectors.counterpart:
            str_cpis_full_url = str_cpis_const_url + '.'.join([str_cpis_freq, '', iter_indicator, str_reporter_sector, str_partner_sector, iter_investor])
            obj_cpis_set = request_session.get(str_cpis_full_url)
            ### Data reading as JSON: status keys are renamed to value keys in the raw text,
            ### so status markers land in the value field and are replaced with NaN after aggregation
            ### (NOTE(review): for an observation holding both keys the JSON parser keeps the last one - confirm this is intended):
            dict_cpis_set = json.loads(obj_cpis_set.text.replace('@OBS_STATUS', '@OBS_VALUE'))
            ### Converting each bilateral dataset to dataframe and its munging:
            if ('Series' in dict_cpis_set['CompactData']['DataSet']):
                ### A response with a single series holds a dict instead of a list of dicts:
                list_series = dict_cpis_set['CompactData']['DataSet']['Series']
                if not isinstance(list_series, list):
                    list_series = [list_series]
                for dict_cpis_pair in list_series:
                    ### Series without observations carry no data:
                    if 'Obs' not in dict_cpis_pair:
                        continue
                    ### A single observation is a dict instead of a list of dicts:
                    dict_bilateral = dict_cpis_pair['Obs']
                    if not isinstance(dict_bilateral, list):
                        dict_bilateral = [dict_bilateral]
                    df_cpis_bilateral = pd.DataFrame(dict_bilateral)
                    ### Series with neither values nor statuses are skipped:
                    if '@OBS_VALUE' not in df_cpis_bilateral.columns:
                        continue
                    df_cpis_bilateral = df_cpis_bilateral[['@TIME_PERIOD', '@OBS_VALUE']]
                    df_cpis_bilateral.columns = ['Date', 'Value']
                    ### Sector columns get temporary names, because they are swapped by the renaming below:
                    df_cpis_bilateral = df_cpis_bilateral.assign(Indicator = dict_cpis_pair['@INDICATOR'],
                                                                 Reporter_S = dict_cpis_pair['@REF_SECTOR'],
                                                                 Partner_S = dict_cpis_pair['@COUNTERPART_SECTOR'],
                                                                 Reporter_ID = dict_cpis_pair['@REF_AREA'],
                                                                 Partner_ID = dict_cpis_pair['@COUNTERPART_AREA'])
                    list_cpis_bilateral.append(df_cpis_bilateral)
            else:
                print('No data in response of the next request:\n', str_cpis_full_url)
            time.sleep(int_seconds_to_sleep)
    print(iter_investor, ': loading completed')
### Bilateral datasets aggregating:
df_cpis_raw = pd.concat(list_cpis_bilateral, axis = 0, ignore_index = True)
### Period labels are moved to the last business day of the year:
df_cpis_raw['Date'] = pd.to_datetime(df_cpis_raw['Date']) + pd.offsets.BYearEnd()
### Markers 'C' and '-' are not numeric values (np.nan is used instead of the np.NaN alias removed in NumPy 2.0):
df_cpis_raw.loc[df_cpis_raw['Value'].isin(['C', '-']), 'Value'] = np.nan
### Positions of a country in itself are dropped:
df_cpis_raw = df_cpis_raw[df_cpis_raw['Reporter_ID'] != df_cpis_raw['Partner_ID']]
### Only reporting countries with a known ISO short code are kept:
df_cpis_raw = df_cpis_raw[df_cpis_raw['Reporter_ID'].isin(df_country_codes['ISO SHORT'].values)]
### Inversion: reporter and counterpart (countries and sectors) change places:
df_cpis_raw = df_cpis_raw.rename({'Reporter_ID': 'Partner', 'Partner_ID': 'Reporter', 'Reporter_S': 'Partner_Sector', 'Partner_S': 'Reporter_Sector'}, axis = 1)
df_cpis_raw = df_cpis_raw.astype({'Indicator': 'str', 'Reporter_Sector': 'str', 'Partner_Sector': 'str', 'Reporter': 'str', 'Partner': 'str',
                                  'Value': 'float32'})
### Data saving:
ser_cpis_liability_inv = df_cpis_raw.set_index(['Date', 'Indicator', 'Reporter_Sector', 'Partner_Sector', 'Reporter', 'Partner'])['Value'].sort_index()\
                                    .astype('float32')
del df_cpis_raw
gc.collect()
### Saving is switched off; the filtering cell reads this key from the HDF5 file:
#ser_cpis_liability_inv.to_hdf(path_or_buf = str_path_imf_cpis_detailed_raw, key = str_key_imf_cpis_liabilities, mode = 'a', format = 'fixed')

In [17]:
### IMF CPIS: RAW ASSET DATA FILTERING

### Removing central bank ('CB') and general government ('GG') holdings from the total ('T') asset positions
### on both the partner and the reporter sector side.
gc.collect()
ser_cpis_asset = pd.read_hdf(path_or_buf = str_path_imf_cpis_detailed_raw, key = str_key_imf_cpis_assets)

### Sector codes taking part in the calculation and the admissible partner countries:
list_valid_reporter_sectors = ['T', 'CB', 'GG']
list_valid_partner_sectors = ['T', 'CB', 'GG']
list_valid_partners = df_country_codes['ISO SHORT'].values

### Selection by indicator, sectors and partners; levels [0, 1, 4, 5, 2, 3] move both sector levels to the end
### (presumably giving Date, Indicator, Reporter, Partner, Reporter_Sector, Partner_Sector - verify against the stored level order):
ser_asset_to_filter = ser_cpis_asset.loc[:, 'I_A_E_T_T_BP6_USD', list_valid_reporter_sectors, list_valid_partner_sectors, :, list_valid_partners]\
                                    .reorder_levels([0, 1, 4, 5, 2, 3]).sort_index().astype('float32')
### Partner side: T - (CB + GG), missing sector values are treated as zero:
df_partner_sector_unstacked = ser_asset_to_filter.unstack('Partner_Sector').fillna(0.0)
ser_partner_sector_filtered = df_partner_sector_unstacked['T'] - (df_partner_sector_unstacked['CB'] + df_partner_sector_unstacked['GG'])
### Reporter side: the same subtraction applied to the partner-filtered values:
df_reporter_sector_unstacked = ser_partner_sector_filtered.unstack('Reporter_Sector').fillna(0.0)
ser_reporter_sector_filtered = df_reporter_sector_unstacked['T'] - (df_reporter_sector_unstacked['CB'] + df_reporter_sector_unstacked['GG'])
### The single-valued 'Indicator' level is not needed in the result:
ser_asset_filtered = ser_reporter_sector_filtered.droplevel('Indicator').sort_index()
### Releasing memory taken by the intermediates:
del ser_cpis_asset
del df_partner_sector_unstacked
del df_reporter_sector_unstacked
del ser_reporter_sector_filtered
gc.collect()

55

In [18]:
### NEED TO BE ADOPTED: IMF CPIS: FILTERED ASSET DATASET SAVING

### Zero positions are stored as missing values (np.nan is used instead of the np.NaN alias removed in NumPy 2.0);
### mode 'w' overwrites an existing file:
ser_asset_filtered.replace({0.0: np.nan}).to_hdf(path_or_buf = str_path_imf_cpis_filtered, key = str_key_imf_cpis_filtered_asset, mode = 'w', format = 'fixed')

In [19]:
### IMF CPIS: RAW LIABILITY DATA FILTERING

### Selecting inverted liability positions by indicator, sectors and partners and dropping the sector and indicator levels.
gc.collect()
ser_cpis_liability_inv = pd.read_hdf(path_or_buf = str_path_imf_cpis_detailed_raw, key = str_key_imf_cpis_liabilities)

### Sector codes to select and the admissible partner countries:
list_valid_reporter_sectors = ['T', 'CB', 'GG']
list_valid_partner_sectors = ['T', 'CB', 'GG']
list_valid_partners = df_country_codes['ISO SHORT'].values
### Levels [0, 1, 4, 5, 2, 3] move both sector levels to the end of the index:
ser_liability_to_filter = ser_cpis_liability_inv.loc[:, 'I_L_E_T_T_BP6_USD', list_valid_reporter_sectors, list_valid_partner_sectors, :, list_valid_partners]\
                                                .reorder_levels([0, 1, 4, 5, 2, 3]).sort_index().astype('float32')
### NOTE(review): the sector levels are dropped without any aggregation, unlike the asset filtering cell; if the raw data
### holds several sector combinations for one (Date, Reporter, Partner) key, the result keeps duplicated index entries - confirm:
ser_liability_filtered = ser_liability_to_filter.fillna(0.0).droplevel(['Reporter_Sector', 'Partner_Sector']).droplevel('Indicator').sort_index()
del ser_liability_to_filter
gc.collect()

55

In [20]:
### NEED TO BE ADOPTED: IMF CPIS: FILTERED LIABILITY DATASET SAVING

### Zero positions are stored as missing values (np.nan is used instead of the np.NaN alias removed in NumPy 2.0);
### mode 'a' adds the key to the file that already holds the filtered assets:
ser_liability_filtered.replace({0.0: np.nan}).to_hdf(path_or_buf = str_path_imf_cpis_filtered, key = str_key_imf_cpis_filtered_liability, mode = 'a', format = 'fixed')

In [21]:
### IMF CPIS: FILTERED DATA AGGREGATION: DATASETS LOADING

### Reading both filtered series back from the HDF5 file and putting them side by side.
gc.collect()

ser_cpis_asset_filtered = pd.read_hdf(path_or_buf = str_path_imf_cpis_filtered, key = str_key_imf_cpis_filtered_asset)
ser_cpis_asset_filtered.name = 'Asset'
ser_cpis_liability_filtered = pd.read_hdf(path_or_buf = str_path_imf_cpis_filtered, key = str_key_imf_cpis_filtered_liability)
ser_cpis_liability_filtered.name = 'Liability_Inverted'
### Series names become the column labels; values are cast to float32 and rounded to 2 decimals:
df_cpis_total = pd.concat([ser_cpis_asset_filtered, ser_cpis_liability_filtered], axis = 1, names = 'Data Source').astype('float32').round(2)

In [22]:
### IMF CPIS: UNCONDITIONAL COMBINATION

### Combining Asset & Inverted Liability data:
### Zero Asset values are replaced with NaN, so they do not prevent taking a non-zero Inverted Liability
### (np.nan is used instead of the np.NaN alias removed in NumPy 2.0):
df_cpis_total.loc[df_cpis_total['Asset'] == 0.0, 'Asset'] = np.nan
### Asset is filled with Inverted Liability data wherever Asset is missing:
ser_cpis_augmented = df_cpis_total['Asset'].combine_first(df_cpis_total['Liability_Inverted'])

In [23]:
### NEED TO BE ADOPTED: IMF CPIS: RESULTS SAVING

### Writing the augmented asset series to its own HDF5 file (mode 'w' overwrites an existing file):
ser_cpis_augmented.to_hdf(path_or_buf = str_path_imf_cpis_augmented, key = str_key_imf_cpis_augmented, mode = 'w', format = 'fixed')

In [24]:
### IMF CPIS: RESULTS CHECK

### Comparing the freshly saved result with the 'Unconditional' column of a previously stored dataset.
ser_cpis_test_new = pd.read_hdf(path_or_buf = str_path_imf_cpis_augmented, key = str_key_imf_cpis_augmented)
ser_cpis_test_new.name = 'New'
ser_cpis_test_old = pd.read_hdf(path_or_buf = 'Data_Files/Source_Files/cpis_equity_options.h5')['Unconditional']
ser_cpis_test_old.name = 'Old'
### Descriptive statistics side by side:
display(pd.concat([ser_cpis_test_new.describe(), ser_cpis_test_old.describe()], axis = 1))
### Series lengths (missing values included) side by side:
print(str(len(ser_cpis_test_new)), '~', str(len(ser_cpis_test_old)))

Unnamed: 0,New,Old
count,72914.0,72914.0
mean,4746.688,4746.688
std,36844.04,36844.04
min,-11520.74,-11520.74
25%,1.62,1.62
50%,22.85,22.85
75%,383.7775,383.7775
max,2023154.0,2023154.0


194927 ~ 194927


In [23]:
### IMF CDIS: BILATERAL DIRECT INVESTMENT POSITIONS (MILLIONS OF USD)

In [24]:
### NEED TO BE ADOPTED: IMF CDIS: DATA SAVING PARAMETERS

### HDF5 paths and keys used by the IMF CDIS cells.
### Filtered dataset (outward and inward debt keys share one file):
str_path_imf_cdis_dataset = 'Data_Files/Source_Files/cdis_assets.h5'
str_key_do_debt_imf_cdis_dataset = 'cdis_debt_outward_assets'
str_key_di_debt_imf_cdis_dataset = 'cdis_debt_inward_assets'
### Resulting dataset:
str_path_imf_cdis_augmented = 'Data_Files/Source_Files/cdis_augmented_unconditional.h5'
str_key_imf_cdis_augmented = 'cdis_asset_augmented'

In [25]:
### IMF CDIS: LOADING PARAMETERS PREPARATION

### Constants (request URL parts for the IMF SDMX JSON service and CDIS dataset identifiers):
All = slice(None)
str_imf_base_url = 'http://dataservices.imf.org/REST/SDMX_JSON.svc/'
str_imf_dataset_add = 'CompactData/'
str_imf_cdis_id = 'CDIS'
### Frequency code put into the request key:
str_cdis_freq = 'A'
### Pause after each request, in seconds:
int_seconds_to_sleep = 3
### NOTE(review): not referenced by the CDIS retrieval cell visible here - confirm whether it is still needed:
int_imf_country_limit = 30
### Indicators to download with their descriptions; the retrieval cell parses the descriptions:
### the first word gives the direction, the second word gives the type, and 'Asset' / 'Liabilit' / '(Net)' give the account.
dict_to_download = {'IOWDA_BP6_USD': 'Outward Debt Instruments Assets Positions (Gross), US Dollars',
                    'IOWDL_BP6_USD': 'Outward Debt Instruments Liabilities Positions (Gross), US Dollars',
                    'IOWD_BP6_USD': 'Outward Debt Instruments Positions (Net), US Dollars',
                    'IIWDA_BP6_USD': 'Inward Debt Instruments Assets Positions (Gross), US Dollars',
                    'IIWDL_BP6_USD': 'Inward Debt Instruments Liabilities Positions (Gross), US Dollars',
                    'IIWD_BP6_USD': 'Inward Debt Instruments Positions (Net), US Dollars'}

In [27]:
### IMF CDIS: REPORTED DIRECT INVESTMENT NET VOLUMES RETRIEVING

### Downloading direct investment debt positions for each universe member and collecting them to ser_cdis_raw
### indexed by (Type, Direction, Account, Date, Reporter, Partner).
gc.collect()
### List of bilateral dataframes for future concatenation:
list_cdis_bilateral = []
### Beginning of request URL:
str_cdis_const_url = str_imf_base_url + str_imf_dataset_add + str_imf_cdis_id + '/'
### Session initializing:
request_session = requests.Session()
### Looping over reporter:
for iter_investor in list_ison_countries:
    ### Looping over indicator:
    for iter_indicator in dict_to_download:
        ### Outward indicators are requested with the investor as reporter, the other ones with the investor as counterpart:
        if (iter_indicator[1] == 'O'):
            str_cdis_full_url = str_cdis_const_url + '.'.join([str_cdis_freq, iter_investor, iter_indicator, ''])
        else:
            str_cdis_full_url = str_cdis_const_url + '.'.join([str_cdis_freq, '', iter_indicator, iter_investor])
        obj_cdis_set = request_session.get(str_cdis_full_url)
        ### Data reading as JSON: status keys are renamed to value keys in the raw text,
        ### so status markers land in the value field and are replaced with NaN after aggregation:
        dict_cdis_set = json.loads(obj_cdis_set.text.replace('@OBS_STATUS', '@OBS_VALUE'))
        ### Converting each bilateral dataset to dataframe and its munging:
        if ('Series' in dict_cdis_set['CompactData']['DataSet']):
            ### A response with a single series holds a dict instead of a list of dicts:
            list_series = dict_cdis_set['CompactData']['DataSet']['Series']
            if not isinstance(list_series, list):
                list_series = [list_series]
            for dict_cdis_pair in list_series:
                ### Series without observations carry no data:
                if 'Obs' not in dict_cdis_pair:
                    continue
                ### A single observation is a dict instead of a list of dicts:
                dict_bilateral = dict_cdis_pair['Obs']
                if not isinstance(dict_bilateral, list):
                    dict_bilateral = [dict_bilateral]
                df_cdis_bilateral = pd.DataFrame(dict_bilateral)
                if '@OBS_VALUE' in df_cdis_bilateral.columns:
                    df_cdis_bilateral = df_cdis_bilateral[['@TIME_PERIOD', '@OBS_VALUE']]
                    df_cdis_bilateral.columns = ['Date', 'Value']
                    df_cdis_bilateral = df_cdis_bilateral.assign(Indicator = dict_cdis_pair['@INDICATOR'],
                                                                 Reporter_ID = dict_cdis_pair['@REF_AREA'],
                                                                 Partner_ID = dict_cdis_pair['@COUNTERPART_AREA'])
                    list_cdis_bilateral.append(df_cdis_bilateral)
        else:
            print('No data in response of the next request:\n', str_cdis_full_url)
        time.sleep(int_seconds_to_sleep)
    print(iter_investor, ': loading completed')
### Bilateral datasets aggregating:
df_cdis_raw = pd.concat(list_cdis_bilateral, axis = 0, ignore_index = True, sort = False)
### Period labels are moved to the last business day of the year:
df_cdis_raw['Date'] = pd.to_datetime(df_cdis_raw['Date']) + pd.offsets.BYearEnd()
### Markers 'C' and '-' are not numeric values (np.nan is used instead of the np.NaN alias removed in NumPy 2.0):
df_cdis_raw.loc[df_cdis_raw['Value'].isin(['C', '-']), 'Value'] = np.nan
### Positions of a country in itself are dropped:
df_cdis_raw = df_cdis_raw[df_cdis_raw['Reporter_ID'] != df_cdis_raw['Partner_ID']]
### Only partners with a known ISO short code are kept:
df_cdis_raw = df_cdis_raw[df_cdis_raw['Partner_ID'].isin(df_country_codes['ISO SHORT'].values)]
df_cdis_raw = df_cdis_raw.rename({'Reporter_ID': 'Reporter', 'Partner_ID': 'Partner'}, axis = 1)
df_cdis_raw = df_cdis_raw.astype({'Indicator': 'str', 'Reporter': 'str', 'Partner': 'str',
                                  'Value': 'float32'})
### Indicator codes are replaced with their descriptions; the result is assigned back,
### because an in-place replace on a selected column does not reach the dataframe under copy-on-write:
df_cdis_raw['Indicator'] = df_cdis_raw['Indicator'].replace(dict_to_download)
### Direction is the first word of the description, Type is the second one:
df_cdis_raw['Direction'] = df_cdis_raw['Indicator'].str.partition(' ')[0]
df_cdis_raw['Type'] = df_cdis_raw['Indicator'].str.partition(' ')[2].str.partition(' ')[0]
### Account is detected by literal substrings (regex = False, so the brackets of '(Net)' are not treated as a regex group);
### the column is created with object dtype, since it receives strings:
df_cdis_raw['Account'] = pd.Series(np.nan, index = df_cdis_raw.index, dtype = 'object')
df_cdis_raw.loc[df_cdis_raw['Indicator'].str.contains('(Net)', regex = False), 'Account'] = 'Net'
df_cdis_raw.loc[df_cdis_raw['Indicator'].str.contains('Asset', regex = False), 'Account'] = 'Asset'
df_cdis_raw.loc[df_cdis_raw['Indicator'].str.contains('Liabilit', regex = False), 'Account'] = 'Liability'
ser_cdis_raw = df_cdis_raw.set_index(['Type', 'Direction', 'Account', 'Date', 'Reporter', 'Partner'])['Value'].sort_index()
ser_cdis_raw.name = 'CDIS Positions'

del df_cdis_raw
gc.collect()

No data in response of the next request:
 http://dataservices.imf.org/REST/SDMX_JSON.svc/CompactData/CDIS/A.AU.IOWDA_BP6_USD.
No data in response of the next request:
 http://dataservices.imf.org/REST/SDMX_JSON.svc/CompactData/CDIS/A.AU.IOWDL_BP6_USD.
AU : loading completed
US : loading completed


  return func(self, *args, **kwargs)


75

In [28]:
### IMF CDIS: REPLACING EMPTY POSITION VALUES WITH NET

### One column per account ('Asset', 'Liability', 'Net'):
df_cdis_pos_acc = ser_cdis_raw.unstack('Account')
### Direction flags depend on the index only, so they are calculated once:
arr_cdis_outward = (df_cdis_pos_acc.index.get_level_values('Direction') == 'Outward')
arr_cdis_inward = (df_cdis_pos_acc.index.get_level_values('Direction') == 'Inward')

### Step 1: only Net is known -> it is booked to Asset or Liability depending on the direction and the sign of Net:
for arr_direction, bool_negative, str_target in [(arr_cdis_outward, True, 'Liability'), (arr_cdis_outward, False, 'Asset'),
                                                 (arr_cdis_inward, True, 'Asset'), (arr_cdis_inward, False, 'Liability')]:
    if bool_negative:
        ser_sign = (df_cdis_pos_acc['Net'] < 0.0)
    else:
        ser_sign = (df_cdis_pos_acc['Net'] >= 0.0)
    idx_fill = arr_direction & ser_sign & (df_cdis_pos_acc['Asset'].isna() & df_cdis_pos_acc['Liability'].isna())
    arr_net = df_cdis_pos_acc.loc[idx_fill, 'Net'].values
    df_cdis_pos_acc.loc[idx_fill, str_target] = -arr_net if bool_negative else arr_net

### Step 2: Net is empty -> Asset minus Liability for Outward, Liability minus Asset for Inward:
for arr_direction, str_minuend, str_subtrahend in [(arr_cdis_outward, 'Asset', 'Liability'), (arr_cdis_inward, 'Liability', 'Asset')]:
    idx_fill = arr_direction & df_cdis_pos_acc['Net'].isna()
    df_cdis_pos_acc.loc[idx_fill, 'Net'] = (df_cdis_pos_acc.loc[idx_fill, str_minuend] - 
                                            df_cdis_pos_acc.loc[idx_fill, str_subtrahend]).values

### Step 3: Net is still empty -> the single available account is used (with the sign corresponding to the direction):
for arr_direction, str_source, bool_negative in [(arr_cdis_outward, 'Asset', False), (arr_cdis_outward, 'Liability', True),
                                                 (arr_cdis_inward, 'Asset', True), (arr_cdis_inward, 'Liability', False)]:
    idx_fill = arr_direction & df_cdis_pos_acc['Net'].isna() & df_cdis_pos_acc[str_source].notna()
    arr_source = df_cdis_pos_acc.loc[idx_fill, str_source].values
    df_cdis_pos_acc.loc[idx_fill, 'Net'] = -arr_source if bool_negative else arr_source

### Step 4: Net and exactly one account are known -> the other account is restored from them:
for arr_direction, str_known, str_missing, bool_known_minus_net in [(arr_cdis_outward, 'Asset', 'Liability', True), 
                                                                    (arr_cdis_outward, 'Liability', 'Asset', False),
                                                                    (arr_cdis_inward, 'Asset', 'Liability', False), 
                                                                    (arr_cdis_inward, 'Liability', 'Asset', True)]:
    idx_fill = arr_direction & df_cdis_pos_acc['Net'].notna() & \
               df_cdis_pos_acc[str_known].notna() & df_cdis_pos_acc[str_missing].isna()
    ser_known = df_cdis_pos_acc.loc[idx_fill, str_known]
    ser_net = df_cdis_pos_acc.loc[idx_fill, 'Net']
    if bool_known_minus_net:
        df_cdis_pos_acc.loc[idx_fill, str_missing] = (ser_known - ser_net).values
    else:
        df_cdis_pos_acc.loc[idx_fill, str_missing] = (ser_net + ser_known).values

### Back to the long form with the levels order Type / Direction / Account / Date / Reporter / Partner:
ser_cdis_full = df_cdis_pos_acc.stack('Account', dropna = False)
ser_cdis_full = ser_cdis_full.reorder_levels([0, 1, 5, 2, 3, 4]).sort_index()

del df_cdis_pos_acc
gc.collect()

0

In [29]:
### IMF CDIS: FDI POSITION DATASETS SAVING

### All known short ISO codes (used as the unrestricted side of each pair):
arr_cdis_all_countries = df_country_codes['ISO SHORT'].values
### Outward debt assets: ISON countries as reporters:
ser_debt_do = ser_cdis_full.loc['Debt', 'Outward', 'Asset', :, list_ison_countries, arr_cdis_all_countries]
ser_debt_do = ser_debt_do.droplevel(['Type', 'Direction', 'Account']).sort_index()
#ser_debt_do.to_hdf(path_or_buf = str_path_imf_cdis_dataset, key = str_key_do_debt_imf_cdis_dataset, mode = 'w', format = 'fixed')
### Inward debt liabilities: ISON countries as partners, then both sides are swapped to mirror the outward view:
ser_debt_di = ser_cdis_full.loc['Debt', 'Inward', 'Liability', :, arr_cdis_all_countries, list_ison_countries]
ser_debt_di = ser_debt_di.droplevel(['Type', 'Direction', 'Account']).rename_axis(['Date', 'Partner', 'Reporter'])
ser_debt_di = ser_debt_di.reorder_levels(['Date', 'Reporter', 'Partner']).sort_index()
#ser_debt_di.to_hdf(path_or_buf = str_path_imf_cdis_dataset, key = str_key_di_debt_imf_cdis_dataset, mode = 'a', format = 'fixed')

In [14]:
### IMF CDIS: DEBT DATA AGGREGATION: DATASETS LOADING

gc.collect()
### Outward assets: negative positions are floored to zero:
ser_cdis_asset = pd.read_hdf(path_or_buf = str_path_imf_cdis_dataset, key = str_key_do_debt_imf_cdis_dataset).rename('Asset')
ser_cdis_asset.loc[ser_cdis_asset < 0.0] = 0.0
### Mirrored inward liabilities: negative positions are floored to zero:
ser_cdis_liability_inv = pd.read_hdf(path_or_buf = str_path_imf_cdis_dataset, key = str_key_di_debt_imf_cdis_dataset)\
                           .rename('Liability_Inverted')
ser_cdis_liability_inv.loc[ser_cdis_liability_inv < 0.0] = 0.0
### Both sources side by side:
df_cdis_debt = pd.concat([ser_cdis_asset, ser_cdis_liability_inv], axis = 1, names = 'Data Source')
df_cdis_debt = df_cdis_debt.astype('float32').round(2)

In [36]:
### IMF CDIS: UNCONDITIONAL COMBINATION

### Combining Asset & inverted Liability data: zero assets are treated as missing, so the mirrored liability 
### is taken wherever the asset value is absent; zeros remaining in the result are treated as missing as well.
### np.nan is used instead of the np.NaN alias, which is not available in NumPy 2.x:
df_cdis_debt.loc[df_cdis_debt['Asset'] == 0.0, 'Asset'] = np.nan
ser_cdis_augmented = df_cdis_debt['Asset'].combine_first(df_cdis_debt['Liability_Inverted']).replace({0.0: np.nan})

In [37]:
### NEED TO BE ADOPTED: IMF CDIS: RESULTS SAVING

### The combined series is written to the HDF5 file in 'fixed' format; mode 'w' creates the file anew, 
### so any keys stored earlier in the same file are not kept:
ser_cdis_augmented.to_hdf(path_or_buf = str_path_imf_cdis_augmented, key = str_key_imf_cdis_augmented, mode = 'w', format = 'fixed')

In [38]:
### IMF CDIS: RESULTS CHECK

### Freshly saved result:
ser_cdis_test_new = pd.read_hdf(path_or_buf = str_path_imf_cdis_augmented, key = str_key_imf_cdis_augmented)
ser_cdis_test_new.name = 'New'
### Previously produced reference result; zeros are treated as missing to be comparable with the new one
### (np.nan is used instead of the np.NaN alias, which is not available in NumPy 2.x):
ser_cdis_test_old = pd.read_hdf(path_or_buf = 'Data_Files/Source_Files/cdis_debt_assets.h5')['Unconditional'].replace({0: np.nan})
ser_cdis_test_old.name = 'Old'
### Descriptive statistics side by side and total lengths (missing values included) comparison:
display(pd.concat([ser_cdis_test_new.describe(), ser_cdis_test_old.describe()], axis = 1))
print(str(len(ser_cdis_test_new)), '~', str(len(ser_cdis_test_old)))

Unnamed: 0,New,Old
count,62443.0,62443.0
mean,1094.594482,1094.59436
std,7045.642578,7045.642578
min,0.01,0.01
25%,2.26,2.26
50%,23.030001,23.030001
75%,201.679993,201.679993
max,223293.515625,223293.515625


178789 ~ 181876


In [6]:
### OECD FDI: FOREIGN DIRECT INVESTMENT (MILLIONS OF USD)

In [7]:
### NEED TO BE ADOPTED: OECD FDI: DATA SAVING PARAMETERS

### Filtered dataset: HDF5 file path and the keys of the outward / inward total FDI series kept in it:
str_path_oecd_fdi_dataset = 'Data_Files/Source_Files/oecd_assets.h5'
str_key_do_total_oecd_fdi_dataset = 'fdi_total_outward_assets'
str_key_di_total_oecd_fdi_dataset = 'fdi_total_inward_assets'
### Resulting dataset: HDF5 file path and the key of the combined (augmented) series:
str_path_oecd_fdi_augmented = 'Data_Files/Source_Files/oecd_augmented_unconditional.h5'
str_key_oecd_fdi_augmented = 'oecd_asset_augmented'

In [8]:
### OECD FDI: LOADING PARAMETERS PREPARATION

### Constants:
### MultiIndex level slice constant:
All = slice(None)
### Requested period borders (only the year part is used by the request built in the next cell):
date_start = pd.Timestamp('1989-12-29')
date_end = pd.Timestamp('2022-10-31')

### OECD API endpoints and the dataset identifier that is appended to the base URL:
str_oecd_base_url = 'https://stats.oecd.org/sdmx-json/data/'
str_oecd_structure_url = 'https://stats.oecd.org/restsdmx/sdmx.ashx/GetDataStructure/'
str_fdi_pos_dataset_add = 'FDI_POS_CTRY'
### Each parameter below is one dimension filter of the request; several values are joined with '+'.
### Currency:
str_measure = 'USD'
### Direction ('DI' is renamed to 'Inward' and 'DO' to 'Outward' at the reindexation step):
str_direction = '+'.join(['DI', 'DO']) # alternative: 'DO'
### Investment type ('LE_FA_F' is renamed to 'Total' and 'LE_FA_F5' to 'Equity' at the reindexation step):
str_fdi_type = '+'.join(['LE_FA_F']) # alternative: 'LE_FA_F5'
### Residence defining:
str_residence = 'ALL'
### Accounting way ('A' is renamed to 'Asset', 'L' to 'Liability' and 'NET' to 'Net' at the reindexation step):
str_accounting =  '+'.join(['A', 'NET', 'L']) # alternatives: '+'.join(['A', 'L']) or 'NET'
### Counterpart level -- NOTE(review): the meaning of 'IMC' is not established in this notebook, confirm against the OECD data structure definition:
str_counterpart = 'IMC'

In [9]:
### OECD FDI: FDI POSITION REQUEST CONSTRUCTING

### Session initializing:
request_session = requests.Session()
### Request key: dimension filters joined with dots; the first and the last positions are left empty
### (presumably the reporter and partner country dimensions, which are not filtered -- verify against the data structure):
str_fdi_pos_request_params = '.'.join(['', str_measure, str_direction, str_fdi_type, str_residence, str_accounting, str_counterpart, ''])
### Full request URL restricted to the years between the period borders:
str_fdi_pos_request = str_oecd_base_url + str_fdi_pos_dataset_add + '/' + str_fdi_pos_request_params + '/all?startTime=' + str(date_start.year) + \
                      '&endTime=' + str(date_end.year) + '&detail=DataOnly'
### Timeout is (connect, read) in seconds, so a dead connection can not hang the kernel forever:
obj_fdi_pos_response = request_session.get(str_fdi_pos_request, timeout = (10, 600))
### HTTP errors are raised explicitly instead of turning into an obscure JSON decoding failure:
obj_fdi_pos_response.raise_for_status()
obj_fdi_pos_dataset = obj_fdi_pos_response.json()

In [10]:
### OECD FDI: FDI POSITION INDEX DATA COLLECTING:

### Dates: period identifiers of the observation dimension, rolled forward to the business year end:
list_idx_dates = [pd.to_datetime(dict_date['id']) + pd.offsets.BYearEnd()
                  for dict_date in obj_fdi_pos_dataset['structure']['dimensions']['observation'][0]['values']]
### Parameters: one list of value identifiers per series dimension:
list_idx_library = [[dict_value['id'] for dict_value in dict_dimension['values']]
                    for dict_dimension in obj_fdi_pos_dataset['structure']['dimensions']['series']]
### Result: dates are placed after the series dimensions:
list_idx_library.append(list_idx_dates)
### Converting to dictionaries (position as string -> value) for future replacing:
list_idx_dict = [{str(int_position): iter_value for int_position, iter_value in enumerate(list_values)}
                 for list_values in list_idx_library]

In [11]:
### OECD FDI: FDI POSITION DATASET RESAMPLING

dict_datasets_source = obj_fdi_pos_dataset['dataSets'][0]['series']
### Parameters and date indexes integration: the series key and the observation key are joined with a colon,
### the first element of each observation is taken as its value:
dict_datasets_res = {str_series_key + ':' + str_observation_key: list_observation[0]
                     for str_series_key, dict_series in dict_datasets_source.items()
                     for str_observation_key, list_observation in dict_series['observations'].items()}

In [12]:
### OECD FDI: FDI POSITION DATASET REINDEXATION

gc.collect()
### Colon-separated keys are split to MultiIndex levels and then moved to columns 'level_0', 'level_1', ...:
df_fdi_pos_data = pd.Series(dict_datasets_res)
df_fdi_pos_data.index = pd.MultiIndex.from_arrays(zip(*df_fdi_pos_data.index.str.split(':')))
int_levels_number = df_fdi_pos_data.index.nlevels
df_fdi_pos_data = df_fdi_pos_data.reset_index()
### Additional renaming rules by level number (applied after positions are replaced with parameter values):
dict_iso_long_to_short = dict(zip(df_country_codes['ISO LONG'].values, df_country_codes['ISO SHORT'].values))
dict_level_renaming = {
    ### Replacing long ISO names with short ISO names (both country levels):
    0: dict_iso_long_to_short,
    7: dict_iso_long_to_short,
    ### Directions renaming:
    2: {'DI': 'Inward', 'DO': 'Outward'},
    ### Investment types renaming:
    3: {'LE_FA_F': 'Total', 'LE_FA_F5': 'Equity'},
    ### Flow types renaming:
    5: {'NET': 'Net', 'A': 'Asset', 'L': 'Liability'}
}
### Replacing numbers with parameter values.
### Results are assigned back to the column: a chained inplace replace does not update the frame under Copy-on-Write:
for iter_level in range(int_levels_number):
    str_level_column = 'level_' + str(iter_level)
    df_fdi_pos_data[str_level_column] = df_fdi_pos_data[str_level_column].replace(list_idx_dict[iter_level])
    if iter_level in dict_level_renaming:
        df_fdi_pos_data[str_level_column] = df_fdi_pos_data[str_level_column].replace(dict_level_renaming[iter_level])

### Dropping observations whose reporter or partner is not in the country codes table (e.g. aggregates):
arr_iso_short = df_country_codes['ISO SHORT'].values
df_fdi_pos_data = df_fdi_pos_data.loc[df_fdi_pos_data['level_0'].isin(arr_iso_short) & df_fdi_pos_data['level_7'].isin(arr_iso_short)]
### Indexes defining (only the column axis is squeezed, so a one-row frame still gives a Series):
ser_fdi_pos_data = df_fdi_pos_data.drop(['level_1', 'level_4', 'level_6'], axis = 1)\
                    .set_index(['level_3', 'level_2', 'level_5', 'level_8', 'level_0', 'level_7']).squeeze(axis = 1)
ser_fdi_pos_data.index.names = ['Type', 'Direction', 'Account', 'Date', 'Reporter', 'Partner']
ser_fdi_pos_data = ser_fdi_pos_data.sort_index()
### Self-reported pairs dropping:
ser_fdi_pos_data = ser_fdi_pos_data[ser_fdi_pos_data.index.get_level_values('Reporter') != ser_fdi_pos_data.index.get_level_values('Partner')]
#ser_fdi_pos_data[ser_fdi_pos_data < 0.0] = 0.0
ser_fdi_pos_data.name = 'FDI Positions'

In [13]:
### OECD FDI: REPLACING EMPTY POSITION VALUES WITH NET

### One column per account ('Asset', 'Liability', 'Net'):
df_fdi_pos_acc = ser_fdi_pos_data.unstack('Account')
### Direction flags depend on the index only, so they are calculated once:
arr_fdi_outward = (df_fdi_pos_acc.index.get_level_values('Direction') == 'Outward')
arr_fdi_inward = (df_fdi_pos_acc.index.get_level_values('Direction') == 'Inward')

### Step 1: only Net is known -> it is booked to Asset or Liability depending on the direction and the sign of Net:
for arr_direction, bool_negative, str_target in [(arr_fdi_outward, True, 'Liability'), (arr_fdi_outward, False, 'Asset'),
                                                 (arr_fdi_inward, True, 'Asset'), (arr_fdi_inward, False, 'Liability')]:
    if bool_negative:
        ser_sign = (df_fdi_pos_acc['Net'] < 0.0)
    else:
        ser_sign = (df_fdi_pos_acc['Net'] >= 0.0)
    idx_fill = arr_direction & ser_sign & (df_fdi_pos_acc['Asset'].isna() & df_fdi_pos_acc['Liability'].isna())
    arr_net = df_fdi_pos_acc.loc[idx_fill, 'Net'].values
    df_fdi_pos_acc.loc[idx_fill, str_target] = -arr_net if bool_negative else arr_net

### Step 2: Net is empty -> Asset minus Liability for Outward, Liability minus Asset for Inward:
for arr_direction, str_minuend, str_subtrahend in [(arr_fdi_outward, 'Asset', 'Liability'), (arr_fdi_inward, 'Liability', 'Asset')]:
    idx_fill = arr_direction & df_fdi_pos_acc['Net'].isna()
    df_fdi_pos_acc.loc[idx_fill, 'Net'] = (df_fdi_pos_acc.loc[idx_fill, str_minuend] - 
                                           df_fdi_pos_acc.loc[idx_fill, str_subtrahend]).values

### Step 3: Net is still empty -> the single available account is used (with the sign corresponding to the direction):
for arr_direction, str_source, bool_negative in [(arr_fdi_outward, 'Asset', False), (arr_fdi_outward, 'Liability', True),
                                                 (arr_fdi_inward, 'Asset', True), (arr_fdi_inward, 'Liability', False)]:
    idx_fill = arr_direction & df_fdi_pos_acc['Net'].isna() & df_fdi_pos_acc[str_source].notna()
    arr_source = df_fdi_pos_acc.loc[idx_fill, str_source].values
    df_fdi_pos_acc.loc[idx_fill, 'Net'] = -arr_source if bool_negative else arr_source

### Step 4: Net and exactly one account are known -> the other account is restored from them:
for arr_direction, str_known, str_missing, bool_known_minus_net in [(arr_fdi_outward, 'Asset', 'Liability', True), 
                                                                    (arr_fdi_outward, 'Liability', 'Asset', False),
                                                                    (arr_fdi_inward, 'Asset', 'Liability', False), 
                                                                    (arr_fdi_inward, 'Liability', 'Asset', True)]:
    idx_fill = arr_direction & df_fdi_pos_acc['Net'].notna() & \
               df_fdi_pos_acc[str_known].notna() & df_fdi_pos_acc[str_missing].isna()
    ser_known = df_fdi_pos_acc.loc[idx_fill, str_known]
    ser_net = df_fdi_pos_acc.loc[idx_fill, 'Net']
    if bool_known_minus_net:
        df_fdi_pos_acc.loc[idx_fill, str_missing] = (ser_known - ser_net).values
    else:
        df_fdi_pos_acc.loc[idx_fill, str_missing] = (ser_net + ser_known).values

### Back to the long form with the levels order Type / Direction / Account / Date / Reporter / Partner:
ser_fdi_pos_data = df_fdi_pos_acc.stack('Account', dropna = False)
ser_fdi_pos_data = ser_fdi_pos_data.reorder_levels([0, 1, 5, 2, 3, 4]).sort_index()
### Negative values of every account are floored to zero:
ser_fdi_pos_data[ser_fdi_pos_data < 0.0] = 0.0

del df_fdi_pos_acc
gc.collect()

17

In [14]:
### OECD FDI: FDI POSITION DATASETS SAVING

### Outward total assets: ISON countries as reporters:
ser_total_do = ser_fdi_pos_data.loc['Total', 'Outward', 'Asset', :, list_ison_countries, :]
ser_total_do = ser_total_do.droplevel(['Type', 'Direction', 'Account']).sort_index()
#ser_total_do.to_hdf(path_or_buf = str_path_oecd_fdi_dataset, key = str_key_do_total_oecd_fdi_dataset, mode = 'w', format = 'fixed')
### Inward total liabilities: ISON countries as partners, then both sides are swapped to mirror the outward view:
ser_total_di = ser_fdi_pos_data.loc['Total', 'Inward', 'Liability', :, :, list_ison_countries]
ser_total_di = ser_total_di.droplevel(['Type', 'Direction', 'Account']).rename_axis(['Date', 'Partner', 'Reporter'])
ser_total_di = ser_total_di.reorder_levels(['Date', 'Reporter', 'Partner']).sort_index()
#ser_total_di.to_hdf(path_or_buf = str_path_oecd_fdi_dataset, key = str_key_di_total_oecd_fdi_dataset, mode = 'a', format = 'fixed')

In [18]:
### NEED TO BE ADOPTED: OECD FDI: TOTAL DATA AGGREGATION: DATASETS LOADING

gc.collect()
### Outward assets and mirrored inward liabilities are loaded from the filtered dataset file:
ser_oecd_asset = pd.read_hdf(path_or_buf = str_path_oecd_fdi_dataset, key = str_key_do_total_oecd_fdi_dataset).rename('Asset')
ser_oecd_liability_inv = pd.read_hdf(path_or_buf = str_path_oecd_fdi_dataset, key = str_key_di_total_oecd_fdi_dataset)\
                           .rename('Liability_Inverted')
### Both sources side by side:
df_oecd_total = pd.concat([ser_oecd_asset, ser_oecd_liability_inv], axis = 1, names = 'Data Source')
df_oecd_total = df_oecd_total.astype('float32').round(2)

In [20]:
### OECD FDI: UNCONDITIONAL COMBINATION

### Combining Asset & inverted Liability data: zero assets are treated as missing, so the mirrored liability 
### is taken wherever the asset value is absent.
### np.nan is used instead of the np.NaN alias, which is not available in NumPy 2.x:
df_oecd_total.loc[df_oecd_total['Asset'] == 0.0, 'Asset'] = np.nan
ser_oecd_augmented = df_oecd_total['Asset'].combine_first(df_oecd_total['Liability_Inverted'])

In [21]:
### NEED TO BE ADOPTED: OECD FDI: RESULTS SAVING

### The combined series is written to the HDF5 file in 'fixed' format; mode 'w' creates the file anew, 
### so any keys stored earlier in the same file are not kept:
ser_oecd_augmented.to_hdf(path_or_buf = str_path_oecd_fdi_augmented, key = str_key_oecd_fdi_augmented, mode = 'w', format = 'fixed')

In [8]:
### OECD FDI: RESULTS CHECK

### Freshly saved result and previously produced reference result:
ser_oecd_test_new = pd.read_hdf(path_or_buf = str_path_oecd_fdi_augmented, key = str_key_oecd_fdi_augmented).rename('New')
ser_oecd_test_old = pd.read_hdf(path_or_buf = 'Data_Files/Source_Files/oecd_total_assets.h5')['Unconditional'].rename('Old')
### Descriptive statistics side by side:
df_oecd_test_stats = pd.concat([ser_oecd_test_new.describe(), ser_oecd_test_old.describe()], axis = 1)
display(df_oecd_test_stats)
### Total lengths (missing values included) comparison:
print(len(ser_oecd_test_new), '~', len(ser_oecd_test_old))

Unnamed: 0,New,Old
count,49519.0,49519.0
mean,6098.917,6098.917
std,38129.8,38129.8
min,0.0,0.0
25%,1.18,1.18
50%,50.62,50.62
75%,853.18,853.18
max,1081692.0,1081692.0


104318 ~ 104318


In [6]:
### GRAVITY DATASET

In [7]:
### NEED TO BE ADOPTED: GRAVITY: DATA SAVING PARAMETERS

### GDP dataset (HDF5 file path and key):
str_path_wb_gdp_dataset = 'Data_Files/Source_Files/gdp_dataset.h5'
str_wb_gdp_dataset = 'gdp_dataset'
### CEPII dataset (HDF5 file path and key):
str_path_cepii_dataset = 'Data_Files/Source_Files/cepii_dataset.h5'
str_distance_dataset = 'distance_dataset'
### Resulting dataset (HDF5 file path and key):
str_path_gravity = 'Data_Files/Source_Files/gravity_constructed.h5'
str_key_gravity = 'gravity'

In [8]:
### CEPII DISTANCES: LOADING PARAMETERS PREPARATION

### MultiIndex level slice constant:
All = slice(None)
### Path to MS Excel file:
str_path_cepii_source = 'Data_Files/Source_Files/CEPII Distance Data/dist_cepii.xls'
### Saved dataset (same values as in the gravity data saving parameters cell, repeated so this section can be run on its own):
str_path_cepii_dataset = 'Data_Files/Source_Files/cepii_dataset.h5'
str_distance_dataset = 'distance_dataset'

In [9]:
### CEPII DISTANCES: DATA EXPORT AND REPACKING

### Constants:
str_path_cepii_source = 'Data_Files/Source_Files/CEPII Distance Data/dist_cepii.xls'
list_distance_columns = ['dist', 'distcap', 'distw', 'distwces']
### Source data export (the first two sheet columns form the index):
df_distance_source = pd.read_excel(str_path_cepii_source, index_col = [0, 1])
### Long to Short Country ID's converting (the same lookup series serves both sides of a pair):
ser_long_to_short = df_country_codes.set_index('ISO LONG').squeeze()
df_distance_data = df_distance_source.join(ser_long_to_short, on = 'iso_o').rename({'ISO SHORT': 'From_ID'}, axis = 1)
df_distance_data = df_distance_data.join(ser_long_to_short, on = 'iso_d').rename({'ISO SHORT': 'To_ID'}, axis = 1)
### Rows with any missing value are dropped, then short ID's become the index:
df_distance_data = df_distance_data.dropna().set_index(['From_ID', 'To_ID'])
### ISON countries filtering and distances converting to integers:
df_distance_data = df_distance_data.loc[(list_ison_countries, list_ison_countries), list_distance_columns].astype(int)
### Result saving:
df_distance_data.to_hdf(path_or_buf = str_path_cepii_dataset, key = str_distance_dataset, mode = 'w')
In [11]:
### WORLD BANK: WDI: GDP: GENERAL DATA PREPARATION

### MultiIndex level slice constant:
All = slice(None)

### Base URL of the World Bank API:
str_wdi_base_url = 'http://api.worldbank.org/v2/'
### Query string: JSON output with up to 29999 records per page (the extraction cell reads a single page only):
str_wdi_request_format = '?format=json&per_page=29999'
### Indicator code requested as the GDP dataset:
str_gdp_dataset = 'NY.GDP.MKTP.CD'

In [16]:
### WORLD BANK: WDI: GDP: DATA EXTRACTING

### Session initializing:
request_session = requests.Session()
### List of ISON countries converting:
str_reporters_all = ';'.join(sorted(list_ison_long))
### URL for API request:
str_gdp_url = str_wdi_base_url + 'country/' + str_reporters_all + '/indicator/' + str_gdp_dataset + \
              str_wdi_request_format
### API response (timeout is (connect, read) in seconds; HTTP errors are raised explicitly):
obj_gdp_dataset = request_session.get(str_gdp_url, timeout = (10, 300))
obj_gdp_dataset.raise_for_status()
### Response body is parsed once: the first element is the paging metadata, the second one is the list of records.
### An error response has no records element, so it is reported with the request URL instead of an IndexError:
list_gdp_payload = obj_gdp_dataset.json()
if (len(list_gdp_payload) < 2) or (list_gdp_payload[1] is None):
    raise ValueError('No data in response of the next request:\n' + str_gdp_url)
### Only the first page is read, so a truncated response is reported:
if (list_gdp_payload[0].get('pages', 1) > 1):
    print('Warning: response contains', list_gdp_payload[0]['pages'], 'pages, only the first one is loaded')
### Data converting from JSON to pandas (country ID is taken from the nested 'country' record):
df_gdp_records = pd.DataFrame(list_gdp_payload[1])
df_raw_dataset = pd.DataFrame({'Country': df_gdp_records['country'].map(lambda dict_country: dict_country['id']),
                               'Year': df_gdp_records['date'],
                               'Value': df_gdp_records['value']})
### Years are parsed to timestamps and rolled forward to the business year end:
df_raw_dataset['Date'] = pd.to_datetime(df_raw_dataset['Year']) + pd.offsets.BYearEnd()
### Adding data to container:
ser_full_gdp = df_raw_dataset.set_index(['Date', 'Country'])['Value'].sort_index()
### Data saving:
ser_full_gdp.to_hdf(path_or_buf = str_path_wb_gdp_dataset, key = str_wb_gdp_dataset, mode = 'w')

In [18]:
### NEED TO BE ADOPTED: DATASETS LOADING:

### WB WDI GDP dataset (series saved by the GDP extracting cell, indexed by Date and Country):
ser_gdp = pd.read_hdf(path_or_buf = str_path_wb_gdp_dataset, key = str_wb_gdp_dataset)
### CEPII Distances dataset (only the 'distw' column of the saved table is used):
ser_dist = pd.read_hdf(path_or_buf = str_path_cepii_dataset, key = str_distance_dataset)['distw']

In [19]:
### GRAVITY: DATASET CONSTRUCTION

### Construction constants:
flo_dist_power = 1
### Distances naming:
ser_dist.index.names = ['Reporter', 'Partner']
ser_dist.name = 'Distance'
### Dropping internal distances (the column is selected by name, so a one-row result does not collapse to a scalar):
df_dist = ser_dist.reset_index()
df_dist = df_dist[df_dist['Reporter'] != df_dist['Partner']]
ser_dist_cleared = df_dist.set_index(['Reporter', 'Partner'])['Distance'].sort_index()
### GDP duplicating (renamed copies, so the source series and its index stay untouched):
ser_gdp_reporter = ser_gdp.rename_axis(['Date', 'Reporter']).rename('GDP_Reporter')
ser_gdp_partner = ser_gdp.rename_axis(['Date', 'Partner']).rename('GDP_Partner')
### Levels are reordered by name: the join puts the matched level first, so the positional order 
### differs between the reporter and the partner tables:
list_pair_levels = ['Reporter', 'Partner', 'Date']
### Reporters data connecting:
df_reporter = ser_dist_cleared.to_frame().join(ser_gdp_reporter).reorder_levels(list_pair_levels).sort_index()
### Partners data connecting:
df_partner = ser_dist_cleared.to_frame().join(ser_gdp_partner).drop('Distance', axis = 1)
df_partner = df_partner.reorder_levels(list_pair_levels).sort_index()
### Joining data (pairs without any GDP observation have an empty date and are dropped):
df_gravity = pd.concat([df_reporter, df_partner], axis = 1)
df_gravity = df_gravity.reset_index('Date').dropna(subset = ['Date']).set_index('Date', append = True)\
                       .reorder_levels(['Date', 'Reporter', 'Partner']).sort_index()
### Gravity calculation: product of both GDP values (each divided by 10 ** 9) over the distance raised to the power constant:
ser_gravity = (df_gravity['GDP_Reporter'] / 10 ** 9) * (df_gravity['GDP_Partner'] / 10 ** 9) / (df_gravity['Distance'] ** flo_dist_power)

In [20]:
### NEED TO BE ADOPTED: GRAVITY: RESULTS SAVING

### The gravity series is written to the HDF5 file in 'fixed' format; mode 'w' creates the file anew, 
### so any keys stored earlier in the same file are not kept:
ser_gravity.to_hdf(path_or_buf = str_path_gravity, key = str_key_gravity, mode = 'w', format = 'fixed')