In [1]:
### IMF CDIS: BILATERAL EQUITY & DEBT INVESTMENT POSITIONS

In [2]:
### RUN EVERY TIME: INITIALIZATION

import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', -1) ### To display long strings
import math
import requests
import json ### To correct JSON structure before unpacking
import gc
import os
import datetime
import time
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mticker
import seaborn as sns
#%load_ext line_profiler

In [3]:
### RUN EVERY TIME: VERSION CONTROL

### Printing the pandas and Python versions of the running kernel, so that the saved output records the environment:
from platform import python_version
print('pandas version: ', pd.__version__)
print('python version: ', python_version())

pandas version:  0.25.3
python version:  3.7.4


In [6]:
### RUN EVERY TIME: MAIN CONSTANTS

### All file paths below are relative strings, so they are resolved against the working directory of the kernel.
### For every HDF5 file the 'str_path_...' constant is the file location and the 'str_key_...' constants are keys inside of it.

### MultiIndex level slice constant:
All = slice(None)
### Universe path:
str_path_universe = 'Data_Files/Source_Files/acadian_universe.xlsx'
### IMF CDIS datasets:
str_path_imf_cdis_dataset = 'Data_Files/Source_Files/cdis_dataset.h5'
str_key_do_total_imf_cdis_dataset = 'cdis_total_outward_dataset'
str_key_di_total_imf_cdis_dataset = 'cdis_total_inward_dataset'
str_key_do_equity_imf_cdis_dataset = 'cdis_equity_outward_dataset'
str_key_di_equity_imf_cdis_dataset = 'cdis_equity_inward_dataset'
str_path_imf_cdis_augmented = 'Data_Files/Source_Files/cdis_augmented.h5'
str_key_do_total_imf_cdis_augmented = 'cdis_total_outward_augmented'
str_key_do_equity_imf_cdis_augmented = 'cdis_equity_outward_augmented'
str_path_imf_cdis_options = 'Data_Files/Source_Files/cdis_options.h5'
str_key_total_imf_cdis_options = 'cdis_total_outward_options'
str_key_equity_imf_cdis_options = 'cdis_equity_outward_options'
### OECD FDI datasets:
str_path_oecd_fdi_augmented = 'Data_Files/Source_Files/oecd_augmented.h5'
str_key_do_total_oecd_fdi_augmented = 'fdi_total_outward_augmented'
str_key_do_equity_oecd_fdi_augmented = 'fdi_equity_outward_augmented'
str_path_oecd_fdi_options = 'Data_Files/Source_Files/oecd_options.h5'
str_key_total_oecd_fdi_options = 'fdi_total_outward_options'
str_key_equity_oecd_fdi_options = 'fdi_equity_outward_options'
### Combined datasets:
str_path_direct_total_augmented = 'Data_Files/Source_Files/direct_total_augmented.h5'
str_path_direct_equity_augmented = 'Data_Files/Source_Files/direct_equity_augmented.h5'
str_key_direct_augmented = 'direct_augmented'
str_path_total_direct_options = 'Data_Files/Source_Files/direct_total_options.h5'
str_key_total_direct_options = 'direct_total_options'
str_path_equity_direct_options = 'Data_Files/Source_Files/direct_equity_options.h5'
str_key_equity_direct_options = 'direct_equity_options'
### Technical Constants:
### The end date is kept both as a string and as a Timestamp built from that string:
str_date_end = '2022-10-31'
date_start = pd.Timestamp('1989-12-29')
date_end = pd.Timestamp(str_date_end)
### NOTE(review): presumably the first date of the universe membership history - confirm against the universe file:
date_ison = pd.Timestamp('1994-12-31')

In [7]:
### DEFINING COUNTRY CODES EXTRACTOR

def get_country_codes(use_local_copy = False):
    '''
    Load the table of country ISO codes.

    Parameters:
        use_local_copy : if True, the saved HTML page is parsed instead of the online one
    Returns:
        DataFrame with columns 'ISO SHORT' and 'ISO LONG', indexed by upper-cased country name and sorted by it
    '''
    ### Choosing the source: local file for the case when the URL is unavailable, online page otherwise:
    str_codes_source = 'Data_Files/Source_Files/countrycode.html' if use_local_copy else 'https://countrycode.org/'
    ### The first table of the page is taken, with the country name as index:
    df_codes_page = pd.read_html(str_codes_source, index_col = 'COUNTRY')[0]
    ### Splitting the combined codes column into short and long codes:
    df_iso_codes = df_codes_page['ISO CODES'].str.split(' / ', expand = True)
    df_iso_codes.columns = ['ISO SHORT', 'ISO LONG']
    ### Sorting by original country name and converting names to upper case:
    df_iso_codes = df_iso_codes.sort_index()
    df_iso_codes.index = df_iso_codes.index.str.upper()
    ### Results output:
    return df_iso_codes

In [8]:
### DEFINING EXTRACTION UNIVERSE DATA FROM MS EXCEL SOURCE (TO BE IGNORED IN PRODUCT CODE)

def ison_membership_converting(str_path_universe, date_end, bool_daily = False, int_backfill_months = 0):
    '''
    Build the market membership history by country from the 'Switchers' sheet of the universe workbook.

    Parameters:
        str_path_universe : path of the MS Excel workbook to read
        date_end : last date of the business-month-end range each country history is extended to
        bool_daily : if True, the result is resampled to business days with forward filling
        int_backfill_months : number of business month ends to add before the first date of each region
                              (0 switches the backfilling off)
    Returns:
        Series named 'Market' indexed by ('Date', 'Country'); source codes listed in dict_markets
        are translated to 'DM' / 'EM' / 'FM' labels, code 0 (also used for empty cells) becomes NaN
    '''
    ### Defining business-month-end reindexation on country level:
    def country_modify(ser_raw_country, date_end):
        ### Country level is dropped, then the last value of each month is placed on the business month end:
        ser_res_country = ser_raw_country.droplevel(0).resample('MS').last().resample('BM').last()
        ### History is extended from the first available date to date_end and gaps are forward filled:
        range_country = pd.date_range(ser_res_country.index[0], date_end, freq = 'BM')
        return ser_res_country.reindex(range_country).ffill()
    ### Markets encoding table:
    dict_markets = {50 : 'DM', 57 : 'EM', 504 : 'FM', 0: np.NaN}     
    ### Loading source file:
    df_raw_universe = pd.read_excel(engine = 'openpyxl', io = str_path_universe, sheet_name = 'Switchers', header = 0, parse_dates = True, index_col = [0, 1],
                                 na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', 
                                             '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null'], keep_default_na = False)
    ### Converting source file:
    df_raw_universe.index.names = ['Country', 'Date']
    ser_raw_universe = df_raw_universe['Region']
    ### Missing region codes are encoded as 0, which dict_markets maps back to NaN after the reindexation:
    ser_raw_universe.fillna(0, inplace = True)
    ser_raw_universe.name = 'Market'
    ### By country reindexation and translation:
    ser_res_universe = ser_raw_universe.groupby('Country').apply(country_modify, date_end)
    ser_res_universe.index.names = ['Country', 'Date']
    ### After the levels swap the index order is (Date, Country):
    ser_res_universe = ser_res_universe.replace(dict_markets).reorder_levels([1, 0]).sort_index() 
    ### Expanding membership for primary regions members by backfilling:
    if int_backfill_months:
        ### List of regions:
        list_region = list(ser_res_universe.dropna().unique())
        ### Initialising of collection of series with backfilled data for each region:
        list_ison_backfill = []
        ### Regions looping:
        for iter_region in list_region:
            ### Defining start of region date:
            date_first_valid = ser_res_universe.loc[ser_res_universe == iter_region].first_valid_index()[0]
            ### Creating dates index to backfilling (the region start date itself is excluded by the [: -1] slice):
            idx_date_backfill = pd.date_range(end = date_first_valid, periods = int_backfill_months + 1, freq = 'BM')[: -1]
            ### Creating primary countries index to backfilling (countries that belong to the region on its start date):            
            idx_region_backfill = ser_res_universe.loc[ser_res_universe == iter_region].loc[date_first_valid, All].index.get_level_values('Country')
            ### Creating full index:
            idx_ison_backfill = pd.MultiIndex.from_product([idx_date_backfill, idx_region_backfill])
            ### Series with backfilled data:
            list_ison_backfill.append(pd.Series(iter_region, index = idx_ison_backfill))
        ### Combination of backfilled series and original ISON data (original values take priority):    
        ser_res_universe = ser_res_universe.combine_first(pd.concat(list_ison_backfill, axis = 0)).sort_index()  
        ser_res_universe.index.names = ['Date', 'Country']
    ### Converting to daily frequency:
    if bool_daily:
        ser_res_universe = ser_res_universe.reset_index('Country').groupby('Country').resample('B').ffill()['Market'].swaplevel().sort_index()    
    ### Results output:
    ser_res_universe.name = 'Market'
    return ser_res_universe

In [9]:
### RUN EVERY TIME: COMMON DATA EXTRACTION STEPS

### Table of short and long ISO codes by country name:
df_country_codes = get_country_codes()
### Universe membership history up to the configured end date:
ser_ison_membership = ison_membership_converting(str_path_universe, pd.to_datetime(str_date_end))
### Long ISO codes of the countries that are present in the universe:
idx_ison_short = ser_ison_membership.index.get_level_values('Country').unique()
ser_in_universe = df_country_codes['ISO SHORT'].isin(idx_ison_short)
list_ison_long = list(df_country_codes.loc[ser_in_universe, 'ISO LONG'].values)
### Membership on the end date:
ser_ison_status = ser_ison_membership.loc[str_date_end].droplevel('Date')
### Universe size: full and by region:
int_ison_number = len(list_ison_long)
list_regions = ['DM', 'EM', 'FM']
dict_ison_len = {'Full Universe': int_ison_number}
for iter_region in list_regions:
    ser_region_members = ser_ison_status[ser_ison_status == iter_region]
    dict_ison_len[iter_region] = len(ser_region_members)
ser_market_len = pd.Series(dict_ison_len)
ser_market_len.index.names = ['Market']

In [10]:
### IMF CDIS: GENERAL DATA PREPARATION

### Constants:
### MultiIndex level slice constant (same value as in the main constants cell):
All = slice(None)
### Browser-like headers to be attached to every request of the session:
dict_request_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
### Service root and the suffixes that are appended to it for each kind of request:
str_imf_base_url = 'http://dataservices.imf.org/REST/SDMX_JSON.svc/'
str_imf_dataflow_add = 'DataFlow'
str_imf_datastructure_add = 'DataStructure/'
str_imf_codelist_add = 'CodeList/'
str_imf_dataset_add = 'CompactData/'
### Pause in seconds between consecutive requests:
int_seconds_to_sleep = 1
### NOTE(review): presumably the maximum number of countries per single request - not used in the cells shown here, confirm:
int_imf_country_limit = 30

In [11]:
### IMF CDIS: REQUESTS SESSION INITIALIZING

### Single session object that is reused by all the requests to the IMF service in the cells below:
request_session = requests.Session()
### For avoiding data request errors from IMF Data Service:
request_session.headers.update(dict_request_headers)

In [12]:
### IMF CDIS: DATAFLOW SEARCHING

### Requesting the full list of dataflows:
str_imf_dataflow_url = str_imf_base_url + str_imf_dataflow_add
obj_imf_dataflow_list = request_session.get(str_imf_dataflow_url).json()
df_imf_dataflow = pd.DataFrame(obj_imf_dataflow_list['Structure']['Dataflows']['Dataflow'])
### Unpacking the text description from the nested 'Name' field:
arr_imf_dataflow_text = df_imf_dataflow['Name'].apply(pd.Series)['#text'].values
df_imf_dataflow = df_imf_dataflow.assign(Description = arr_imf_dataflow_text)[['@id', 'Description']]
ser_imf_dataflow = df_imf_dataflow.set_index('@id', drop = True).squeeze()
### Searching DataFlow code for further requests (first dataflow with 'CDIS' in description, 'DS-' part removed):
ser_is_cdis = ser_imf_dataflow.str.contains('CDIS')
str_imf_cdis_id = ser_imf_dataflow[ser_is_cdis].index[0].replace('DS-', '')
print(str_imf_cdis_id)

CDIS


In [13]:
### IMF CDIS: DATASTRUCTURE SEARCHING

### Requesting the structure of the CDIS dataflow:
str_imf_cdis_structure_url = str_imf_base_url + str_imf_datastructure_add + str_imf_cdis_id
obj_imf_cdis_structure = request_session.get(str_imf_cdis_structure_url).json()
### Receiving DataFlow parameters and code lists for each of them:
list_imf_cdis_dimensions = obj_imf_cdis_structure['Structure']['KeyFamilies']['KeyFamily']['Components']['Dimension']
list_imf_cdis_columns = ['@conceptRef', '@codelist', '@isFrequencyDimension']
df_imf_cdis_params = pd.DataFrame(list_imf_cdis_dimensions)[list_imf_cdis_columns]
print(df_imf_cdis_params)

        @conceptRef          @codelist @isFrequencyDimension
0  FREQ              CL_FREQ            true                
1  REF_AREA          CL_AREA_CDIS       NaN                 
2  INDICATOR         CL_INDICATOR_CDIS  NaN                 
3  COUNTERPART_AREA  CL_AREA_CDIS       NaN                 


In [14]:
### IMF CDIS: CODES DESCRIPTIONS LOADING

### Code list of the INDICATOR dimension is located by the dimension name and not by its position in the structure table,
### so that a changed order of dimensions can not lead to loading a wrong code list:
ser_indicator_codelist = df_imf_cdis_params.loc[df_imf_cdis_params['@conceptRef'] == 'INDICATOR', '@codelist']
if ser_indicator_codelist.empty:
    raise KeyError('INDICATOR dimension is not found in the CDIS data structure')
str_param_code = ser_indicator_codelist.iloc[0]
### Pause before the next request to the service:
time.sleep(int_seconds_to_sleep)
obj_imf_cdis_param = request_session.get(str_imf_base_url + str_imf_codelist_add + str_param_code).json()
df_imf_cdis_param = pd.DataFrame(obj_imf_cdis_param['Structure']['CodeLists']['CodeList']['Code'])
### Receiving values for the code list (text description is unpacked from the nested 'Description' field):
df_imf_cdis_param = df_imf_cdis_param.assign(Text = df_imf_cdis_param['Description'].apply(pd.Series)['#text'].values)[['@value', 'Text']]
### Indicator code -> indicator description:
dict_indicator = dict(zip(df_imf_cdis_param['@value'], df_imf_cdis_param['Text']))

### Sorted list of universe countries (second level of the membership index):
list_ison_countries = sorted(list(map(str, ser_ison_membership.index.get_level_values(1).unique())))
### Frequency code for the data requests:
str_cdis_freq = 'A' # 'B' #

In [19]:
### TEMP

### Subset of indicators (code -> description) for the four position codes listed below;
### the same dictionary is built again at the start of the retrieving cell:
dict_to_download = {iter_key: dict_indicator[iter_key] for iter_key in \
                    ('IOW_BP6_USD', 'IIW_BP6_USD', 'IOWE_BP6_USD', 'IIWE_BP6_USD')}
### Saving the full indicators list to MS Excel file for manual inspection:
pd.Series(dict_indicator).to_excel('Data_Files/Test_Files/IMF_CDIS_Indicators.xlsx')

In [None]:
### IMF CDIS: REPORTED DIRECT INVESTMENT NET VOLUMES RETRIEVING

### This part of the cell sends one request per (country, indicator) pair and collects one dataframe
### per bilateral series into list_cdis_bilateral; the rest of the cell concatenates and cleans them.
gc.collect()
### Extracting needed part of indicators:
dict_to_download = {iter_key: dict_indicator[iter_key] for iter_key in ('IOW_BP6_USD', 'IIW_BP6_USD', 'IOWE_BP6_USD', 'IIWE_BP6_USD')}
### List of bilateral dataframes for future concatenation:
list_cdis_bilateral = [] 
### Beginning of request URL:
str_cdis_const_url = str_imf_base_url + str_imf_dataset_add + str_imf_cdis_id + '/' 
### Looping over reporter:
for iter_investor in list_ison_countries:
#for iter_investor in ['CA']:  
    ### Looping over indicator:
    for iter_indicator in dict_to_download:        
#    for iter_indicator in ['IOWE_BP6_USD', 'IIWE_BP6_USD']:        
        ### Request key is FREQ.REF_AREA.INDICATOR.COUNTERPART_AREA with empty positions left open:
        ### for codes with 'O' as the second letter the country is placed to the REF_AREA position,
        ### for the other codes it is placed to the COUNTERPART_AREA position:
        if (iter_indicator[1] == 'O'):
            str_cdis_full_url = str_cdis_const_url + '.'.join([str_cdis_freq, iter_investor, iter_indicator, ''])
        else:
            str_cdis_full_url = str_cdis_const_url + '.'.join([str_cdis_freq, '', iter_indicator, iter_investor])            
        obj_cdis_set = request_session.get(str_cdis_full_url)
        ### Data reading as JSON:
        ### '@OBS_STATUS' keys are renamed to '@OBS_VALUE' in the raw text before parsing, so status codes
        ### are read into the value field; 'C' and '-' values are turned to NaN later in this cell:
        dict_cdis_set = json.loads(obj_cdis_set.text.replace('@OBS_STATUS', '@OBS_VALUE'))
        ### Converting each bilateral dataset to dataframe and its munging:
        if ('Series' in dict_cdis_set['CompactData']['DataSet']):
            ### A single series comes as a dictionary instead of a list, so it is wrapped to a list:
            if isinstance(dict_cdis_set['CompactData']['DataSet']['Series'], list):
                list_series = dict_cdis_set['CompactData']['DataSet']['Series']
            else:
                list_series = [dict_cdis_set['CompactData']['DataSet']['Series']]
            for dict_cdis_pair in list_series:
                ### NOTE(review): a series without the 'Obs' key would raise KeyError here - confirm that the service never returns one:
                ### The same wrapping is done for a single observation:
                if isinstance(dict_cdis_pair['Obs'], list):
                    dict_bilateral = dict_cdis_pair['Obs']
                else:
                    dict_bilateral = [dict_cdis_pair['Obs']]
                df_cdis_bilateral = pd.DataFrame(dict_bilateral)
                ### Series with no value column at all are skipped:
                if '@OBS_VALUE' in df_cdis_bilateral.columns:
                    df_cdis_bilateral = df_cdis_bilateral[['@TIME_PERIOD', '@OBS_VALUE']]
                    df_cdis_bilateral.columns = ['Date', 'Value']
                    df_cdis_bilateral = df_cdis_bilateral.assign(Indicator = dict_cdis_pair['@INDICATOR'])
                    df_cdis_bilateral = df_cdis_bilateral.assign(Reporter_ID = dict_cdis_pair['@REF_AREA'])
                    df_cdis_bilateral = df_cdis_bilateral.assign(Partner_ID = dict_cdis_pair['@COUNTERPART_AREA'])
                    list_cdis_bilateral.append(df_cdis_bilateral)  
        else:
            print('No data in response of the next request:\n', str_cdis_full_url)
        ### Pause between requests:
        time.sleep(int_seconds_to_sleep)                    
#        break            
    print(iter_investor, ': loading completed')
#    break
### Bilateral datasets aggregating:
df_cdis_raw = pd.concat(list_cdis_bilateral, axis = 0, ignore_index = True, sort = False)
### Period labels are parsed to dates and moved to the business year end:
df_cdis_raw['Date'] = pd.to_datetime(df_cdis_raw['Date']) + pd.offsets.BYearEnd()
### 'C' and '-' markers are treated as missing values:
df_cdis_raw.loc[df_cdis_raw['Value'].isin(['C', '-']), 'Value'] = np.nan
### Dropping self-directed pairs and partners with no short ISO code in the country codes table:
df_cdis_raw = df_cdis_raw[df_cdis_raw['Reporter_ID'] != df_cdis_raw['Partner_ID']]
df_cdis_raw = df_cdis_raw[df_cdis_raw['Partner_ID'].isin(df_country_codes['ISO SHORT'].values)]
print('Unique partners number:', len(df_cdis_raw['Partner_ID'].unique()))
### Every step below builds a new object and assigns it back: in-place methods called on a filtered frame
### or on a selected column (df['col'].method(inplace = True)) are not guaranteed to change the frame itself:
df_cdis_raw = df_cdis_raw.rename({'Reporter_ID': 'Reporter', 'Partner_ID': 'Partner'}, axis = 1)
df_cdis_raw = df_cdis_raw.astype({'Indicator': 'str', 'Reporter': 'str', 'Partner': 'str', 
                                  'Value': 'float32'})
### Negative positions are floored at zero:
df_cdis_raw['Value'] = df_cdis_raw['Value'].clip(lower = 0.0)
### Indicator codes are replaced with their descriptions:
df_cdis_raw['Indicator'] = df_cdis_raw['Indicator'].replace(dict_to_download)
### First word of description is the direction, second word is the type ('Direct' is renamed to 'Total'):
df_cdis_raw['Direction'] = df_cdis_raw['Indicator'].str.partition(' ')[0]
df_cdis_raw['Type'] = df_cdis_raw['Indicator'].str.partition(' ')[2].str.partition(' ')[0].replace({'Direct': 'Total'})
### Data saving:
ser_cdis_full = df_cdis_raw.set_index(['Type', 'Direction', 'Date', 'Reporter', 'Partner'])['Value'].sort_index()
del df_cdis_raw

In [34]:
### IMF CDIS: POSITION DATASETS SAVING

### Four series are written to one HDF5 file. The first write uses mode 'w' (the file is created anew),
### the next ones use mode 'a' (keys are added to the same file), so the order of statements matters.

### Outward positions: universe countries as reporters, countries with known short ISO codes as partners:
ser_cdis_full.loc['Total', 'Outward', :, ser_ison_status.index, df_country_codes['ISO SHORT'].values].droplevel(['Type', 'Direction'])\
    .to_hdf(path_or_buf = str_path_imf_cdis_dataset, key = str_key_do_total_imf_cdis_dataset, mode = 'w', format = 'fixed')
ser_cdis_full.loc['Equity', 'Outward', :, ser_ison_status.index, df_country_codes['ISO SHORT'].values].droplevel(['Type', 'Direction'])\
    .to_hdf(path_or_buf = str_path_imf_cdis_dataset, key = str_key_do_equity_imf_cdis_dataset, mode = 'a', format = 'fixed')
### Inward positions: universe countries are selected on the partner level; then the names of the Reporter and Partner
### levels are exchanged and the levels are reordered, so the saved index has the same (Date, Reporter, Partner) layout
### as the outward datasets with the universe country in the 'Reporter' position:
ser_total_di = ser_cdis_full.loc['Total', 'Inward', :, df_country_codes['ISO SHORT'].values, ser_ison_status.index].droplevel(['Type', 'Direction'])
ser_total_di.index.names = ['Date', 'Partner', 'Reporter']
ser_total_di.reorder_levels(['Date', 'Reporter', 'Partner']).sort_index()\
    .to_hdf(path_or_buf = str_path_imf_cdis_dataset, key = str_key_di_total_imf_cdis_dataset, mode = 'a', format = 'fixed')
ser_equity_di = ser_cdis_full.loc['Equity', 'Inward', :, df_country_codes['ISO SHORT'].values, ser_ison_status.index].droplevel(['Type', 'Direction'])
ser_equity_di.index.names = ['Date', 'Partner', 'Reporter']
ser_equity_di.reorder_levels(['Date', 'Reporter', 'Partner']).sort_index()\
    .to_hdf(path_or_buf = str_path_imf_cdis_dataset, key = str_key_di_equity_imf_cdis_dataset, mode = 'a', format = 'fixed')

In [7]:
### IMF CDIS: TOTAL DATA AGGREGATION: DATASETS LOADING

gc.collect()
### Saved total outward dataset:
ser_cdis_asset = pd.read_hdf(path_or_buf = str_path_imf_cdis_dataset, key = str_key_do_total_imf_cdis_dataset)
ser_cdis_asset.name = 'Asset'
### Saved total inward dataset (index levels were already inverted before saving):
ser_cdis_liability_inv = pd.read_hdf(path_or_buf = str_path_imf_cdis_dataset, key = str_key_di_total_imf_cdis_dataset)
ser_cdis_liability_inv.name = 'Liability_Inverted'
### Both sources side by side, as float32 rounded to two decimals:
list_cdis_sources = [ser_cdis_asset, ser_cdis_liability_inv]
df_cdis_total = pd.concat(list_cdis_sources, axis = 1, names = 'Data Source')
df_cdis_total = df_cdis_total.astype('float32').round(2)

In [8]:
### IMF CDIS: TOTAL DATA AGGREGATION: DATA QUALITY RATIOS

gc.collect()

### Defining similarity for investors by date
def get_investor_ratio(df_group):
    '''
    Measure the discrepancy between 'Asset' and 'Liability_Inverted' values inside one group of rows.

    Missing values of both columns are counted as zeros. Absolute differences are capped with the biggest
    Asset value of the group, summed, and then divided by the Asset total and by the number of rows.

    Parameters:
        df_group : DataFrame with 'Asset' and 'Liability_Inverted' columns (is not modified)
    Returns:
        float ratio, or NaN when the Asset total of the group is not positive
    '''
    ### Filling is done on a copy: the object passed in by groupby.apply must not be mutated:
    df_filled = df_group.fillna(0.0)
    flo_asset_sum = df_filled['Asset'].sum()
    if (flo_asset_sum > 0.0):
        ser_gap = (df_filled['Asset'] - df_filled['Liability_Inverted']).abs().clip(upper = df_filled['Asset'].max())
        flo_result = ser_gap.sum() / flo_asset_sum / len(df_filled)
    else:
        flo_result = np.nan
    return flo_result
### Defining similarity for borrowers by date
def get_borrower_ratio(df_group):
    '''
    Measure the discrepancy between 'Asset' and 'Liability_Inverted' values inside one group of rows.

    Missing values of both columns are counted as zeros. Absolute differences are capped with the biggest
    Liability_Inverted value of the group, summed, and then divided by the Liability_Inverted total
    and by the number of rows.

    Parameters:
        df_group : DataFrame with 'Asset' and 'Liability_Inverted' columns (is not modified)
    Returns:
        float ratio, or NaN when the Liability_Inverted total of the group is not positive
    '''
    ### Filling is done on a copy: the object passed in by groupby.apply must not be mutated:
    df_filled = df_group.fillna(0.0)
    flo_liability_sum = df_filled['Liability_Inverted'].sum()
    if (flo_liability_sum > 0.0):
        ser_gap = (df_filled['Asset'] - df_filled['Liability_Inverted']).abs().clip(upper = df_filled['Liability_Inverted'].max())
        flo_result = ser_gap.sum() / flo_liability_sum / len(df_filled)
    else:
        flo_result = np.nan
    return flo_result
### Similarity values calculation:
### One ratio for each reporter on each date:
gb_cdis_by_investor = df_cdis_total.groupby(['Date', 'Reporter'])
ser_investor_ratio = gb_cdis_by_investor.apply(get_investor_ratio)
ser_investor_ratio.name = 'Investor_Ratio'
### One ratio for each partner on each date:
gb_cdis_by_borrower = df_cdis_total.groupby(['Date', 'Partner'])
ser_borrower_ratio = gb_cdis_by_borrower.apply(get_borrower_ratio)
ser_borrower_ratio.name = 'Borrower_Ratio'

In [9]:
### IMF CDIS: TOTAL DATA AGGREGATION: SIMILARITY TEST

### Borrower ratio: the smallest and the biggest values with the (Date, Partner) keys where they are reached:
print(round(ser_borrower_ratio.min(), 4), '/', ser_borrower_ratio.idxmin())
print(round(ser_borrower_ratio.max(), 4), '/', ser_borrower_ratio.idxmax())

### Switched off manual checks of the source rows for particular (Date, Partner) keys:
#display(df_cdis_total.loc[('2019-12-31', All, 'BR'), :].dropna())
#display(df_cdis_total.loc[('2014-12-31', All, 'CW'), :].dropna())

0.0021 / (Timestamp('2019-12-31 00:00:00'), 'BR')
0.1436 / (Timestamp('2014-12-31 00:00:00'), 'CW')


In [10]:
### IMF CDIS: TOTAL DATA AGGREGATION: SIMILARITY TEST

### Investor ratio: the smallest and the biggest values with the (Date, Reporter) keys where they are reached:
print(round(ser_investor_ratio.min(), 4), '/', ser_investor_ratio.idxmin())
print(round(ser_investor_ratio.max(), 4), '/', ser_investor_ratio.idxmax())

### Switched off manual checks of the source rows for particular (Date, Reporter) keys:
#display(df_cdis_total.loc[('2014-12-31', 'SE', All), :].dropna())
#display(df_cdis_total.loc[('2016-12-30', 'MT', All), :].dropna())

0.0009 / (Timestamp('2015-12-31 00:00:00'), 'FR')
0.1847 / (Timestamp('2016-12-30 00:00:00'), 'MT')


In [11]:
### IMF CDIS: TOTAL DATA AGGREGATION: ADDING RATIOS

### Both ratio series are attached to the bilateral table as columns
### (presumably matched on the index level names they share with the table - verify on the result):
df_cdis_to_augment = df_cdis_total.join(ser_investor_ratio).join(ser_borrower_ratio)
### Empty column to be filled by the augmentation step:
df_cdis_to_augment['Asset_Augmented'] = np.NaN # -999 # 
#df_cpis_augmented['Verified'] = False
### Fixing the order of index levels:
df_cdis_to_augment = df_cdis_to_augment.reorder_levels(['Date', 'Reporter', 'Partner'])

In [12]:
### IMF CDIS: TOTAL DATA AGGREGATION: CONDITIONAL REPLACING

gc.collect()
def augment_by_date(df_date, int_option = 0):
    '''
    Fill the 'Asset_Augmented' column of one date group from 'Asset' and 'Liability_Inverted' values.

    Parameters:
        df_date : DataFrame with 'Asset', 'Liability_Inverted', 'Investor_Ratio', 'Borrower_Ratio'
                  and 'Asset_Augmented' columns (is not modified)
        int_option :
           -1 : Replace NaN or zero Asset values unconditionally
            0 : Replace NaN Asset values when Investor's Ratio is NaN or Investor's Ratio > Borrower's Ratio
            1 : Replace NaN or zero Asset values when Investor's Ratio is NaN or Investor's Ratio > Borrower's Ratio
            2 (and any other value) : Replace any Asset values when Investor's Ratio > Borrower's Ratio
                (zero Liability values are turned to NaN, NaN ratios are replaced with 999.0 / 1000.0
                 for investor / borrower in the returned table)
    Returns:
        modified copy of df_date
    '''
    ### All changes are made on a copy: the object passed in by groupby.apply must not be mutated:
    df_result = df_date.copy()
    ### Replacing zero Asset values with NaN (for every option except 0):
    if (int_option != 0):
        df_result.loc[df_result['Asset'] == 0.0, 'Asset'] = np.nan
    if (int_option == -1):
        df_result['Asset_Augmented'] = df_result['Asset'].combine_first(df_result['Liability_Inverted'])
    elif (int_option in (0, 1)):
        ser_asset_missing = df_result['Asset'].isna()
        ### Fill resulting column with not NaN Asset values:
        df_result.loc[~ser_asset_missing, 'Asset_Augmented'] = df_result.loc[~ser_asset_missing, 'Asset'].values
        ### Fill resulting column with Liability value if Asset value is NaN and Investor Ratio is either NaN
        ### or bigger than Borrower Ratio (comparison with a NaN ratio is False, so no separate notna checks are needed):
        ser_use_liability = ser_asset_missing & \
            (df_result['Investor_Ratio'].isna() | (df_result['Investor_Ratio'] > df_result['Borrower_Ratio']))
        df_result.loc[ser_use_liability, 'Asset_Augmented'] = df_result.loc[ser_use_liability, 'Liability_Inverted'].values
    else:
        ### Replacing zero Liability values with NaN:
        df_result.loc[df_result['Liability_Inverted'] == 0.0, 'Liability_Inverted'] = np.nan
        ### Ratios preparation (NaN investor ratio loses to any known borrower ratio and wins against NaN borrower ratio):
        df_result.loc[df_result['Investor_Ratio'].isna(), 'Investor_Ratio'] = 999.0
        df_result.loc[df_result['Borrower_Ratio'].isna(), 'Borrower_Ratio'] = 1000.0
        ### Ratios comparison (no NaN ratios are left, so the two masks cover all the rows):
        ser_keep_asset = df_result['Investor_Ratio'] <= df_result['Borrower_Ratio']
        df_result.loc[ser_keep_asset, 'Asset_Augmented'] = df_result.loc[ser_keep_asset, 'Asset'].values
        df_result.loc[~ser_keep_asset, 'Asset_Augmented'] = df_result.loc[~ser_keep_asset, 'Liability_Inverted'].values
    return df_result

### Augmented table for each of the options, keyed by the option number (in -1, 0, 1, 2 order):
dict_cdis_augmented = {int_option: df_cdis_to_augment.groupby('Date').apply(augment_by_date, int_option)
                       for int_option in (-1, 0, 1, 2)}

In [38]:
### IMF CDIS: TOTAL DATA AGGREGATION: RESULTS TESTING

### Raw data: number of reporters by date that have at least one Asset value which is neither NaN nor zero
### (zeros are turned to NaN, non-NaN values are counted for each reporter, reporters with zero count are dropped):
ser_quantity = df_cdis_to_augment['Asset'].replace({0.0: np.NaN}).groupby(['Date','Reporter']).count()
ser_quantity = ser_quantity[ser_quantity > 0].groupby('Date').count()
ser_quantity.name = 'raw'
ser_quantity.to_excel('Data_Files/Test_Files/Augmentation_Test_' + ser_quantity.name + '.xlsx', merge_cells = False)

### The same count for the augmented column of each option, saved to a separate file named after the option:
for iter_option in dict_cdis_augmented:
    dt_option = dict_cdis_augmented[iter_option]
    ser_quantity = dt_option['Asset_Augmented'].replace({0.0: np.NaN}).groupby(['Date','Reporter']).count()
    ser_quantity = ser_quantity[ser_quantity > 0].groupby('Date').count()
    ser_quantity.name = str(iter_option)
    ser_quantity.to_excel('Data_Files/Test_Files/Augmentation_Test_' + str(iter_option) + '.xlsx', merge_cells = False)

In [39]:
### IMF CDIS: TOTAL DATA AGGREGATION: RESULTS SAVING TO SERIES

### Augmented column of option 2 is taken as the resulting series:
ser_total_augmented = dict_cdis_augmented[2]['Asset_Augmented']
ser_total_augmented.name = 'Total'
### Zeros are saved as NaN; mode 'w' means that the augmented file is created anew by this write:
ser_total_augmented.replace({0.0: np.NaN})\
    .to_hdf(path_or_buf = str_path_imf_cdis_augmented, key = str_key_do_total_imf_cdis_augmented, mode = 'w', format = 'fixed')

In [13]:
### IMF CDIS: TOTAL DATA AGGREGATION: RESULTS CONSOLIDATION TO DATAFRAME AND SAVING

### Three ways side by side: original Asset values (zeros as NaN), unconditional augmentation (-1) and option 2:
df_augmentation_way = pd.concat([df_cdis_to_augment['Asset'].replace({0.0: np.NaN}), 
                                 dict_cdis_augmented[-1]['Asset_Augmented'], 
                                 dict_cdis_augmented[2]['Asset_Augmented']], 
                                axis = 1, keys = ['Assets_Only', 'Unconditional', 'Option_2'], names = 'Augmentation_Way')
### Mode 'w' means that the options file is created anew by this write:
df_augmentation_way.to_hdf(path_or_buf = str_path_imf_cdis_options, key = str_key_total_imf_cdis_options, mode = 'w', format = 'fixed')

In [14]:
### IMF CDIS: EQUITY DATA AGGREGATION: DATASETS LOADING

gc.collect()
### Saved equity outward dataset:
ser_cdis_asset = pd.read_hdf(path_or_buf = str_path_imf_cdis_dataset, key = str_key_do_equity_imf_cdis_dataset)
ser_cdis_asset.name = 'Asset'
### Saved equity inward dataset (index levels were already inverted before saving):
ser_cdis_liability_inv = pd.read_hdf(path_or_buf = str_path_imf_cdis_dataset, key = str_key_di_equity_imf_cdis_dataset)
ser_cdis_liability_inv.name = 'Liability_Inverted'
### Both sources side by side, as float32 rounded to two decimals:
list_cdis_sources = [ser_cdis_asset, ser_cdis_liability_inv]
df_cdis_equity = pd.concat(list_cdis_sources, axis = 1, names = 'Data Source')
df_cdis_equity = df_cdis_equity.astype('float32').round(2)

In [15]:
### IMF CDIS: EQUITY DATA AGGREGATION: DATA QUALITY RATIOS

gc.collect()

### Defining similarity for investors by date
def get_investor_ratio(df_group):
    '''
    Measure the discrepancy between 'Asset' and 'Liability_Inverted' values inside one group of rows.

    Missing values of both columns are counted as zeros. Absolute differences are capped with the biggest
    Asset value of the group, summed, and then divided by the Asset total and by the number of rows.

    Parameters:
        df_group : DataFrame with 'Asset' and 'Liability_Inverted' columns (is not modified)
    Returns:
        float ratio, or NaN when the Asset total of the group is not positive
    '''
    ### Filling is done on a copy: the object passed in by groupby.apply must not be mutated:
    df_filled = df_group.fillna(0.0)
    flo_asset_sum = df_filled['Asset'].sum()
    if (flo_asset_sum > 0.0):
        ser_gap = (df_filled['Asset'] - df_filled['Liability_Inverted']).abs().clip(upper = df_filled['Asset'].max())
        flo_result = ser_gap.sum() / flo_asset_sum / len(df_filled)
    else:
        flo_result = np.nan
    return flo_result
### Defining similarity for borrowers by date
def get_borrower_ratio(df_group):
    '''
    Measure the discrepancy between 'Asset' and 'Liability_Inverted' values inside one group of rows.

    Missing values of both columns are counted as zeros. Absolute differences are capped with the biggest
    Liability_Inverted value of the group, summed, and then divided by the Liability_Inverted total
    and by the number of rows.

    Parameters:
        df_group : DataFrame with 'Asset' and 'Liability_Inverted' columns (is not modified)
    Returns:
        float ratio, or NaN when the Liability_Inverted total of the group is not positive
    '''
    ### Filling is done on a copy: the object passed in by groupby.apply must not be mutated:
    df_filled = df_group.fillna(0.0)
    flo_liability_sum = df_filled['Liability_Inverted'].sum()
    if (flo_liability_sum > 0.0):
        ser_gap = (df_filled['Asset'] - df_filled['Liability_Inverted']).abs().clip(upper = df_filled['Liability_Inverted'].max())
        flo_result = ser_gap.sum() / flo_liability_sum / len(df_filled)
    else:
        flo_result = np.nan
    return flo_result
### Similarity values calculation:
### One ratio for each reporter on each date:
gb_cdis_by_investor = df_cdis_equity.groupby(['Date', 'Reporter'])
ser_investor_ratio = gb_cdis_by_investor.apply(get_investor_ratio)
ser_investor_ratio.name = 'Investor_Ratio'
### One ratio for each partner on each date:
gb_cdis_by_borrower = df_cdis_equity.groupby(['Date', 'Partner'])
ser_borrower_ratio = gb_cdis_by_borrower.apply(get_borrower_ratio)
ser_borrower_ratio.name = 'Borrower_Ratio'

In [16]:
### IMF CDIS: EQUITY DATA AGGREGATION: SIMILARITY TEST

### Borrower ratio: the smallest and the biggest values with the (Date, Partner) keys where they are reached:
print(round(ser_borrower_ratio.min(), 4), '/', ser_borrower_ratio.idxmin())
print(round(ser_borrower_ratio.max(), 4), '/', ser_borrower_ratio.idxmax())

### Switched off manual checks of the source rows for particular (Date, Partner) keys:
#display(df_cdis_equity.loc[('2011-12-30', All, 'BE'), :].dropna())
#display(df_cdis_equity.loc[('2018-12-31', All, 'CW'), :].dropna())

0.0016 / (Timestamp('2011-12-30 00:00:00'), 'BE')
0.1314 / (Timestamp('2018-12-31 00:00:00'), 'CW')


In [17]:
### IMF CDIS: EQUITY DATA AGGREGATION: SIMILARITY TEST

### Investor ratio: the smallest and the biggest values with the (Date, Reporter) keys where they are reached:
print(round(ser_investor_ratio.min(), 4), '/', ser_investor_ratio.idxmin())
print(round(ser_investor_ratio.max(), 4), '/', ser_investor_ratio.idxmax())

### Switched off manual checks of the source rows for particular (Date, Reporter) keys:
#display(df_cdis_equity.loc[('2014-12-31', 'SE', All), :].dropna())
#display(df_cdis_equity.loc[('2016-12-30', 'MT', All), :].dropna())

0.0007 / (Timestamp('2012-12-31 00:00:00'), 'FR')
0.2911 / (Timestamp('2016-12-30 00:00:00'), 'MT')


In [18]:
### IMF CDIS: EQUITY DATA AGGREGATION: ADDING RATIOS

### Both ratio series are attached to the bilateral table as columns
### (presumably matched on the index level names they share with the table - verify on the result):
df_cdis_to_augment = df_cdis_equity.join(ser_investor_ratio).join(ser_borrower_ratio)
### Empty column to be filled by the augmentation step:
df_cdis_to_augment['Asset_Augmented'] = np.NaN # -999 # 
#df_cpis_augmented['Verified'] = False
### Fixing the order of index levels:
df_cdis_to_augment = df_cdis_to_augment.reorder_levels(['Date', 'Reporter', 'Partner'])

In [19]:
### IMF CDIS: EQUITY DATA AGGREGATION: CONDITIONAL REPLACING

gc.collect()
def augment_by_date(df_date, int_option = 0):
    '''
    Fill the 'Asset_Augmented' column of df_date (a single Date group in this notebook)
    from the 'Asset' and 'Liability_Inverted' columns, following the rule selected by int_option:
       -1 : Replace NaN or zero Asset values unconditionally
        0 : Replace NaN Asset values when Investor's Ratio > Borrower's Ratio
        1 : Replace NaN or zero Asset values when Investor's Ratio > Borrower's Ratio
        2 : Replace any Asset values when Investor's Ratio > Borrower's Ratio
            (any int_option value other than -1, 0, 1 is handled this way)
    Missing ratios handling:
        options 0 & 1 : NaN Investor Ratio always allows the replacement; NaN Borrower Ratio (with known Investor Ratio) forbids it
        option 2      : NaN Investor Ratio is compared as 999.0 and NaN Borrower Ratio as 1000.0
    Parameters:
        df_date    : DataFrame with 'Asset', 'Liability_Inverted', 'Investor_Ratio', 'Borrower_Ratio' columns
        int_option : augmentation rule (see above)
    Returns:
        Copy of df_date with 'Asset_Augmented' filled. Zero Asset values (options -1, 1, 2) and zero
        Liability_Inverted values (option 2) are returned as NaN. Ratio columns are returned unchanged.
        The df_date itself is never modified.
    '''
    ### Working on a copy, so the caller's data is never modified:
    df_result = df_date.copy()
    if (int_option == -1):
        ### Replacing zero Asset values with NaN (Liability values are left as they are):
        df_result.loc[df_result['Asset'] == 0.0, 'Asset'] = np.nan
        df_result['Asset_Augmented'] = df_result['Asset'].combine_first(df_result['Liability_Inverted'])
    elif (int_option in (0, 1)):
        if (int_option == 1):
            ### Replacing zero Asset values with NaN:
            df_result.loc[df_result['Asset'] == 0.0, 'Asset'] = np.nan
        ser_asset_known = df_result['Asset'].notna()
        ### Liability value is acceptable if Investor Ratio is NaN or bigger than Borrower Ratio
        ### (comparison with NaN is always False, so no additional notna() checks are needed):
        ser_liability_allowed = df_result['Investor_Ratio'].isna() | (df_result['Investor_Ratio'] > df_result['Borrower_Ratio'])
        ser_to_replace = ~ser_asset_known & ser_liability_allowed
        ### Fill resulting column with not NaN Asset values:
        df_result.loc[ser_asset_known, 'Asset_Augmented'] = df_result.loc[ser_asset_known, 'Asset'].values
        ### Fill resulting column with Liability values for the rest of acceptable rows:
        df_result.loc[ser_to_replace, 'Asset_Augmented'] = df_result.loc[ser_to_replace, 'Liability_Inverted'].values
    else:
        ### Replacing zero Asset & Liability values with NaN:
        df_result.loc[df_result['Asset'] == 0.0, 'Asset'] = np.nan
        df_result.loc[df_result['Liability_Inverted'] == 0.0, 'Liability_Inverted'] = np.nan
        ### Ratios preparation (on local series, so the placeholders do not get into the returned columns):
        ser_investor_filled = df_result['Investor_Ratio'].fillna(999.0)
        ser_borrower_filled = df_result['Borrower_Ratio'].fillna(1000.0)
        ### Ratios comparison:
        ser_use_liability = ser_investor_filled > ser_borrower_filled
        df_result['Asset_Augmented'] = np.where(ser_use_liability.values,
                                                df_result['Liability_Inverted'].values,
                                                df_result['Asset'].values)
    return df_result

### Every augmentation option is applied date by date to the same source table.
### group_keys = False keeps the result indexed exactly like the source table: without it newer pandas versions
### prepend the grouping key as an additional 'Date' index level, while older versions do not.
dict_cdis_augmented = {iter_option: df_cdis_to_augment.groupby('Date', group_keys = False)\
                                                      .apply(augment_by_date, int_option = iter_option)
                       for iter_option in [-1, 0, 1, 2]}

In [60]:
### IMF CDIS: EQUITY DATA AGGREGATION: RESULTS TESTING

def count_reporters_by_date(ser_values, str_name):
    '''
    Count, for each Date, the Reporters having at least one non-zero, non-NaN value.
    Parameters:
        ser_values : Series whose index contains 'Date' and 'Reporter' levels
        str_name   : name assigned to the resulting Series
    Returns:
        Series indexed by Date with the number of such Reporters
    '''
    ### Zero values are treated as missing ones (np.nan is used, since the np.NaN alias was removed in NumPy 2.0):
    ser_pairs_number = ser_values.replace({0.0: np.nan}).groupby(['Date', 'Reporter']).count()
    ser_reporters_number = ser_pairs_number[ser_pairs_number > 0].groupby('Date').count()
    ser_reporters_number.name = str_name
    return ser_reporters_number

### Raw Asset values coverage:
ser_quantity = count_reporters_by_date(df_cdis_to_augment['Asset'], 'raw')
ser_quantity.to_excel('Data_Files/Test_Files/Augmentation_Test_' + ser_quantity.name + '.xlsx', merge_cells = False)
### Coverage of each augmentation option:
for iter_option in dict_cdis_augmented:
    ser_quantity = count_reporters_by_date(dict_cdis_augmented[iter_option]['Asset_Augmented'], str(iter_option))
    ser_quantity.to_excel('Data_Files/Test_Files/Augmentation_Test_' + str(iter_option) + '.xlsx', merge_cells = False)

In [61]:
### IMF CDIS: EQUITY DATA AGGREGATION: RESULTS SAVING (SERIES)

### Option 2 result is taken as the final series.
### rename() returns a new Series, so the 'Asset_Augmented' column object kept by the source table is not renamed in place:
ser_equity_augmented = dict_cdis_augmented[2]['Asset_Augmented'].rename('Total')
### Zero values are saved as missing ones (np.nan is used, since the np.NaN alias was removed in NumPy 2.0):
ser_equity_augmented.replace({0.0: np.nan})\
    .to_hdf(path_or_buf = str_path_imf_cdis_augmented, key = str_key_do_equity_imf_cdis_augmented, mode = 'a', format = 'fixed')

In [20]:
### IMF CDIS: EQUITY DATA AGGREGATION: RESULTS CONSOLIDATION TO DATAFRAME AND SAVING

### Side by side columns: raw Asset values (zeros treated as missing), unconditional augmentation and option 2 augmentation.
### np.nan is used instead of np.NaN, since the capitalized alias was removed in NumPy 2.0:
ser_assets_only = df_cdis_to_augment['Asset'].replace({0.0: np.nan})
df_augmentation_way = pd.concat([ser_assets_only,
                                 dict_cdis_augmented[-1]['Asset_Augmented'],
                                 dict_cdis_augmented[2]['Asset_Augmented']],
                                axis = 1, keys = ['Assets_Only', 'Unconditional', 'Option_2'], names = 'Augmentation_Way')
df_augmentation_way.to_hdf(path_or_buf = str_path_imf_cdis_options, key = str_key_equity_imf_cdis_options, mode = 'a', format = 'fixed')

In [62]:
### DIRECT INVESTMENT DATA AGGREGATION: DATASETS LOADING (SERIES)

### All four augmented series are read in one pass over (file path, key) pairs:
(ser_oecd_total, ser_oecd_equity, ser_cdis_total, ser_cdis_equity) = [
    pd.read_hdf(path_or_buf = iter_path, key = iter_key)
    for (iter_path, iter_key) in [(str_path_oecd_fdi_augmented, str_key_do_total_oecd_fdi_augmented),
                                  (str_path_oecd_fdi_augmented, str_key_do_equity_oecd_fdi_augmented),
                                  (str_path_imf_cdis_augmented, str_key_do_total_imf_cdis_augmented),
                                  (str_path_imf_cdis_augmented, str_key_do_equity_imf_cdis_augmented)]
]

In [63]:
### DIRECT INVESTMENT DATA AGGREGATION: DATASETS COMBINING (SERIES)

### CDIS values have the priority, OECD values are used where CDIS ones are missing:
ser_total_combined = ser_cdis_total.combine_first(ser_oecd_total)
ser_equity_combined = ser_cdis_equity.combine_first(ser_oecd_equity)
### Each combined series is written to its own file under the shared key:
for (iter_series, iter_file) in [(ser_total_combined, str_path_direct_total_augmented),
                                 (ser_equity_combined, str_path_direct_equity_augmented)]:
    iter_series.to_hdf(path_or_buf = iter_file, key = str_key_direct_augmented, mode = 'w', format = 'fixed')

In [21]:
### DIRECT INVESTMENT DATA AGGREGATION: DATASETS LOADING (DATAFRAMES)

### All four options tables are read in one pass over (file path, key) pairs:
(df_total_oecd_options, df_total_cdis_options, df_equity_oecd_options, df_equity_cdis_options) = [
    pd.read_hdf(path_or_buf = iter_path, key = iter_key)
    for (iter_path, iter_key) in [(str_path_oecd_fdi_options, str_key_total_oecd_fdi_options),
                                  (str_path_imf_cdis_options, str_key_total_imf_cdis_options),
                                  (str_path_oecd_fdi_options, str_key_equity_oecd_fdi_options),
                                  (str_path_imf_cdis_options, str_key_equity_imf_cdis_options)]
]

In [22]:
### DIRECT INVESTMENT DATA AGGREGATION: DATASETS COMBINING (DATAFRAMES)

### CDIS values have the priority, OECD values are used where CDIS ones are missing; zero values are treated as missing ones.
### np.nan is used instead of np.NaN, since the capitalized alias was removed in NumPy 2.0:
df_direct_total_options = df_total_cdis_options.combine_first(df_total_oecd_options).replace({0.0: np.nan})
df_direct_total_options.to_hdf(path_or_buf = str_path_total_direct_options, key = str_key_total_direct_options, mode = 'w', format = 'fixed')
df_direct_equity_options = df_equity_cdis_options.combine_first(df_equity_oecd_options).replace({0.0: np.nan})
df_direct_equity_options.to_hdf(path_or_buf = str_path_equity_direct_options, key = str_key_equity_direct_options, mode = 'w', format = 'fixed')