In [1]:
### IMF CPIS: BILATERAL EQUITY & DEBT INVESTMENT POSITIONS

In [2]:
### RUN EVERY TIME: INITIALIZATION

import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', -1) ### To display long strings
import math
import requests
import json ### To correct JSON structure before unpacking
import gc
import os
import datetime
import time
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mticker
import seaborn as sns
%load_ext line_profiler

In [3]:
### RUN EVERY TIME: VERSION CONTROL

from platform import python_version
### Printing the library and interpreter versions used for this run (the captured output follows the cell):
print('pandas version: ', pd.__version__)
print('numpy version: ', np.__version__)
print('python version: ', python_version())

pandas version:  0.25.3
numpy version:  1.17.2
python version:  3.7.4


In [4]:
### RUN EVERY TIME: MAIN CONSTANTS

### MultiIndex level slice constant (selects every label of a level inside .loc[...]):
All = slice(None)
### Universe path (MS Excel workbook read by ison_membership_converting):
str_path_universe = 'Data_Files/Source_Files/acadian_universe.xlsx'
### Detailed IMF CPIS dataset (one HDF5 file, separate keys for assets and liabilities):
str_path_imf_cpis_detailed_raw = 'Data_Files/Source_Files/cpis_detailed_raw.h5'
str_key_imf_cpis_assets = 'cpis_detailed_assets'
str_key_imf_cpis_liabilities = 'cpis_detailed_liabilities'
### Augmented total positions and the table comparing augmentation options:
str_path_imf_cpis_total_augmented = 'Data_Files/Source_Files/cpis_total_augmented.h5'
str_key_imf_cpis_total_augmented = 'cpis_total_augmented'
str_path_total_imf_cpis_options = 'Data_Files/Source_Files/cpis_total_options.h5'
str_key_total_imf_cpis_options = 'cpis_total_options'
### Filtered IMF CPIS dataset:
str_path_imf_cpis_filtered = 'Data_Files/Source_Files/cpis_filtered.h5'
str_key_imf_cpis_filtered_asset = 'cpis_filtered_asset'
str_key_imf_cpis_filtered_liability = 'cpis_filtered_liability'
str_path_imf_cpis_filtered_augmented = 'Data_Files/Source_Files/cpis_filtered_augmented.h5'
str_key_imf_cpis_filtered_augmented = 'cpis_filtered_augmented'
str_path_filtered_imf_cpis_options = 'Data_Files/Source_Files/cpis_filtered_options.h5'
str_key_filtered_imf_cpis_options = 'cpis_filtered_options'
### Direct Investment Options:
str_path_total_direct_options = 'Data_Files/Source_Files/direct_total_options.h5'
str_key_total_direct_options = 'direct_total_options'
str_path_equity_direct_options = 'Data_Files/Source_Files/direct_equity_options.h5'
str_key_equity_direct_options = 'direct_equity_options'
### Full Investment Options:
str_path_total_investment_options = 'Data_Files/Source_Files/investment_total_options.h5'
str_key_total_investment_options = 'investment_total_options'
str_path_filtered_investment_options = 'Data_Files/Source_Files/investment_filtered_options.h5'
str_key_filtered_investment_options = 'investment_filtered_options'
### Technical Constants:
### Last date of the universe history (string form is reused for .loc date selection):
str_date_end = '2022-10-31'
date_start = pd.Timestamp('1989-12-29')
date_end = pd.Timestamp(str_date_end)
### NOTE(review): presumably the first date of ISON universe membership - confirm with the universe source:
date_ison = pd.Timestamp('1994-12-31')

In [5]:
### DEFINING COUNTRY CODES EXTRACTOR

def get_country_codes(use_local_copy = False):
    '''
    Build the ISO country codes lookup table.

    Parameters:
        use_local_copy : when truthy, the table is parsed from the saved page
                         'Data_Files/Source_Files/countrycode.html' instead of https://countrycode.org/

    Returns:
        DataFrame indexed by upper-cased country name (sorted before upper-casing)
        with the columns 'ISO SHORT' and 'ISO LONG'.
    '''
    ### Source selection (the local copy is a fallback for the case when the URL is unavailable):
    url_country_code = 'Data_Files/Source_Files/countrycode.html' if use_local_copy else 'https://countrycode.org/'
    ### First table of the page with the 'COUNTRY' column as index:
    df_full_codes = pd.read_html(url_country_code, index_col = 'COUNTRY')[0]
    ### 'ISO CODES' column keeps both codes in one string separated by ' / ':
    list_iso_columns = ['ISO SHORT', 'ISO LONG']
    df_full_codes[list_iso_columns] = df_full_codes['ISO CODES'].str.split(' / ', expand = True)
    ### Results output:
    df_result = df_full_codes[list_iso_columns].sort_index()
    df_result.index = df_result.index.str.upper()
    return df_result

In [6]:
### DEFINING EXTRACTION UNIVERSE DATA FROM MS EXCEL SOURCE (TO BE IGNORED IN PRODUCT CODE)

def ison_membership_converting(str_path_universe, date_end, bool_daily = False, int_backfill_months = 0):
    '''
    Convert the universe membership history from the MS Excel source to a (Date, Country) series of market labels.

    Parameters:
        str_path_universe   : path to the workbook; the sheet 'Switchers' is read with its first two columns as index
        date_end            : last date of the business-month-end range built for every country
        bool_daily          : when truthy, the monthly result is additionally resampled to business days with forward filling
        int_backfill_months : number of business-month-ends to prepend for the countries that belong to a region
                              on the first date this region appears (0 disables the backfilling)

    Returns:
        Series named 'Market' with the index levels ['Date', 'Country'] and the values 'DM' / 'EM' / 'FM' / NaN.
    '''
    ### Defining business-month-end reindexation on country level:
    def country_modify(ser_raw_country, date_end):
        ### Dropping the country level, taking the last value per month and moving it to the business month end:
        ser_res_country = ser_raw_country.droplevel(0).resample('MS').last().resample('BM').last()
        ### Extending the history from the first country date up to date_end with forward filling:
        range_country = pd.date_range(ser_res_country.index[0], date_end, freq = 'BM')
        return ser_res_country.reindex(range_country).ffill()
    ### Markets encoding table (source numeric code -> market label; 0 stands for the missing values filled below):
    dict_markets = {50 : 'DM', 57 : 'EM', 504 : 'FM', 0: np.NaN}     
    ### Loading source file:
    df_raw_universe = pd.read_excel(engine = 'openpyxl', io = str_path_universe, sheet_name = 'Switchers', header = 0, parse_dates = True, index_col = [0, 1],
                                 na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', 
                                             '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null'], keep_default_na = False)
    ### Converting source file:
    df_raw_universe.index.names = ['Country', 'Date']
    ser_raw_universe = df_raw_universe['Region']
    ### Missing region codes are replaced with 0, which is translated back to NaN by dict_markets later:
    ser_raw_universe.fillna(0, inplace = True)
    ser_raw_universe.name = 'Market'
    ### By country reindexation and translation:
    ser_res_universe = ser_raw_universe.groupby('Country').apply(country_modify, date_end)
    ser_res_universe.index.names = ['Country', 'Date']
    ser_res_universe = ser_res_universe.replace(dict_markets).reorder_levels([1, 0]).sort_index() 
    ### Expanding membership for primary regions members by backfilling:
    if int_backfill_months:
        ### List of regions:
        list_region = list(ser_res_universe.dropna().unique())
        ### Initialising of collection of series with backfilled data for each region:
        list_ison_backfill = []
        ### Regions looping:
        for iter_region in list_region:
            ### Defining start of region date:
            date_first_valid = ser_res_universe.loc[ser_res_universe == iter_region].first_valid_index()[0]
            ### Creating dates index to backfilling (the region start date itself is excluded by [: -1]):
            idx_date_backfill = pd.date_range(end = date_first_valid, periods = int_backfill_months + 1, freq = 'BM')[: -1]
            ### Creating primary countries index to backfilling:            
            idx_region_backfill = ser_res_universe.loc[ser_res_universe == iter_region].loc[date_first_valid, All].index.get_level_values('Country')
            ### Creating full index:
            idx_ison_backfill = pd.MultiIndex.from_product([idx_date_backfill, idx_region_backfill])
            ### Series with backfilled data:
            list_ison_backfill.append(pd.Series(iter_region, index = idx_ison_backfill))
        ### Combination of backfilled series and original ISON data (original values have priority):    
        ser_res_universe = ser_res_universe.combine_first(pd.concat(list_ison_backfill, axis = 0)).sort_index()  
        ser_res_universe.index.names = ['Date', 'Country']
    ### Converting to daily frequency:
    if bool_daily:
        ser_res_universe = ser_res_universe.reset_index('Country').groupby('Country').resample('B').ffill()['Market'].swaplevel().sort_index()    
    ### Results output:
    ser_res_universe.name = 'Market'
    return ser_res_universe

In [7]:
### RUN EVERY TIME: COMMON DATA EXTRACTION STEPS

### World Country Codes:
df_country_codes = get_country_codes()
### ISON membership history:
ser_ison_membership = ison_membership_converting(str_path_universe, pd.to_datetime(str_date_end))
### ISON LONG IDs list (long ISO codes of the countries present in the membership history):
idx_ison_countries = ser_ison_membership.index.get_level_values('Country').unique()
list_ison_long = df_country_codes.loc[df_country_codes['ISO SHORT'].isin(idx_ison_countries), 'ISO LONG'].tolist()
### ISON current status:
ser_ison_status = ser_ison_membership.loc[str_date_end].droplevel('Date')
### ISON stats (universe size and number of countries in each market on the last date):
int_ison_number = len(list_ison_long)
list_regions = ['DM', 'EM', 'FM']
dict_ison_len = {'Full Universe': int_ison_number}
for iter_region in list_regions:
    dict_ison_len[iter_region] = int((ser_ison_status == iter_region).sum())
ser_market_len = pd.Series(dict_ison_len).rename_axis('Market')

In [8]:
### IMF CPIS: GENERAL DATA PREPARATION

### Constants:
### MultiIndex level slice constant (same value as in the main constants cell):
All = slice(None)
### Browser-like headers that are attached to the requests session below:
dict_request_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
### IMF Data Service base URL and the method suffixes appended to it:
str_imf_base_url = 'http://dataservices.imf.org/REST/SDMX_JSON.svc/'
str_imf_dataflow_add = 'DataFlow'
str_imf_datastructure_add = 'DataStructure/'
str_imf_codelist_add = 'CodeList/'
str_imf_dataset_add = 'CompactData/'
### Pause (in seconds) made around the requests to the service:
int_seconds_to_sleep = 1
### NOTE(review): not used in the visible cells; presumably a cap of countries per request - confirm:
int_imf_country_limit = 30

In [9]:
### IMF CPIS: REQUESTS SESSION INITIALIZING

### One session object is shared by all the IMF requests in the cells below:
request_session = requests.Session()
### For avoiding data request errors from IMF Data Service:
request_session.headers.update(dict_request_headers)

In [10]:
### IMF CPIS: DATAFLOW SEARCHING

### Full list of the dataflows published by the service:
str_imf_dataflow_url = str_imf_base_url + str_imf_dataflow_add
obj_imf_dataflow_list = request_session.get(str_imf_dataflow_url).json()
df_imf_dataflow = pd.DataFrame(obj_imf_dataflow_list['Structure']['Dataflows']['Dataflow'])
### Dataflow description is the '#text' item of the nested 'Name' field:
arr_imf_dataflow_text = df_imf_dataflow['Name'].apply(pd.Series)['#text'].values
df_imf_dataflow = df_imf_dataflow.assign(Description = arr_imf_dataflow_text)[['@id', 'Description']]
ser_imf_dataflow = df_imf_dataflow.set_index('@id', drop = True).squeeze()
### Searching DataFlow code for further requests (first dataflow with 'CPIS' in its description, 'DS-' prefix removed):
ser_imf_cpis_found = ser_imf_dataflow[ser_imf_dataflow.str.contains('CPIS')]
str_imf_cpis_id = ser_imf_cpis_found.index[0].replace('DS-', '')
print(str_imf_cpis_id)

CPIS


In [11]:
### IMF CPIS: DATASTRUCTURE SEARCHING

### Structure description of the CPIS dataflow:
str_imf_cpis_structure_url = str_imf_base_url + str_imf_datastructure_add + str_imf_cpis_id
obj_imf_cpis_structure = request_session.get(str_imf_cpis_structure_url).json()
list_imf_cpis_dimensions = obj_imf_cpis_structure['Structure']['KeyFamilies']['KeyFamily']['Components']['Dimension']
### Receiving DataFlow parameters and code lists for each of them:
df_imf_cpis_params = pd.DataFrame(list_imf_cpis_dimensions)[['@conceptRef', '@codelist', '@isFrequencyDimension']]
print(df_imf_cpis_params)

          @conceptRef          @codelist @isFrequencyDimension
0  FREQ                CL_FREQ            true                
1  REF_AREA            CL_AREA_CPIS       NaN                 
2  INDICATOR           CL_INDICATOR_CPIS  NaN                 
3  REF_SECTOR          CL_SECTOR_CPIS     NaN                 
4  COUNTERPART_SECTOR  CL_SECTOR_CPIS     NaN                 
5  COUNTERPART_AREA    CL_AREA_CPIS       NaN                 


In [12]:
### IMF CPIS: CODES DESCRIPTIONS LOADING

### Only the code lists at positions 2 and 3 of the parameters table are loaded (INDICATOR and REF_SECTOR in the table printed above):
for int_counter, str_param_code in enumerate(df_imf_cpis_params['@codelist']):
    if (int_counter not in (2, 3)):
        continue
    time.sleep(int_seconds_to_sleep)
    obj_imf_cpis_param = request_session.get(str_imf_base_url + str_imf_codelist_add + str_param_code).json()
    df_imf_cpis_param = pd.DataFrame(obj_imf_cpis_param['Structure']['CodeLists']['CodeList']['Code'])
    ### Receiving values for each code list:
    df_imf_cpis_param = df_imf_cpis_param.assign(Text = df_imf_cpis_param['Description'].apply(pd.Series)['#text'].values)[['@value', 'Text']]
#    print(int_counter, ':', df_imf_cpis_params.iloc[int_counter, All]['@conceptRef'], ':', str_param_code, ':\n', df_imf_cpis_param.head(20))
    if (int_counter == 2):
        ### Indicators: only the first ten codes of the list are kept:
        df_imf_cpis_head = df_imf_cpis_param[: 10]
        dict_indicator = dict(zip(df_imf_cpis_head['@value'], df_imf_cpis_head['Text']))
    else:
        ### Sectors: full dictionary plus the subset of sectors requested from the service:
        dict_sector = dict(zip(df_imf_cpis_param['@value'], df_imf_cpis_param['Text']))
        list_sector_filtered = ['T', 'CB', 'GG', 'HH', 'NP']

### Sorted country codes taken from the second level of the membership index:
list_ison_countries = sorted(map(str, ser_ison_membership.index.get_level_values(1).unique()))
str_cpis_freq = 'A' # 'B' # 
#str_cpis_asset_indicator = 'I_A_T_T_T_BP6_USD' 
#str_cpis_liability_indicator = 'I_L_T_T_T_BP6_USD'
#str_cpis_ref_sector = 'T'
#str_cpis_cp_sector = 'T'
# 0: FREQ == 'B' # Semi-annual frequency - they don't have Quarterly or Monthly frequency data
# 1: REF_AREA == '??' # Country
# 2: INDICATOR  == 'I_A_T_T_T_BP6_USD' # Assets, Total Investment, BPM6, US Dollars & I_L_T_T_T_BP6_USD    Liabilities, Total Investment, BPM6, US Dollars
# 3: REF_SECTOR == 'T' # Total Holdings (all sectors)
# 4: COUNTERPART_SECTOR  == 'T' # Total Holdings (all sectors)
# 5: COUNTERPART_AREA == '??' # Country

In [None]:
### IMF CPIS : REPORTED PORTFOLIO INVESTMENT ASSETS DATASET RETRIEVING

gc.collect()
### List of bilateral dataframes for future concatenation:
list_cpis_bilateral = [] 
### Beginning of request URL:
str_cpis_const_url = str_imf_base_url + str_imf_dataset_add + str_imf_cpis_id + '/' 
### Sector filters are the same for every request, so they are built once:
str_reporter_sector = '+'.join(list_sector_filtered)
str_partner_sector = '+'.join(list_sector_filtered)
### Only asset indicators are requested in this cell (third character of the indicator code is 'A'):
list_asset_indicators = [iter_code for iter_code in dict_indicator if (iter_code[2] == 'A')]
### Looping over reporter:
for iter_investor in list_ison_countries:
#for iter_investor in ['US', 'BD']:  
    ### Looping over indicator:
    for iter_indicator in list_asset_indicators:
        ### Request key: frequency . reporter . indicator . reporter sectors . partner sectors (counterpart area is omitted):
        str_cpis_full_url = str_cpis_const_url + '.'.join([str_cpis_freq, iter_investor, iter_indicator, str_reporter_sector, str_partner_sector])
        obj_cpis_set = request_session.get(str_cpis_full_url)
        ### Data reading as JSON. Status key is renamed to value key before parsing, so a status mark
        ### ('C' or '-', cleared to NaN below) lands in the 'Value' column:
        dict_cpis_set = json.loads(obj_cpis_set.text.replace('@OBS_STATUS', '@OBS_VALUE'))
        dict_cpis_dataset = dict_cpis_set['CompactData']['DataSet']
        ### Converting each bilateral dataset to dataframe and its munging:
        if ('Series' in dict_cpis_dataset):
            ### Single series comes as a dictionary, several series come as a list:
            list_series = dict_cpis_dataset['Series']
            if not isinstance(list_series, list):
                list_series = [list_series]
            for dict_cpis_pair in list_series:
                ### Series without observations carries no data (it used to stop the whole loading with KeyError):
                if not dict_cpis_pair.get('Obs'):
                    continue
                ### Single observation comes as a dictionary, several observations come as a list:
                list_bilateral = dict_cpis_pair['Obs']
                if not isinstance(list_bilateral, list):
                    list_bilateral = [list_bilateral]
                df_cpis_bilateral = pd.DataFrame(list_bilateral)[['@TIME_PERIOD', '@OBS_VALUE']]
                df_cpis_bilateral.columns = ['Date', 'Value']
                ### Series attributes are repeated for every observation row:
                df_cpis_bilateral = df_cpis_bilateral.assign(Indicator = dict_cpis_pair['@INDICATOR'],
                                                             Reporter_Sector = dict_cpis_pair['@REF_SECTOR'],
                                                             Partner_Sector = dict_cpis_pair['@COUNTERPART_SECTOR'],
                                                             Reporter_ID = dict_cpis_pair['@REF_AREA'],
                                                             Partner_ID = dict_cpis_pair['@COUNTERPART_AREA'])
                list_cpis_bilateral.append(df_cpis_bilateral)
        else:
            print('No data in response of the next request:\n', str_cpis_full_url)
        ### Pause between the requests to the service:
        time.sleep(int_seconds_to_sleep)
#        break
    print(iter_investor, ': loading completed')
#    break
### Bilateral datasets aggregating:
df_cpis_raw = pd.concat(list_cpis_bilateral, axis = 0, ignore_index = True)
### Annual period label is moved to the last business day of the year:
df_cpis_raw['Date'] = pd.to_datetime(df_cpis_raw['Date']) + pd.offsets.BYearEnd()
### Status marks are not numbers and are cleared before the conversion to float:
df_cpis_raw.loc[df_cpis_raw['Value'].isin(['C', '-']), 'Value'] = np.nan
### Dropping self-investment rows and partners that are not in the country codes table:
df_cpis_raw = df_cpis_raw[df_cpis_raw['Reporter_ID'] != df_cpis_raw['Partner_ID']]
df_cpis_raw = df_cpis_raw[df_cpis_raw['Partner_ID'].isin(df_country_codes['ISO SHORT'].values)]
print('Unique partners number:', len(df_cpis_raw['Partner_ID'].unique()))
df_cpis_raw = df_cpis_raw.rename({'Reporter_ID': 'Reporter', 'Partner_ID': 'Partner'}, axis = 1)
df_cpis_raw = df_cpis_raw.astype({'Indicator': 'str', 'Reporter_Sector': 'str', 'Partner_Sector': 'str', 'Reporter': 'str', 'Partner': 'str', 
                                  'Value': 'float32'})    
### Data saving (mode 'w' recreates the file, so the liabilities key has to be written after this cell):
ser_cpis_asset = df_cpis_raw.set_index(['Date', 'Indicator', 'Reporter_Sector', 'Partner_Sector', 'Reporter', 'Partner'])['Value'].sort_index().astype('float32')
del df_cpis_raw
gc.collect()
ser_cpis_asset.to_hdf(path_or_buf = str_path_imf_cpis_detailed_raw, key = str_key_imf_cpis_assets, mode = 'w', format = 'fixed')

In [None]:
### IMF CPIS : REPORTED PORTFOLIO INVESTMENT LIABILITIES DATASET RETRIEVING

gc.collect()
### List of bilateral dataframes for future concatenation:
list_cpis_bilateral = [] 
### Beginning of request URL:
str_cpis_const_url = str_imf_base_url + str_imf_dataset_add + str_imf_cpis_id + '/' 
### Sector filters are the same for every request, so they are built once:
str_reporter_sector = '+'.join(list_sector_filtered)
str_partner_sector = '+'.join(list_sector_filtered)
### Only liability indicators are requested in this cell (third character of the indicator code is 'L'):
list_liability_indicators = [iter_code for iter_code in dict_indicator if (iter_code[2] == 'L')]
### Looping over investor (it is placed to the counterpart area position of the request key):
for iter_investor in list_ison_countries:
#for iter_investor in ['US', 'BD']:  
    ### Looping over indicator:
    for iter_indicator in list_liability_indicators:
        ### Request key: frequency . (any reporter) . indicator . reporter sectors . partner sectors . counterpart area:
        str_cpis_full_url = str_cpis_const_url + '.'.join([str_cpis_freq, '', iter_indicator, str_reporter_sector, str_partner_sector, iter_investor])
        obj_cpis_set = request_session.get(str_cpis_full_url)
        ### Data reading as JSON. Status key is renamed to value key before parsing, so a status mark
        ### ('C' or '-', cleared to NaN below) lands in the 'Value' column:
        dict_cpis_set = json.loads(obj_cpis_set.text.replace('@OBS_STATUS', '@OBS_VALUE'))
        dict_cpis_dataset = dict_cpis_set['CompactData']['DataSet']
        ### Converting each bilateral dataset to dataframe and its munging:
        if ('Series' in dict_cpis_dataset):
            ### Single series comes as a dictionary, several series come as a list:
            list_series = dict_cpis_dataset['Series']
            if not isinstance(list_series, list):
                list_series = [list_series]
            for dict_cpis_pair in list_series:
                ### Series without observations carries no data (it used to stop the whole loading with KeyError):
                if not dict_cpis_pair.get('Obs'):
                    continue
                ### Single observation comes as a dictionary, several observations come as a list:
                list_bilateral = dict_cpis_pair['Obs']
                if not isinstance(list_bilateral, list):
                    list_bilateral = [list_bilateral]
                df_cpis_bilateral = pd.DataFrame(list_bilateral)[['@TIME_PERIOD', '@OBS_VALUE']]
                df_cpis_bilateral.columns = ['Date', 'Value']
                ### Series attributes are repeated for every observation row:
                df_cpis_bilateral = df_cpis_bilateral.assign(Indicator = dict_cpis_pair['@INDICATOR'],
                                                             Reporter_S = dict_cpis_pair['@REF_SECTOR'],
                                                             Partner_S = dict_cpis_pair['@COUNTERPART_SECTOR'],
                                                             Reporter_ID = dict_cpis_pair['@REF_AREA'],
                                                             Partner_ID = dict_cpis_pair['@COUNTERPART_AREA'])
                list_cpis_bilateral.append(df_cpis_bilateral)
        else:
            print('No data in response of the next request:\n', str_cpis_full_url)
        ### Pause between the requests to the service:
        time.sleep(int_seconds_to_sleep)
#        break
    print(iter_investor, ': loading completed')
#    break
### Bilateral datasets aggregating:
df_cpis_raw = pd.concat(list_cpis_bilateral, axis = 0, ignore_index = True)
### Annual period label is moved to the last business day of the year:
df_cpis_raw['Date'] = pd.to_datetime(df_cpis_raw['Date']) + pd.offsets.BYearEnd()
### Status marks are not numbers and are cleared before the conversion to float:
df_cpis_raw.loc[df_cpis_raw['Value'].isin(['C', '-']), 'Value'] = np.nan
### Dropping self-investment rows and liability reporters that are not in the country codes table:
df_cpis_raw = df_cpis_raw[df_cpis_raw['Reporter_ID'] != df_cpis_raw['Partner_ID']]
df_cpis_raw = df_cpis_raw[df_cpis_raw['Reporter_ID'].isin(df_country_codes['ISO SHORT'].values)]
print('Unique reporters number:', len(df_cpis_raw['Reporter_ID'].unique()))
### Inverting the sides: liability reporter becomes 'Partner' and its counterpart becomes 'Reporter' (the same layout as the assets dataset):
df_cpis_raw = df_cpis_raw.rename({'Reporter_ID': 'Partner', 'Partner_ID': 'Reporter', 'Reporter_S': 'Partner_Sector', 'Partner_S': 'Reporter_Sector'}, axis = 1)
df_cpis_raw = df_cpis_raw.astype({'Indicator': 'str', 'Reporter_Sector': 'str', 'Partner_Sector': 'str', 'Reporter': 'str', 'Partner': 'str', 
                                  'Value': 'float32'})    
### Data saving (mode 'a' adds the key to the file created by the assets cell):
ser_cpis_liability_inv = df_cpis_raw.set_index(['Date', 'Indicator', 'Reporter_Sector', 'Partner_Sector', 'Reporter', 'Partner'])['Value'].sort_index()\
                                    .astype('float32')
del df_cpis_raw
gc.collect()
ser_cpis_liability_inv.to_hdf(path_or_buf = str_path_imf_cpis_detailed_raw, key = str_key_imf_cpis_liabilities, mode = 'a', format = 'fixed')

In [None]:
### IMF CPIS: COUNTERPART SECTOR TEST

gc.collect()
ser_cpis_asset = pd.read_hdf(path_or_buf = str_path_imf_cpis_detailed_raw, key = str_key_imf_cpis_assets)
### Total investment assets of the total reporter sector, all partner sectors:
ser_cpis_asset_total = ser_cpis_asset.loc[:, 'I_A_T_T_T_BP6_USD', 'T', :, :, :]
### Summing over partners for each date, partner sector and reporter:
ser_reporter_sum = ser_cpis_asset_total.groupby(['Date', 'Partner_Sector', 'Reporter']).sum()
### Partner sectors to columns; rows without both 'CB' and 'GG' values are dropped:
df_reporter_detailed = ser_reporter_sum.unstack('Partner_Sector').dropna(subset = ['CB', 'GG'], how = 'all')
### Shares (in percents) of the 'CB' and 'GG' partner sectors in the total:
ser_reporter_total = df_reporter_detailed['T']
df_reporter_detailed = df_reporter_detailed.assign(CB_Share = df_reporter_detailed['CB'] / ser_reporter_total * 100,
                                                   GG_Share = df_reporter_detailed['GG'] / ser_reporter_total * 100)
display(df_reporter_detailed[['CB_Share', 'GG_Share']].groupby('Reporter').median())
#display(df_reporter_detailed.loc[(All, 'US'), ['CB_Share', 'GG_Share']])

In [9]:
### IMF CPIS: TOTAL DATA AGGREGATION: DATASETS LOADING

gc.collect()
### Reported assets: total investment indicator, total reporter sector, total partner sector:
ser_cpis_asset = pd.read_hdf(path_or_buf = str_path_imf_cpis_detailed_raw, key = str_key_imf_cpis_assets)
ser_cpis_asset_total = ser_cpis_asset.loc[:, 'I_A_T_T_T_BP6_USD', 'T', 'T', :, :].rename('Asset')
### Inverted liabilities: the same selection for the liabilities indicator:
ser_cpis_liability_inv = pd.read_hdf(path_or_buf = str_path_imf_cpis_detailed_raw, key = str_key_imf_cpis_liabilities)
ser_cpis_liability_inv_total = ser_cpis_liability_inv.loc[:, 'I_L_T_T_T_BP6_USD', 'T', 'T', :, :].rename('Liability_Inverted')
### Both sources side by side:
list_cpis_total = [ser_cpis_asset_total, ser_cpis_liability_inv_total]
df_cpis_total = pd.concat(list_cpis_total, axis = 1, names = 'Data Source').astype('float32').round(2)

In [10]:
### IMF CPIS: TOTAL DATA AGGREGATION: DATA QUALITY RATIOS

gc.collect()

### Defining similarity for investors by date
def get_investor_ratio(df_group):
    '''
    Measure the discrepancy between reported assets and inverted liabilities for one group of rows.

    Parameters:
        df_group : DataFrame with the columns 'Asset' and 'Liability_Inverted' (it is not modified)

    Returns:
        Sum of absolute differences between the columns (each difference capped by the maximal asset value),
        divided by the sum of assets and by the number of rows; NaN when the sum of assets is not positive.
        Missing values of both columns are treated as zeros.
    '''
    ### Filling on a copy: the group received from groupby.apply should not be mutated:
    df_filled = df_group.fillna(0.0)
    flo_asset_sum = df_filled['Asset'].sum()
    if (flo_asset_sum > 0.0):
        ### Capping keeps a single huge mismatch from dominating the ratio:
        ser_difference = (df_filled['Asset'] - df_filled['Liability_Inverted']).abs().clip(upper = df_filled['Asset'].max())
        flo_result = ser_difference.sum() / flo_asset_sum / len(df_filled)
    else:
        flo_result = np.nan
    return flo_result
### Defining similarity for borrowers by date
def get_borrower_ratio(df_group):
    '''
    Measure the discrepancy between inverted liabilities and reported assets for one group of rows.

    Parameters:
        df_group : DataFrame with the columns 'Asset' and 'Liability_Inverted' (it is not modified)

    Returns:
        Sum of absolute differences between the columns (each difference capped by the maximal liability value),
        divided by the sum of liabilities and by the number of rows; NaN when the sum of liabilities is not positive.
        Missing values of both columns are treated as zeros.
    '''
    ### Filling on a copy: the group received from groupby.apply should not be mutated:
    df_filled = df_group.fillna(0.0)
    flo_liability_sum = df_filled['Liability_Inverted'].sum()
    if (flo_liability_sum > 0.0):
        ### Capping keeps a single huge mismatch from dominating the ratio:
        ser_difference = (df_filled['Asset'] - df_filled['Liability_Inverted']).abs().clip(upper = df_filled['Liability_Inverted'].max())
        flo_result = ser_difference.sum() / flo_liability_sum / len(df_filled)
    else:
        flo_result = np.nan
    return flo_result
### Similarity values calculation (investors are grouped by reporter, borrowers are grouped by partner):
ser_investor_ratio = df_cpis_total.groupby(['Date', 'Reporter']).apply(get_investor_ratio).rename('Investor_Ratio')
ser_borrower_ratio = df_cpis_total.groupby(['Date', 'Partner']).apply(get_borrower_ratio).rename('Borrower_Ratio')

In [11]:
### IMF CPIS: TOTAL DATA AGGREGATION: SIMILARITY TEST

### Extreme borrower ratios with the (date, country) pairs where they are reached:
flo_borrower_min = round(ser_borrower_ratio.min(), 4)
print(flo_borrower_min, '/', ser_borrower_ratio.idxmin())
flo_borrower_max = round(ser_borrower_ratio.max(), 4)
print(flo_borrower_max, '/', ser_borrower_ratio.idxmax())

#display(df_cpis_total.loc[('2018-12-31', All, 'RO'), :])
#display(df_cpis_total.loc[('1997-12-31', All, 'ES'), :])

0.002 / (Timestamp('2018-12-31 00:00:00'), 'RO')
0.8929 / (Timestamp('1997-12-31 00:00:00'), 'ES')


In [12]:
### IMF CPIS: TOTAL DATA AGGREGATION: SIMILARITY TEST

### Extreme investor ratios with the (date, country) pairs where they are reached:
flo_investor_min = round(ser_investor_ratio.min(), 4)
print(flo_investor_min, '/', ser_investor_ratio.idxmin())
flo_investor_max = round(ser_investor_ratio.max(), 4)
print(flo_investor_max, '/', ser_investor_ratio.idxmax())

#display(df_cpis_total.loc[('2010-12-31', 'NZ', All), :].dropna(subset = ['Liability_Inverted']))
#display(df_cpis_total.loc[('2002-12-31', 'UA', All), :])

0.0023 / (Timestamp('2010-12-31 00:00:00'), 'NZ')
0.25 / (Timestamp('2002-12-31 00:00:00'), 'UA')


In [13]:
### IMF CPIS: TOTAL DATA AGGREGATION: ADDING RATIOS

#df_cpis_to_augment = df_cpis_total.join(ser_investor_rank).join(ser_borrower_rank)
### Both ratios are attached to every bilateral row, then an empty resulting column is added:
df_cpis_to_augment = df_cpis_total.join(ser_investor_ratio).join(ser_borrower_ratio)\
                                  .assign(Asset_Augmented = np.NaN)\
                                  .reorder_levels(['Date', 'Reporter', 'Partner'])
#df_cpis_augmented['Verified'] = False

In [14]:
### IMF CPIS: TOTAL DATA AGGREGATION: CONDITIONAL REPLACING

gc.collect()
def augment_by_date(df_date, int_option = 0):
    '''
    Fill the 'Asset_Augmented' column of a positions table from its 'Asset' and 'Liability_Inverted' columns.

    Parameters:
        df_date    : DataFrame with the columns 'Asset', 'Liability_Inverted', 'Investor_Ratio', 'Borrower_Ratio'
                     and 'Asset_Augmented'; it is modified in place
        int_option : augmentation rule:
           -1 : Zero Asset values are set to NaN, then NaN Asset values are replaced with Liability values unconditionally
            0 : NaN Asset values are replaced with Liability values when Investor's Ratio is NaN or Investor's Ratio > Borrower's Ratio
            1 : The same as 0, but zero Asset values are set to NaN (and so can be replaced) first
            any other value (2 is used in this notebook) : 
                Zero Asset & Liability values are set to NaN, NaN Investor's Ratio is set to 999.0 and NaN Borrower's Ratio to 1000.0;
                then the Asset value is taken when Investor's Ratio <= Borrower's Ratio and the Liability value otherwise

    Returns:
        The same df_date object (not a copy). For options 0 and 1 the rows that meet no condition keep their incoming
        'Asset_Augmented' value. For the last option the sentinel ratio values stay in the returned ratio columns.
    '''
    ### NOTE(review): the in-place modification and returning of the very same object are kept on purpose -
    ### the layout of the groupby.apply result may depend on them, confirm before switching to a copy.
    if (int_option == -1):
        ### Replacing zero Asset values with NaN:
        df_date.loc[df_date['Asset'] == 0.0, 'Asset'] = np.nan
        df_date['Asset_Augmented'] = df_date['Asset'].combine_first(df_date['Liability_Inverted'])
    elif (int_option in (0, 1)):
        if (int_option == 1):
            ### Replacing zero Asset values with NaN:
            df_date.loc[df_date['Asset'] == 0.0, 'Asset'] = np.nan
        ser_asset_missing = df_date['Asset'].isna()
        ### Liability value is used if Asset value is NaN and Investor Ratio is NaN (no matter what Borrower Ratio is)
        ### or bigger than Borrower Ratio (comparison is False when any of the ratios is NaN):
        ser_take_liability = ser_asset_missing & (df_date['Investor_Ratio'].isna() | (df_date['Investor_Ratio'] > df_date['Borrower_Ratio']))
        ### Fill resulting column with not NaN Asset values:
        df_date.loc[~ser_asset_missing, 'Asset_Augmented'] = df_date.loc[~ser_asset_missing, 'Asset'].values
        ### Fill resulting column with Liability values:
        df_date.loc[ser_take_liability, 'Asset_Augmented'] = df_date.loc[ser_take_liability, 'Liability_Inverted'].values
    else:
        ### Replacing zero Asset & Liability values with NaN:
        df_date.loc[df_date['Asset'] == 0.0, 'Asset'] = np.nan
        df_date.loc[df_date['Liability_Inverted'] == 0.0, 'Liability_Inverted'] = np.nan
        ### Ratios preparation (sentinels make every comparison defined; Asset wins when both ratios are missing):
        df_date.loc[df_date['Investor_Ratio'].isna(), 'Investor_Ratio'] = 999.0
        df_date.loc[df_date['Borrower_Ratio'].isna(), 'Borrower_Ratio'] = 1000.0
        ### Ratios comparison:
        ser_take_asset = (df_date['Investor_Ratio'] <= df_date['Borrower_Ratio'])
        ser_take_liability = (df_date['Investor_Ratio'] > df_date['Borrower_Ratio'])
        df_date.loc[ser_take_asset, 'Asset_Augmented'] = df_date.loc[ser_take_asset, 'Asset'].values
        df_date.loc[ser_take_liability, 'Asset_Augmented'] = df_date.loc[ser_take_liability, 'Liability_Inverted'].values
    return df_date

### Every augmentation option is applied date by date to the same source table:
dict_cpis_augmented = {iter_option: df_cpis_to_augment.groupby('Date').apply(augment_by_date, iter_option)
                       for iter_option in [-1, 0, 1, 2]}

In [50]:
### IMF CPIS: TOTAL DATA AGGREGATION: RESULTS TESTING

### Raw data: number of reporters with at least one non-zero asset position for each date:
ser_reporter_count = df_cpis_to_augment['Asset'].replace({0.0: np.NaN}).groupby(['Date','Reporter']).count()
ser_quantity = ser_reporter_count[ser_reporter_count > 0].groupby('Date').count().rename('raw')
ser_quantity.to_excel('Data_Files/Test_Files/Augmentation_Test_' + ser_quantity.name + '.xlsx', merge_cells = False)

### The same statistics for every augmentation option:
for iter_option, dt_option in dict_cpis_augmented.items():
    ser_reporter_count = dt_option['Asset_Augmented'].replace({0.0: np.NaN}).groupby(['Date','Reporter']).count()
    ser_quantity = ser_reporter_count[ser_reporter_count > 0].groupby('Date').count().rename(str(iter_option))
    ser_quantity.to_excel('Data_Files/Test_Files/Augmentation_Test_' + str(iter_option) + '.xlsx', merge_cells = False)

In [18]:
### IMF CPIS: TOTAL DATA AGGREGATION: RESULTS SAVING (SERIES)

### Result of the option 2 is the one that is saved; zero positions are stored as NaN:
ser_total_augmented = dict_cpis_augmented[2]['Asset_Augmented']
ser_total_augmented.name = 'Total'
ser_total_to_save = ser_total_augmented.replace({0.0: np.NaN})
ser_total_to_save.to_hdf(path_or_buf = str_path_imf_cpis_total_augmented, key = str_key_imf_cpis_total_augmented, mode = 'w', format = 'fixed')

In [15]:
### IMF CPIS: TOTAL DATA AGGREGATION: RESULTS CONSOLIDATION TO DATAFRAME AND SAVING

### Columns to compare: raw assets (zeros as NaN), unconditional replacement and the option 2:
list_augmentation_ways = [df_cpis_to_augment['Asset'].replace({0.0: np.NaN}),
                          dict_cpis_augmented[-1]['Asset_Augmented'],
                          dict_cpis_augmented[2]['Asset_Augmented']]
list_augmentation_keys = ['Assets_Only', 'Unconditional', 'Option_2']
df_augmentation_way = pd.concat(list_augmentation_ways, axis = 1, keys = list_augmentation_keys, names = 'Augmentation_Way')
df_augmentation_way.to_hdf(path_or_buf = str_path_total_imf_cpis_options, key = str_key_total_imf_cpis_options, mode = 'w', format = 'fixed')

In [16]:
### IMF CPIS: RAW ASSET DATA LOADING

gc.collect()
### Reloading the detailed assets series saved by the assets retrieving cell:
ser_cpis_asset = pd.read_hdf(path_or_buf = str_path_imf_cpis_detailed_raw, key = str_key_imf_cpis_assets)

In [21]:
### IMF CPIS: ASSET FILTERING

gc.collect()

### Allowed labels for the filtered levels:
list_valid_indicators = ['I_A_T_T_T_BP6_USD', 'I_A_D_S_T_BP6_USD']
list_valid_reporter_sectors = ['T', 'CB', 'GG']
list_valid_partner_sectors = ['T']
list_valid_partners = df_country_codes['ISO SHORT'].values

### Selection is kept as a single chain, so no intermediate copy of the big series stays alive.
### Partner sector level has the only value left and is dropped; reporter sector is moved to the last level:
ser_asset_filtered = ser_cpis_asset.loc[All, list_valid_indicators, list_valid_reporter_sectors, list_valid_partner_sectors, All, list_valid_partners]\
                     .droplevel('Partner_Sector')\
                     .reorder_levels(['Date', 'Indicator', 'Reporter', 'Partner', 'Reporter_Sector'])\
                     .sort_index().astype('float32')
del ser_cpis_asset

In [22]:
### IMF CPIS: ASSET REPORTER SECTOR CLEARING

gc.collect()

def reindex_reporter_sectors(ser_group):
    '''
    Returns the 'T' sector value minus the sum of the 'CB' and 'GG' sector values for one group.

    ser_group : Series for a single (Date, Indicator, Reporter, Partner) combination; its MultiIndex
                carries these four named levels plus one more level holding the sector labels.
    Returns   : scalar ser_group['T'] - (ser_group['CB'] + ser_group['GG']);
                sectors absent from the group are counted as 0.0.
    '''
    ### Sector labels are bound locally instead of being read from a notebook-level list,
    ### so the result does not depend on which cell assigned that global last:
    tup_sectors = ('T', 'CB', 'GG')
    ser_group = ser_group.droplevel(['Date', 'Indicator', 'Reporter', 'Partner'])
    ### Only incomplete groups are reindexed (complete groups are used as they are):
    if (len(ser_group.index) < len(tup_sectors)):
        ser_group = ser_group.reindex(list(tup_sectors)).fillna(0.0)
    ### NOTE(review): a NaN stored inside a complete three-sector group is not filled and turns the result into NaN - confirm this is intended
    ser_sectors_cleared = ser_group['T'] - (ser_group['CB'] + ser_group['GG'])
    return ser_sectors_cleared

def get_commerce(ser_raw):
    '''
    Applies reindex_reporter_sectors to every (Date, Indicator, Reporter, Partner) group of ser_raw
    and returns the result of that groupby-apply.
    '''
    list_group_levels = ['Date', 'Indicator', 'Reporter', 'Partner']
    grouped_raw = ser_raw.groupby(list_group_levels)
    return grouped_raw.apply(reindex_reporter_sectors)

#list_test_date = ['1997-12-31', '2021-12-31'] # ['2021-12-31'] # 
#list_test_indicator = ['I_A_D_S_T_BP6_USD']
#list_test_reporter = ['PT', 'AT']
#list_test_partner = ['US', 'IT'] # ['IT'] # 

#ser_test_raw = ser_asset_filtered.loc[:, :, list_test_reporter, :, :]
#display(ser_test_raw.loc[list_test_date, list_test_indicator, list_test_reporter, list_test_partner, :])
#ser_test_res = ser_test_raw.groupby(['Date', 'Indicator', 'Reporter', 'Partner']).apply(reindex_reporter_sectors)
#ser_test_res = get_commerce(ser_test_raw)
#display(ser_test_res.loc[list_test_date, list_test_indicator, list_test_reporter, list_test_partner])

#%timeit get_commerce(ser_test_raw)

#%lprun -f reindex_reporter_sectors get_commerce(ser_test_raw)

### Sector netting for every (Date, Indicator, Reporter, Partner) group through the helper defined above,
### then the Indicator level is moved to the last position:
ser_asset_commerce = get_commerce(ser_asset_filtered)
ser_asset_commerce = ser_asset_commerce.reorder_levels([0, 2, 3, 1]).sort_index()

In [26]:
### IMF CPIS: ASSET CLEARING TEST

list_test_date = ['1997-12-31', '2021-12-31'] # ['2021-12-31'] # 
list_test_indicator = ['I_A_D_S_T_BP6_USD']
list_test_reporter = ['PT']
list_test_partner = ['US', 'IT'] # ['IT'] # 

#display(ser_asset_filtered.loc[list_test_date, list_test_reporter, list_test_partner, list_test_indicator])
display(ser_asset_commerce.loc[list_test_date, list_test_reporter, list_test_partner, :])

Date        Reporter  Partner  Indicator        
1997-12-31  PT        IT       I_A_D_S_T_BP6_USD    0.000000    
                               I_A_T_T_T_BP6_USD    507.593262  
                      US       I_A_D_S_T_BP6_USD    91.923386   
                               I_A_T_T_T_BP6_USD    3332.449951 
2021-12-31  PT        IT       I_A_D_S_T_BP6_USD    3289.410400 
                               I_A_T_T_T_BP6_USD    18311.003906
                      US       I_A_D_S_T_BP6_USD    3.329844    
                               I_A_T_T_T_BP6_USD    13202.219727
Name: Value, dtype: float64

In [27]:
### IMF CPIS: SHORT-TERM DEBTS EXCLUDING

gc.collect()

def reindex_indicators(ser_group, str_total = 'I_A_T_T_T_BP6_USD', str_short_term = 'I_A_D_S_T_BP6_USD'):
    '''
    Returns the total indicator value minus the short-term debt indicator value for one group.

    ser_group      : Series for a single (Date, Reporter, Partner) combination; its MultiIndex carries
                     these three named levels plus one more level holding the indicator codes.
    str_total      : indicator code of the value to subtract from (asset total by default).
    str_short_term : indicator code of the value to subtract (asset short-term debt by default).
    Returns        : scalar ser_group[str_total] - ser_group[str_short_term];
                     an indicator that is absent or NaN in the group is counted as 0.0.
    '''
    ser_group = ser_group.droplevel(['Date', 'Reporter', 'Partner'])
    ### A group holds at most two indicators, so both labels are always reindexed and filled.
    ### The labels come from the arguments rather than from a notebook-level list,
    ### which is reassigned to the liability codes further down the notebook:
    ser_group = ser_group.reindex([str_total, str_short_term]).fillna(0.0)
    ser_indicators_cleared = ser_group[str_total] - ser_group[str_short_term]
    return ser_indicators_cleared

#list_test_date = ['1997-12-31', '2021-12-31'] # ['2021-12-31'] # 
#list_test_reporter = ['PT']
#list_test_partner = ['US', 'IT'] # ['IT'] # 

#ser_test_raw = ser_asset_commerce.loc[:, list_test_reporter, :, :]
#display(ser_test_raw.loc[list_test_date, list_test_reporter, list_test_partner, :])
#ser_test_res = ser_test_raw.groupby(['Date', 'Reporter', 'Partner']).apply(reindex_indicators)
#display(ser_test_res.loc[list_test_date, list_test_reporter, list_test_partner])

ser_asset_minus_short_term = ser_asset_commerce.groupby(['Date', 'Reporter', 'Partner']).apply(reindex_indicators)

In [29]:
### IMF CPIS: ASSET SHORT-TERM DEBTS EXCLUDING TEST

list_test_date = ['1997-12-31', '2021-12-31'] # ['2021-12-31'] # 
list_test_reporter = ['PT']
list_test_partner = ['US', 'IT'] # ['IT'] # 

display(ser_asset_commerce.loc[list_test_date, list_test_reporter, list_test_partner])
display(ser_asset_minus_short_term.loc[list_test_date, list_test_reporter, list_test_partner])

Date        Reporter  Partner  Indicator        
1997-12-31  PT        IT       I_A_D_S_T_BP6_USD    0.000000    
                               I_A_T_T_T_BP6_USD    507.593262  
                      US       I_A_D_S_T_BP6_USD    91.923386   
                               I_A_T_T_T_BP6_USD    3332.449951 
2021-12-31  PT        IT       I_A_D_S_T_BP6_USD    3289.410400 
                               I_A_T_T_T_BP6_USD    18311.003906
                      US       I_A_D_S_T_BP6_USD    3.329844    
                               I_A_T_T_T_BP6_USD    13202.219727
Name: Value, dtype: float64

Date        Reporter  Partner
1997-12-31  PT        IT         507.593262  
                      US         3240.526566 
2021-12-31  PT        IT         15021.593506
                      US         13198.889883
Name: Value, dtype: float64

In [30]:
### IMF CPIS: FILTERED ASSET DATASET SAVING

### Zero values are written as missing values. Mode 'w' recreates the file,
### so the liability key (saved with mode 'a' further down) has to be written after this cell:
(
    ser_asset_minus_short_term
    .replace({0.0: np.NaN})
    .to_hdf(path_or_buf = str_path_imf_cpis_filtered, key = str_key_imf_cpis_filtered_asset, mode = 'w', format = 'fixed')
)

In [35]:
### IMF CPIS: RAW LIABILITY DATA LOADING

gc.collect()
### Liability series is read from the detailed raw HDF5 file:
ser_cpis_liability_inv = pd.read_hdf(str_path_imf_cpis_detailed_raw, key = str_key_imf_cpis_liabilities)

In [38]:
### IMF CPIS: LIABILITY FILTERING

gc.collect()

### Labels to keep on each filtered index level (the indicator list is reassigned to the liability codes here):
list_valid_indicators = ['I_L_T_T_T_BP6_USD', 'I_L_D_S_T_BP6_USD']
list_valid_reporter_sectors = ['T', 'CB', 'GG']
list_valid_partner_sectors = ['T']
list_valid_partners = df_country_codes['ISO SHORT'].values

### Selection by level position; the 'Partner_Sector' level is dropped after filtering, and the level
### at position 2 (presumably the reporter sector - TODO confirm level names) is moved to the end:
ser_liability_filtered = (
    ser_cpis_liability_inv
    .loc[:, list_valid_indicators, list_valid_reporter_sectors, list_valid_partner_sectors, :, list_valid_partners]
    .droplevel('Partner_Sector')
    .reorder_levels([0, 1, 3, 4, 2])
    .sort_index()
    .astype('float32')
)
### The raw series is released, so this cell needs the loading cell to be run again before a re-run:
del ser_cpis_liability_inv

In [40]:
### IMF CPIS: LIABILITY REPORTER SECTOR CLEARING

gc.collect()

def reindex_reporter_sectors(ser_group):
    '''
    Returns the 'T' sector value minus the sum of the 'CB' and 'GG' sector values for one group.

    ser_group : Series for a single (Date, Indicator, Reporter, Partner) combination; its MultiIndex
                carries these four named levels plus one more level holding the sector labels.
    Returns   : scalar ser_group['T'] - (ser_group['CB'] + ser_group['GG']);
                sectors absent from the group are counted as 0.0.
    '''
    ### Sector labels are bound locally instead of being read from a notebook-level list,
    ### so the result does not depend on which cell assigned that global last:
    tup_sectors = ('T', 'CB', 'GG')
    ser_group = ser_group.droplevel(['Date', 'Indicator', 'Reporter', 'Partner'])
    ### Only incomplete groups are reindexed (complete groups are used as they are):
    if (len(ser_group.index) < len(tup_sectors)):
        ser_group = ser_group.reindex(list(tup_sectors)).fillna(0.0)
    ### NOTE(review): a NaN stored inside a complete three-sector group is not filled and turns the result into NaN - confirm this is intended
    ser_sectors_cleared = ser_group['T'] - (ser_group['CB'] + ser_group['GG'])
    return ser_sectors_cleared

def get_commerce(ser_raw):
    '''
    Applies reindex_reporter_sectors to every (Date, Indicator, Reporter, Partner) group of ser_raw
    and returns the result of that groupby-apply.
    '''
    list_group_levels = ['Date', 'Indicator', 'Reporter', 'Partner']
    grouped_raw = ser_raw.groupby(list_group_levels)
    return grouped_raw.apply(reindex_reporter_sectors)

### Sector netting for every (Date, Indicator, Reporter, Partner) group through the helper defined above,
### then the Indicator level is moved to the last position:
ser_liability_commerce = get_commerce(ser_liability_filtered)
ser_liability_commerce = ser_liability_commerce.reorder_levels([0, 2, 3, 1]).sort_index()

In [54]:
### IMF CPIS: LIABILITY CLEARING TEST

list_test_date = ['1997-12-31', '2021-12-31'] # ['2021-12-31'] # 
list_test_indicator = ['I_L_D_S_T_BP6_USD']
list_test_reporter = ['AU']
list_test_partner = ['JP', 'IL'] # ['IT'] # 

#display(ser_liability_commerce.loc[list_test_date, list_test_reporter, list_test_partner, list_test_indicator])
display(ser_liability_commerce.loc[list_test_date, list_test_reporter, list_test_partner, :])

Date        Reporter  Partner  Indicator        
1997-12-31  AU        IL       I_L_D_S_T_BP6_USD    0.000000    
                               I_L_T_T_T_BP6_USD    3.612000    
                      JP       I_L_D_S_T_BP6_USD    143.131973  
                               I_L_T_T_T_BP6_USD    9109.657227 
2021-12-31  AU        IL       I_L_T_T_T_BP6_USD    132.000000  
                      JP       I_L_D_S_T_BP6_USD    17969.404297
                               I_L_T_T_T_BP6_USD    59019.222656
Name: Value, dtype: float64

In [55]:
### IMF CPIS: SHORT-TERM DEBTS EXCLUDING

gc.collect()

def reindex_indicators(ser_group, str_total = 'I_L_T_T_T_BP6_USD', str_short_term = 'I_L_D_S_T_BP6_USD'):
    '''
    Returns the total indicator value minus the short-term debt indicator value for one group.

    ser_group      : Series for a single (Date, Reporter, Partner) combination; its MultiIndex carries
                     these three named levels plus one more level holding the indicator codes.
    str_total      : indicator code of the value to subtract from (liability total by default).
    str_short_term : indicator code of the value to subtract (liability short-term debt by default).
    Returns        : scalar ser_group[str_total] - ser_group[str_short_term];
                     an indicator that is absent or NaN in the group is counted as 0.0.
    '''
    ser_group = ser_group.droplevel(['Date', 'Reporter', 'Partner'])
    ### A group holds at most two indicators, so both labels are always reindexed and filled.
    ### The labels come from the arguments rather than from a notebook-level list,
    ### so the result does not depend on which cell assigned that global last:
    ser_group = ser_group.reindex([str_total, str_short_term]).fillna(0.0)
    ser_indicators_cleared = ser_group[str_total] - ser_group[str_short_term]
    return ser_indicators_cleared

ser_liability_minus_short_term = ser_liability_commerce.groupby(['Date', 'Reporter', 'Partner']).apply(reindex_indicators)

In [56]:
### IMF CPIS: LIABILITY SHORT-TERM DEBTS EXCLUDING TEST

list_test_date = ['1997-12-31', '2021-12-31'] # ['2021-12-31'] # 
list_test_reporter = ['AU']
list_test_partner = ['JP', 'IL'] # ['IT'] # 

display(ser_liability_commerce.loc[list_test_date, list_test_reporter, list_test_partner])
display(ser_liability_minus_short_term.loc[list_test_date, list_test_reporter, list_test_partner])

Date        Reporter  Partner  Indicator        
1997-12-31  AU        IL       I_L_D_S_T_BP6_USD    0.000000    
                               I_L_T_T_T_BP6_USD    3.612000    
                      JP       I_L_D_S_T_BP6_USD    143.131973  
                               I_L_T_T_T_BP6_USD    9109.657227 
2021-12-31  AU        IL       I_L_T_T_T_BP6_USD    132.000000  
                      JP       I_L_D_S_T_BP6_USD    17969.404297
                               I_L_T_T_T_BP6_USD    59019.222656
Name: Value, dtype: float64

Date        Reporter  Partner
1997-12-31  AU        IL         3.612000    
                      JP         8966.525253 
2021-12-31  AU        IL         132.000000  
                      JP         41049.818359
Name: Value, dtype: float64

In [57]:
### IMF CPIS: FILTERED LIABILITY DATASET SAVING

### Zero values are written as missing values. Mode 'a' adds the liability key
### to the file that the filtered asset saving cell creates with mode 'w':
(
    ser_liability_minus_short_term
    .replace({0.0: np.NaN})
    .to_hdf(path_or_buf = str_path_imf_cpis_filtered, key = str_key_imf_cpis_filtered_liability, mode = 'a', format = 'fixed')
)

In [22]:
### IMF CPIS: FILTERED DATA AGGREGATION: DATASETS LOADING

gc.collect()

### Both filtered series are read from the same HDF5 file and named after the column they become:
ser_cpis_asset_filtered = pd.read_hdf(str_path_imf_cpis_filtered, key = str_key_imf_cpis_filtered_asset).rename('Asset')
ser_cpis_liability_filtered = pd.read_hdf(str_path_imf_cpis_filtered, key = str_key_imf_cpis_filtered_liability).rename('Liability_Inverted')
### Column-wise concatenation of the two series, stored as float32 rounded to two decimals:
df_cpis_total = pd.concat([ser_cpis_asset_filtered, ser_cpis_liability_filtered], axis = 1, names = 'Data Source')
df_cpis_total = df_cpis_total.astype('float32').round(2)

In [23]:
### IMF CPIS: FILTERED DATA AGGREGATION: DATA QUALITY RATIOS

gc.collect()

### Defining similarity for investors by date
def get_investor_ratio(df_group):
    '''
    Returns the investor ratio of one group: the summed absolute gap between 'Asset' and
    'Liability_Inverted' (each gap capped at the largest 'Asset' value of the group),
    divided by the 'Asset' sum and by the number of rows in the group.

    df_group : DataFrame with 'Asset' and 'Liability_Inverted' columns; NaN values are counted as 0.0.
    Returns  : float ratio, or NaN when the 'Asset' sum of the group is not positive.
    '''
    ### A filled copy is used: mutating the object handed over by groupby.apply is not supported by pandas.
    ### No dropna is needed afterwards, since nothing is missing once the frame is filled.
    df_filled = df_group.fillna(0.0)
    flo_asset_sum = df_filled['Asset'].sum()
    if (flo_asset_sum > 0.0):
        ser_gap = (df_filled['Asset'] - df_filled['Liability_Inverted']).abs().clip(upper = df_filled['Asset'].max())
        flo_result = ser_gap.sum() / flo_asset_sum / len(df_filled)
    else:
        ### np.nan is the same object as np.NaN and stays available in newer numpy versions:
        flo_result = np.nan
    return flo_result
### Defining similarity for borrowers by date
def get_borrower_ratio(df_group):
    '''
    Returns the borrower ratio of one group: the summed absolute gap between 'Asset' and
    'Liability_Inverted' (each gap capped at the largest 'Liability_Inverted' value of the group),
    divided by the 'Liability_Inverted' sum and by the number of rows in the group.

    df_group : DataFrame with 'Asset' and 'Liability_Inverted' columns; NaN values are counted as 0.0.
    Returns  : float ratio, or NaN when the 'Liability_Inverted' sum of the group is not positive.
    '''
    ### A filled copy is used: mutating the object handed over by groupby.apply is not supported by pandas.
    ### No dropna is needed afterwards, since nothing is missing once the frame is filled.
    df_filled = df_group.fillna(0.0)
    flo_liability_sum = df_filled['Liability_Inverted'].sum()
    if (flo_liability_sum > 0.0):
        ser_gap = (df_filled['Asset'] - df_filled['Liability_Inverted']).abs().clip(upper = df_filled['Liability_Inverted'].max())
        flo_result = ser_gap.sum() / flo_liability_sum / len(df_filled)
    else:
        ### np.nan is the same object as np.NaN and stays available in newer numpy versions:
        flo_result = np.nan
    return flo_result
### Similarity values calculation:
### One investor ratio per (Date, Reporter) group:
ser_investor_ratio = df_cpis_total.groupby(['Date', 'Reporter']).apply(get_investor_ratio).rename('Investor_Ratio')
### One borrower ratio per (Date, Partner) group:
ser_borrower_ratio = df_cpis_total.groupby(['Date', 'Partner']).apply(get_borrower_ratio).rename('Borrower_Ratio')

In [24]:
### IMF CPIS: FILTERED DATA AGGREGATION: SIMILARITY TEST

print(round(ser_borrower_ratio.min(), 4), '/', ser_borrower_ratio.idxmin())
print(round(ser_borrower_ratio.max(), 4), '/', ser_borrower_ratio.idxmax())

#display(df_cpis_total.loc[('2018-12-31', All, 'RO'), :])
#display(df_cpis_total.loc[('1997-12-31', All, 'ES'), :])

0.0022 / (Timestamp('2018-12-31 00:00:00'), 'RO')
0.8929 / (Timestamp('1997-12-31 00:00:00'), 'ES')


In [25]:
### IMF CPIS: FILTERED DATA AGGREGATION: SIMILARITY TEST

print(round(ser_investor_ratio.min(), 4), '/', ser_investor_ratio.idxmin())
print(round(ser_investor_ratio.max(), 4), '/', ser_investor_ratio.idxmax())

#display(df_cpis_total.loc[('2010-12-31', 'NZ', All), :].dropna(subset = ['Liability_Inverted']))
#display(df_cpis_total.loc[('2002-12-31', 'UA', All), :])

0.0024 / (Timestamp('2010-12-31 00:00:00'), 'NZ')
0.25 / (Timestamp('2002-12-31 00:00:00'), 'UA')


In [26]:
### IMF CPIS: FILTERED DATA AGGREGATION: ADDING RATIOS

### Both ratios are joined to the positions, an empty result column is appended
### and the index levels are put into the (Date, Reporter, Partner) order:
df_cpis_to_augment = (
    df_cpis_total
    .join(ser_investor_ratio)
    .join(ser_borrower_ratio)
    .assign(Asset_Augmented = np.NaN)
    .reorder_levels(['Date', 'Reporter', 'Partner'])
)

In [27]:
### IMF CPIS: FILTERED DATA AGGREGATION: CONDITIONAL REPLACING

gc.collect()
def augment_by_date(df_date, int_option = 0):
    '''
    Fills the 'Asset_Augmented' column of one date slice according to the chosen option and returns the slice.

    df_date    : DataFrame slice (passed by groupby('Date').apply below) with the columns
                 'Asset', 'Liability_Inverted', 'Investor_Ratio', 'Borrower_Ratio' and 'Asset_Augmented'.
    int_option : augmentation rule:
       -1 : Zero Asset values become NaN; Asset is taken where available,
            Liability_Inverted fills every NaN Asset value unconditionally
        0 : Not NaN Asset values are kept; a NaN Asset value is replaced with Liability_Inverted when
            Investor's Ratio is NaN or when both ratios are known and Investor's Ratio > Borrower's Ratio
        1 : Same as 0, but zero Asset values are turned into NaN first
        2 : (also any other value) Zero Asset and Liability_Inverted values become NaN;
            Liability_Inverted is taken wherever Investor's Ratio > Borrower's Ratio, Asset is taken otherwise
    Returns    : the same df_date object, modified in place. Besides 'Asset_Augmented', the 'Asset' column
                 is changed by options -1, 1 and 2; option 2 also changes 'Liability_Inverted' and
                 overwrites NaN ratios with sentinel values.
    '''
    if (int_option == -1):
        ### Replacing zero Asset values with NaN (Liability values are left as they are):
        df_date.loc[df_date['Asset'] == 0.0, 'Asset'] = np.NaN
        ### Asset value where it is available, Liability value otherwise:
        df_date['Asset_Augmented'] = df_date['Asset'].combine_first(df_date['Liability_Inverted'])
    elif (int_option == 0):
        ### Fill resulting column with not NaN Asset values:
        df_date.loc[df_date['Asset'].notna(), 'Asset_Augmented'] = df_date[df_date['Asset'].notna()]['Asset'].values
        ### Fill resulting column with Liability value if Asset value is NaN & Investor Ratio is NaN (and doesn't matter if Borrower Ratio is NaN):
        df_date.loc[df_date['Asset'].isna() & df_date['Investor_Ratio'].isna(), 'Asset_Augmented'] = \
            df_date[df_date['Asset'].isna() & df_date['Investor_Ratio'].isna()]['Liability_Inverted'].values
        ### Fill resulting column with Liability value if Asset value is NaN & Investor Ratio is bigger than Borrower Ratio
        ### (rows with NaN Asset, known Investor Ratio and NaN Borrower Ratio keep the 'Asset_Augmented' value they came with):
        df_date.loc[df_date['Asset'].isna() & df_date['Investor_Ratio'].notna() & df_date['Borrower_Ratio'].notna() & \
                    (df_date['Investor_Ratio'] > df_date['Borrower_Ratio']), 'Asset_Augmented'] = \
            df_date[df_date['Asset'].isna() & df_date['Investor_Ratio'].notna() & df_date['Borrower_Ratio'].notna() & \
                    (df_date['Investor_Ratio'] > df_date['Borrower_Ratio'])]['Liability_Inverted'].values
    elif (int_option == 1):
        ### Replacing zero Asset values with NaN:
        df_date.loc[df_date['Asset'] == 0.0, 'Asset'] = np.NaN
        ### Fill resulting column with not NaN Asset values:
        df_date.loc[df_date['Asset'].notna(), 'Asset_Augmented'] = df_date.loc[df_date['Asset'].notna(), 'Asset'].values
        ### Fill resulting column with Liability value if Asset value is NaN & Investor Ratio is NaN (and doesn't matter if Borrower Ratio is NaN):
        df_date.loc[df_date['Asset'].isna() & df_date['Investor_Ratio'].isna(), 'Asset_Augmented'] = \
            df_date[df_date['Asset'].isna() & df_date['Investor_Ratio'].isna()]['Liability_Inverted'].values
        ### Fill resulting column with Liability value if Asset value is NaN & Investor Ratio is bigger than Borrower Ratio
        ### (rows with NaN Asset, known Investor Ratio and NaN Borrower Ratio keep the 'Asset_Augmented' value they came with):
        df_date.loc[df_date['Asset'].isna() & df_date['Investor_Ratio'].notna() & df_date['Borrower_Ratio'].notna() & \
                    (df_date['Investor_Ratio'] > df_date['Borrower_Ratio']), 'Asset_Augmented'] = \
            df_date[df_date['Asset'].isna() & df_date['Investor_Ratio'].notna() & df_date['Borrower_Ratio'].notna() & \
                    (df_date['Investor_Ratio'] > df_date['Borrower_Ratio'])]['Liability_Inverted'].values
    else:
        ### Replacing zero Asset & Liability values with NaN:
        df_date.loc[df_date['Asset'] == 0.0, 'Asset'] = np.NaN
        df_date.loc[df_date['Liability_Inverted'] == 0.0, 'Liability_Inverted'] = np.NaN
        ### Ratios preparation: NaN ratios get sentinel values, so a missing Investor Ratio (999.0) exceeds any known
        ### Borrower Ratio below 999.0, while a missing Borrower Ratio (1000.0) is not exceeded by it;
        ### the sentinels stay in the returned ratio columns:
        df_date.loc[df_date['Investor_Ratio'].isna(), 'Investor_Ratio'] = 999.0
        df_date.loc[df_date['Borrower_Ratio'].isna(), 'Borrower_Ratio'] = 1000.0
        ### Ratios comparison: Asset value when Investor Ratio <= Borrower Ratio, Liability value otherwise:
        df_date.loc[df_date['Investor_Ratio'] <= df_date['Borrower_Ratio'], 'Asset_Augmented'] = \
            df_date[df_date['Investor_Ratio'] <= df_date['Borrower_Ratio']]['Asset'].values
        df_date.loc[df_date['Investor_Ratio'] > df_date['Borrower_Ratio'], 'Asset_Augmented'] = \
            df_date[df_date['Investor_Ratio'] > df_date['Borrower_Ratio']]['Liability_Inverted'].values
    return df_date

### One augmented dataset per option, each built from the same source frame grouped by date:
dict_cpis_augmented = {
    int_way: df_cpis_to_augment.groupby('Date').apply(augment_by_date, int_way)
    for int_way in (-1, 0, 1, 2)
}

In [68]:
### IMF CPIS: FILTERED DATA AGGREGATION: RESULTS TESTING

### Baseline: number of reporters per date that have at least one non-zero, not NaN Asset value:
ser_quantity = df_cpis_to_augment['Asset'].replace({0.0: np.NaN})
ser_quantity = ser_quantity.groupby(['Date','Reporter']).count()
ser_quantity = ser_quantity[ser_quantity > 0].groupby('Date').count()
ser_quantity.name = 'raw'
ser_quantity.to_excel('Data_Files/Test_Files/Augmentation_Test_' + ser_quantity.name + '.xlsx', merge_cells = False)

### The same count for the augmented column of every option, one Excel file per option:
for iter_option, dt_option in dict_cpis_augmented.items():
    ser_quantity = dt_option['Asset_Augmented'].replace({0.0: np.NaN})
    ser_quantity = ser_quantity.groupby(['Date','Reporter']).count()
    ser_quantity = ser_quantity[ser_quantity > 0].groupby('Date').count()
    ser_quantity.name = str(iter_option)
    ser_quantity.to_excel('Data_Files/Test_Files/Augmentation_Test_' + str(iter_option) + '.xlsx', merge_cells = False)

In [69]:
### IMF CPIS: FILTERED DATA AGGREGATION: RESULTS SAVING (SERIES)

### The option 2 result is the series persisted as the augmented filtered dataset:
ser_filtered_augmented = dict_cpis_augmented[2]['Asset_Augmented']
ser_filtered_augmented.name = 'Filtered'
### Zero values are written as missing values; mode 'w' recreates the target file:
(
    ser_filtered_augmented
    .replace({0.0: np.NaN})
    .to_hdf(path_or_buf = str_path_imf_cpis_filtered_augmented, key = str_key_imf_cpis_filtered_augmented, mode = 'w', format = 'fixed')
)

In [28]:
### IMF CPIS: FILTERED DATA AGGREGATION: RESULTS CONSOLIDATION TO DATAFRAME AND SAVING

### Three alternative series side by side: raw assets (zeros as NaN), unconditional fill (option -1) and option 2:
list_augmentation_ways = [df_cpis_to_augment['Asset'].replace({0.0: np.NaN}),
                          dict_cpis_augmented[-1]['Asset_Augmented'],
                          dict_cpis_augmented[2]['Asset_Augmented']]
df_augmentation_way = pd.concat(list_augmentation_ways, axis = 1,
                                keys = ['Assets_Only', 'Unconditional', 'Option_2'], names = 'Augmentation_Way')
### Mode 'w' recreates the target file:
df_augmentation_way.to_hdf(path_or_buf = str_path_filtered_imf_cpis_options, key = str_key_filtered_imf_cpis_options,
                           mode = 'w', format = 'fixed')

In [30]:
### IMF CPIS: INVESTMENT POSITIONS AGGREGATING AND SAVING

### Inputs: direct options (total and equity) and portfolio options (total and filtered):
df_total_direct_options = pd.read_hdf(path_or_buf = str_path_total_direct_options, key = str_key_total_direct_options)
df_equity_direct_options = pd.read_hdf(path_or_buf = str_path_equity_direct_options, key = str_key_equity_direct_options)
df_total_portfolio_options = pd.read_hdf(path_or_buf = str_path_total_imf_cpis_options, key = str_key_total_imf_cpis_options)
df_filtered_portfolio_options = pd.read_hdf(path_or_buf = str_path_filtered_imf_cpis_options, key = str_key_filtered_imf_cpis_options)
### DataFrame.add with fill_value aligns both frames on the union of their labels and counts a value that is
### missing on one side only as 0.0, so rows present in just one of the sources keep their value.
### Filling each frame separately and using "+" would turn such rows into NaN during alignment.
### Cells missing on both sides stay NaN; exact zero sums are stored as NaN as well:
df_total_investment_options = df_total_direct_options.add(df_total_portfolio_options, fill_value = 0.0).replace({0.0: np.NaN})
df_total_investment_options.to_hdf(path_or_buf = str_path_total_investment_options, key = str_key_total_investment_options, mode = 'w', format = 'fixed')
df_filtered_investment_options = df_equity_direct_options.add(df_filtered_portfolio_options, fill_value = 0.0).replace({0.0: np.NaN})
df_filtered_investment_options.to_hdf(path_or_buf = str_path_filtered_investment_options, key = str_key_filtered_investment_options, mode = 'w', format = 'fixed')

In [None]:
### TEMP

display(df_filtered_portfolio_options['Unconditional'].groupby('Date').count() / df_total_portfolio_options['Unconditional'].groupby('Date').count())
display(df_filtered_portfolio_options['Unconditional'].groupby('Date').sum() / df_total_portfolio_options['Unconditional'].groupby('Date').sum())