In [1]:
### CPIS DOTS DATA EXPORT : PARTNERS FROM THE WHOLE WORLD

In [2]:
### RUN EVERY TIME: INITIALIZATION

import pandas as pd
from pandas.plotting import register_matplotlib_converters
pd.set_option('display.max_colwidth', -1) ### To display long strings
register_matplotlib_converters()
import numpy as np
import math
import itertools
import requests
import json ### To correct JSON structure before unpacking
import gc
import os
import datetime
import time
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mticker
import seaborn as sns
#%load_ext line_profiler

In [3]:
### RUN EVERY TIME: VERSION CONTROL

from platform import python_version

### Reporting the library and interpreter versions the notebook is run with:
for str_label, str_version in [('pandas version: ', pd.__version__), ('python version: ', python_version())]:
    print(str_label, str_version)

pandas version:  0.25.3
python version:  3.7.4


In [4]:
### RUN EVERY TIME: MAIN CONSTANTS

### MultiIndex level slice constant (passed to .loc[] / .iloc[] to take every label of a level):
All = slice(None)
### Universe path (MS Excel workbook read by ison_membership_converting):
str_path_universe = 'Data_Files/Source_Files/acadian_universe.xlsx'
### IMF DOTS dataset:
### HDF5 file and key for the World-level export series:
str_path_imf_dots_world = 'Data_Files/Source_Files/dots_world_export.h5'
str_full_imf_dots_world = 'dots_world_export'
### HDF5 file with two keys: bilateral export flows and import flows inverted to the exporter's point of view:
str_path_imf_dots_dataset = 'Data_Files/Source_Files/dots_dataset.h5'
str_key_imf_dots_export = 'dots_export'
str_key_imf_dots_import_inverted = 'dots_import_inverted'
### HDF5 file and key for the simple Export / Export_Augmented combination (old version):
str_path_imf_dots_combined = 'Data_Files/Source_Files/dots_combined.h5'
str_key_imf_dots_full = 'dots_full_combined'
### HDF5 file and key for the table comparing augmentation options:
str_path_imf_dots_options = 'Data_Files/Source_Files/dots_options.h5'
str_key_do_total_imf_dots_options = 'dots_export_options'
### Technical Constants:
### Last date of the universe history (used as a string for .loc[] and as a Timestamp):
str_date_end = '2022-10-31'
### NOTE(review): date_start and date_ison are not referenced in the visible part of the notebook - confirm they are still needed
date_start = pd.Timestamp('1989-12-29')
date_end = pd.Timestamp(str_date_end)
date_ison = pd.Timestamp('1994-12-31')

In [5]:
### DEFINING COUNTRY CODES EXTRACTOR

def get_country_codes(use_local_copy = False):
    '''
    Loads the country codes table and returns the two-letter and three-letter ISO codes for each country.

    use_local_copy : if True, the table is read from the saved HTML page instead of the web site
                     (for the case when the URL is unavailable).

    Returns a dataframe with 'ISO SHORT' and 'ISO LONG' columns, sorted by country name,
    with the country names converted to upper case in the index.
    '''
    ### Source selection (saved page or online extraction):
    str_codes_source = 'Data_Files/Source_Files/countrycode.html' if use_local_copy else 'https://countrycode.org/'
    ### The first table of the page holds the codes:
    df_source_table = pd.read_html(str_codes_source, index_col = 'COUNTRY')[0]
    ### 'ISO CODES' column stores both codes as a single ' / ' separated string:
    df_iso_codes = df_source_table['ISO CODES'].str.split(' / ', expand = True)
    df_iso_codes.columns = ['ISO SHORT', 'ISO LONG']
    df_iso_codes = df_iso_codes.sort_index()
    df_iso_codes.index = df_iso_codes.index.str.upper()
    ### Results output:
    return df_iso_codes

In [6]:
### DEFINING EXTRACTION UNIVERSE DATA FROM MS EXCEL SOURCE (TO BE IGNORED IN PRODUCT CODE)

def ison_membership_converting(str_path_universe, date_end, bool_daily = False, int_backfill_months = 0):
    '''
    Reads the universe membership history from the 'Switchers' sheet of an MS Excel workbook
    and converts it to a series of market labels ('DM' / 'EM' / 'FM') by date and country.

    str_path_universe   : path to the workbook;
    date_end            : last date of the resulting history (each country is forward filled up to it);
    bool_daily          : if True, the business-month-end history is additionally resampled to business days;
    int_backfill_months : number of business-month-ends to add before the first date of each region
                          for the countries that are members of the region at that first date (0 : no backfilling).

    Returns a series named 'Market' with ('Date', 'Country') MultiIndex.
    '''
    ### Defining business-month-end reindexation on country level:
    def country_modify(ser_raw_country, date_end):
        ### Level 0 is dropped to leave dates only, then the last value of each month is moved to the business-month-end:
        ser_res_country = ser_raw_country.droplevel(0).resample('MS').last().resample('BM').last()
        ### The history is extended from the first country date to date_end and gaps are forward filled:
        range_country = pd.date_range(ser_res_country.index[0], date_end, freq = 'BM')
        return ser_res_country.reindex(range_country).ffill()
    ### Markets encoding table (0 is the code the missing values are filled with below, so it is translated back to NaN):
    dict_markets = {50 : 'DM', 57 : 'EM', 504 : 'FM', 0: np.NaN}
    ### Loading source file:
    ### NOTE(review): default NA markers are switched off and 'NA' is absent from the custom list - presumably to keep 'NA' country code readable; confirm
    df_raw_universe = pd.read_excel(engine = 'openpyxl', io = str_path_universe, sheet_name = 'Switchers', header = 0, parse_dates = True, index_col = [0, 1],
                                 na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND',
                                             '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null'], keep_default_na = False)
    ### Converting source file:
    df_raw_universe.index.names = ['Country', 'Date']
    ser_raw_universe = df_raw_universe['Region']
    ser_raw_universe.fillna(0, inplace = True)
    ser_raw_universe.name = 'Market'
    ### By country reindexation and translation:
    ser_res_universe = ser_raw_universe.groupby('Country').apply(country_modify, date_end)
    ser_res_universe.index.names = ['Country', 'Date']
    ### Numeric codes are replaced with market labels and the index is rearranged to ('Date', 'Country'):
    ser_res_universe = ser_res_universe.replace(dict_markets).reorder_levels([1, 0]).sort_index()
    ### Expanding membership for primary regions members by backfilling:
    if int_backfill_months:
        ### List of regions:
        list_region = list(ser_res_universe.dropna().unique())
        ### Initialising of collection of series with backfilled data for each region:
        list_ison_backfill = []
        ### Regions looping:
        for iter_region in list_region:
            ### Defining start of region date (date part of the first index label having this region):
            date_first_valid = ser_res_universe.loc[ser_res_universe == iter_region].first_valid_index()[0]
            ### Creating dates index to backfilling (business-month-ends preceding the start date, the start date itself is excluded):
            idx_date_backfill = pd.date_range(end = date_first_valid, periods = int_backfill_months + 1, freq = 'BM')[: -1]
            ### Creating primary countries index to backfilling (region members at the start date):
            idx_region_backfill = ser_res_universe.loc[ser_res_universe == iter_region].loc[date_first_valid, All].index.get_level_values('Country')
            ### Creating full index:
            idx_ison_backfill = pd.MultiIndex.from_product([idx_date_backfill, idx_region_backfill])
            ### Series with backfilled data:
            list_ison_backfill.append(pd.Series(iter_region, index = idx_ison_backfill))
        ### Combination of backfilled series and original ISON data (original values have priority where both exist):
        ser_res_universe = ser_res_universe.combine_first(pd.concat(list_ison_backfill, axis = 0)).sort_index()
        ser_res_universe.index.names = ['Date', 'Country']
    ### Converting to daily frequency (each country is resampled to business days with forward filling):
    if bool_daily:
        ser_res_universe = ser_res_universe.reset_index('Country').groupby('Country').resample('B').ffill()['Market'].swaplevel().sort_index()
    ### Results output:
    ser_res_universe.name = 'Market'
    return ser_res_universe

In [7]:
### RUN EVERY TIME: COMMON DATA EXTRACTION STEPS

### World Country Codes:
df_country_codes = get_country_codes()
### ISON membership history:
ser_ison_membership = ison_membership_converting(str_path_universe, pd.to_datetime(str_date_end))
### ISON LONG IDs list:
list_ison_long = list(df_country_codes.loc[df_country_codes['ISO SHORT'].isin(ser_ison_membership.index.get_level_values('Country').unique()), 'ISO LONG'].values)
### ISON current status:
ser_ison_status = ser_ison_membership.loc[str_date_end].droplevel('Date')
### ISON stats:
int_ison_number = len(list_ison_long)
list_regions = ['DM', 'EM', 'FM']
dict_ison_len = {}
dict_ison_len['Full Universe'] = int_ison_number
for iter_region in list_regions:
    dict_ison_len[iter_region] = len(ser_ison_status[ser_ison_status == iter_region])
ser_market_len = pd.Series(dict_ison_len)
ser_market_len.index.names = ['Market']    

In [8]:
### IMF DOTS: BILATERAL EXPORTS & IMPORTS (MILLIONS OF USD)

In [15]:
### IMF DOTS: GENERAL DATA PREPARATION

### Constants:
All = slice(None)
dict_request_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
str_imf_base_url = 'http://dataservices.imf.org/REST/SDMX_JSON.svc/'
str_imf_dataflow_add = 'DataFlow'
str_imf_datastructure_add = 'DataStructure/'
str_imf_codelist_add = 'CodeList/'
str_imf_dataset_add = 'CompactData/'
int_seconds_to_sleep = 6
int_imf_country_limit = 30

In [10]:
### IMF DOTS: REQUESTS SESSION INITIALIZING

### One session object is reused by the metadata requests of the following cells:
request_session = requests.Session()
### For avoiding data request errors from IMF Data Service:
request_session.headers.update(dict_request_headers)

In [11]:
### IMF DOTS: DATAFLOW SEARCHING

### Receiving the list of all available dataflows:
str_imf_dataflow_url = str_imf_base_url + str_imf_dataflow_add
obj_imf_dataflow_list = request_session.get(str_imf_dataflow_url).json()
df_imf_dataflow = pd.DataFrame(obj_imf_dataflow_list['Structure']['Dataflows']['Dataflow'])
### Dataflow description is stored under '#text' key of the 'Name' field:
ser_imf_dataflow_text = df_imf_dataflow['Name'].apply(pd.Series)['#text']
df_imf_dataflow = df_imf_dataflow.assign(Description = ser_imf_dataflow_text.values)
df_imf_dataflow = df_imf_dataflow[['@id', 'Description']]
ser_imf_dataflow = df_imf_dataflow.set_index('@id', drop = True).squeeze()
### Searching DataFlow code for further requests (the first dataflow with description ending with '(DOTS)', 'DS-' part of the ID is removed):
ser_is_dots_dataflow = ser_imf_dataflow.str.endswith('(DOTS)')
str_imf_dots_id = ser_imf_dataflow[ser_is_dots_dataflow].index[0].replace('DS-', '')
print(str_imf_dots_id)

DOT


In [12]:
### IMF DOTS: DATASTRUCTURE SEARCHING

### Requesting the structure of the DOTS dataflow:
str_imf_dots_structure_url = str_imf_base_url + str_imf_datastructure_add + str_imf_dots_id
obj_imf_dots_structure = request_session.get(str_imf_dots_structure_url).json()
list_imf_dots_dimensions = obj_imf_dots_structure['Structure']['KeyFamilies']['KeyFamily']['Components']['Dimension']
list_imf_dots_param_columns = ['@conceptRef', '@codelist', '@isFrequencyDimension']
### Receiving DataFlow parameters and code lists for each of them:
df_imf_dots_params = pd.DataFrame(list_imf_dots_dimensions)[list_imf_dots_param_columns]
print(df_imf_dots_params)

        @conceptRef                @codelist @isFrequencyDimension
0  FREQ              CL_FREQ                  true                
1  REF_AREA          CL_AREA_DOT              NaN                 
2  INDICATOR         CL_INDICATOR_DOT         NaN                 
3  COUNTERPART_AREA  CL_COUNTERPART_AREA_DOT  NaN                 


In [13]:
### IMF DOTS: CODES DESCRIPTION SEARCHING

### Only the code list at position 2 of the parameters table is requested and displayed:
for int_counter, str_param_code in enumerate(df_imf_dots_params['@codelist']):
    if (int_counter != 2):
        continue
    time.sleep(int_seconds_to_sleep)
    str_imf_codelist_url = str_imf_base_url + str_imf_codelist_add + str_param_code
    obj_imf_dots_param = request_session.get(str_imf_codelist_url).json()
    df_imf_dots_param = pd.DataFrame(obj_imf_dots_param['Structure']['CodeLists']['CodeList']['Code'])
    ### Receiving values for each code list (code description is stored under '#text' key):
    ser_imf_dots_param_text = df_imf_dots_param['Description'].apply(pd.Series)['#text']
    df_imf_dots_param = df_imf_dots_param.assign(Text = ser_imf_dots_param_text.values)[['@value', 'Text']]
    print(int_counter, ':', df_imf_dots_params['@conceptRef'].iloc[int_counter], ':', str_param_code, ':')
    display(df_imf_dots_param)

### Request parameters: frequency code ('M' is used, 'A' and 'B' are the alternatives left by the author):
str_dots_freq = 'M'
### Indicators to work with:
list_dots_indicator = ['TXG_FOB_USD', 'TMG_CIF_USD', 'TMG_FOB_USD', 'TBG_USD']
### Sorted country codes taken from the second level of the universe index:
list_ison_countries = sorted(str(iter_country) for iter_country in ser_ison_membership.index.get_level_values(1).unique())

2 : INDICATOR : CL_INDICATOR_DOT :


Unnamed: 0,@value,Text
0,TXG_FOB_USD,"Goods, Value of Exports, Free on board (FOB), US Dollars"
1,TMG_CIF_USD,"Goods, Value of Imports, Cost, Insurance, Freight (CIF), US Dollars"
2,TMG_FOB_USD,"Goods, Value of Imports, Free on board (FOB), US Dollars"
3,TBG_USD,"Goods, Value of Trade Balance, US Dollars"


In [None]:
### IMF DOTS: INDICATORS EXTRACTION: WORLD EXPORT

dict_dots_bilateral = {} # Global container
str_dots_const_url = str_imf_base_url + str_imf_dataset_add + str_imf_dots_id + '/' # Beginning of request URL
### Export Indicator defining:
str_dots_indicator = 'TXG_FOB_USD'
### Session initializing:
request_session = requests.Session()
request_session.headers.update(dict_request_headers)
### List of per-reporter dataframes for future concatenation:
list_dots_bilateral = []
### Looping for reporter country groups (negated floor division works as ceiling, so the last incomplete group is not lost):
for int_ison_reporter_part in range(0, - (len(list_ison_countries) // ( - int_imf_country_limit))):
    int_part_start = int_ison_reporter_part * int_imf_country_limit
    int_part_end = int_part_start + int_imf_country_limit
    str_dots_reporters = '+'.join(list_ison_countries[int_part_start : int_part_end])
    ### World as a partner:
    str_dots_partners = 'W00'
    ### Generating complete request URL:
    str_dots_full_url = str_dots_const_url + '.'.join([str_dots_freq, str_dots_reporters, str_dots_indicator, str_dots_partners])
    ### Receiving DOTS dataset from IMF API:
    print(str_dots_indicator, '(WORLD) :', int_part_start, '-', int_part_end)
    obj_dots_set = request_session.get(str_dots_full_url)
    ### Data reading as JSON:
    dict_dots_set = json.loads(obj_dots_set.text)
    ### Group without data has no 'Series' key (the same check is made in the bilateral extraction cells):
    if ('Series' not in dict_dots_set['CompactData']['DataSet']):
        continue
    list_dots_series = dict_dots_set['CompactData']['DataSet']['Series']
    ### Single 'Obs' comes as a dictionary instead of a list (handled below), the same is assumed for a single 'Series':
    if isinstance(list_dots_series, dict):
        list_dots_series = [list_dots_series]
    ### Converting each dataset to dataframe and its munging:
    for dict_dots_pair in list_dots_series:
        ### Series without observations has nothing to convert:
        if ('Obs' not in dict_dots_pair):
            continue
        list_dots_obs = dict_dots_pair['Obs']
        if not isinstance(list_dots_obs, list):
            list_dots_obs = [list_dots_obs]
        ### Data extracting and munging (reindexing adds an empty column when value or status marker is absent in the response):
        df_dots_bilateral = pd.DataFrame(list_dots_obs).reindex(columns = ['@TIME_PERIOD', '@OBS_VALUE', '@OBS_STATUS'])
        df_dots_bilateral.columns = ['Date', 'Value', 'Status']
        df_dots_bilateral = df_dots_bilateral.assign(Reporter_ID = dict_dots_pair['@REF_AREA'],
                                                     Partner_ID = dict_dots_pair['@COUNTERPART_AREA'])
        list_dots_bilateral.append(df_dots_bilateral)
### Flow level data aggregation (period start date is moved to the business-month-end):
df_dots_indicator = pd.concat(list_dots_bilateral, axis = 0, ignore_index = True)
df_dots_indicator['Date'] = pd.to_datetime(df_dots_indicator['Date']) + pd.offsets.BMonthEnd()
ser_dots_indicator = df_dots_indicator.set_index(['Date', 'Reporter_ID']).sort_index().drop(['Partner_ID', 'Status'], axis = 1).squeeze().astype(float)
ser_dots_indicator.index.names = ['Date', 'Reporter']
ser_dots_indicator.name = 'World_Export'
### Dataset saving:
ser_dots_indicator.to_hdf(path_or_buf = str_path_imf_dots_world, key = str_full_imf_dots_world, mode = 'w', format = 'fixed')
gc.collect()

In [None]:
### IMF DOTS: EXPORT DATA EXTRACTION: BILATERAL FLOWS

dict_dots_bilateral = {} # Global container
str_dots_const_url = str_imf_base_url + str_imf_dataset_add + str_imf_dots_id + '/' # Beginning of request URL
### Export Indicator is the first one in the indicators list:
str_dots_indicator = list_dots_indicator[0]
### Session initializing:
request_session = requests.Session()
request_session.headers.update(dict_request_headers)
### List of bilateral dataframes for future concatenation:
list_dots_bilateral = []
### Looping over reporters:
for str_reporter in list_ison_countries:
    ### Generating complete request URL (counterpart part is omitted, presumably to receive all available partners - confirm against the API description):
    str_dots_full_url = str_dots_const_url + '.'.join([str_dots_freq, str_reporter, str_dots_indicator])
    ### Receiving DOTS dataset from IMF API:
    print(str_reporter, ' / ', str_dots_indicator)
    obj_dots_set = request_session.get(str_dots_full_url)
    ### Data reading as JSON:
    dict_dots_set = json.loads(obj_dots_set.text)
    ### Reporter without data has no 'Series' key:
    if ('Series' not in dict_dots_set['CompactData']['DataSet']):
        continue
    list_dots_series = dict_dots_set['CompactData']['DataSet']['Series']
    ### Single 'Obs' comes as a dictionary instead of a list (handled below), the same is assumed for a single 'Series':
    if isinstance(list_dots_series, dict):
        list_dots_series = [list_dots_series]
    ### Converting each bilateral dataset to dataframe and its munging:
    for dict_dots_pair in list_dots_series:
        ### Series without observations has nothing to convert:
        if ('Obs' not in dict_dots_pair):
            continue
        list_dots_obs = dict_dots_pair['Obs']
        if not isinstance(list_dots_obs, list):
            list_dots_obs = [list_dots_obs]
        ### Data extracting and munging (reindexing adds an empty column when value or status marker is absent in the response):
        df_dots_bilateral = pd.DataFrame(list_dots_obs).reindex(columns = ['@TIME_PERIOD', '@OBS_VALUE', '@OBS_STATUS'])
        df_dots_bilateral.columns = ['Date', 'Value', 'Status']
        df_dots_bilateral = df_dots_bilateral.assign(Reporter_ID = dict_dots_pair['@REF_AREA'],
                                                     Partner_ID = dict_dots_pair['@COUNTERPART_AREA'])
        list_dots_bilateral.append(df_dots_bilateral)
### Flow level data aggregation (period start date is moved to the business-month-end):
df_dots_indicator = pd.concat(list_dots_bilateral, axis = 0, ignore_index = True)
df_dots_indicator['Date'] = pd.to_datetime(df_dots_indicator['Date']) + pd.offsets.BMonthEnd()
### Only partners having a two-letter ISO code in the country codes table are kept:
df_dots_indicator = df_dots_indicator[df_dots_indicator['Partner_ID'].isin(df_country_codes['ISO SHORT'].values)].drop('Status', axis = 1)
print('Unique partners number:', len(df_dots_indicator['Partner_ID'].unique()))
df_dots_indicator = df_dots_indicator.rename({'Reporter_ID': 'Reporter', 'Partner_ID': 'Partner'}, axis = 1)
### Data saving:
### NOTE(review): float16 keeps about 3 significant digits and turns values above 65504 into inf - confirm this is enough for the stored flows
ser_dots_export = df_dots_indicator.set_index(['Date', 'Reporter', 'Partner'])['Value'].sort_index().astype('float16')
del df_dots_indicator
gc.collect()
ser_dots_export.to_hdf(path_or_buf = str_path_imf_dots_dataset, key = str_key_imf_dots_export, mode = 'w', format = 'fixed')

In [None]:
### IMF DOTS: IMPORT DATA EXTRACTION: BILATERAL FLOWS

gc.collect()
dict_dots_bilateral = {} # Global container
str_dots_const_url = str_imf_base_url + str_imf_dataset_add + str_imf_dots_id + '/' # Beginning of request URL
### Import Indicator is the second one in the indicators list:
str_dots_indicator = list_dots_indicator[1]
### Session initializing:
request_session = requests.Session()
request_session.headers.update(dict_request_headers)
### List of bilateral dataframes for future concatenation:
list_dots_bilateral = []
### Looping over universe countries (each of them is requested as a counterpart of the import flow here):
for str_reporter in list_ison_countries:
    ### Generating complete request URL (reporting area part is left empty, presumably to receive all available importers - confirm against the API description):
    str_dots_full_url = str_dots_const_url + '.'.join([str_dots_freq, '', str_dots_indicator, str_reporter])
    ### Receiving DOTS dataset from IMF API:
    print(str_reporter, ' / ', str_dots_indicator)
    obj_dots_set = request_session.get(str_dots_full_url)
    ### Data reading as JSON:
    dict_dots_set = json.loads(obj_dots_set.text)
    ### Country without data has no 'Series' key:
    if ('Series' not in dict_dots_set['CompactData']['DataSet']):
        continue
    list_dots_series = dict_dots_set['CompactData']['DataSet']['Series']
    ### Single 'Obs' comes as a dictionary instead of a list (handled below), the same is assumed for a single 'Series':
    if isinstance(list_dots_series, dict):
        list_dots_series = [list_dots_series]
    ### Converting each bilateral dataset to dataframe and its munging:
    for dict_dots_pair in list_dots_series:
        ### Series without observations has nothing to convert:
        if ('Obs' not in dict_dots_pair):
            continue
        list_dots_obs = dict_dots_pair['Obs']
        if not isinstance(list_dots_obs, list):
            list_dots_obs = [list_dots_obs]
        ### Data extracting and munging (reindexing adds an empty column when value or status marker is absent in the response):
        df_dots_bilateral = pd.DataFrame(list_dots_obs).reindex(columns = ['@TIME_PERIOD', '@OBS_VALUE', '@OBS_STATUS'])
        df_dots_bilateral.columns = ['Date', 'Value', 'Status']
        df_dots_bilateral = df_dots_bilateral.assign(Reporter_ID = dict_dots_pair['@REF_AREA'],
                                                     Partner_ID = dict_dots_pair['@COUNTERPART_AREA'])
        list_dots_bilateral.append(df_dots_bilateral)
### Flow level data aggregation (period start date is moved to the business-month-end):
df_dots_indicator = pd.concat(list_dots_bilateral, axis = 0, ignore_index = True)
df_dots_indicator['Date'] = pd.to_datetime(df_dots_indicator['Date']) + pd.offsets.BMonthEnd()
### Only importers having a two-letter ISO code in the country codes table are kept:
df_dots_indicator = df_dots_indicator[df_dots_indicator['Reporter_ID'].isin(df_country_codes['ISO SHORT'].values)].drop('Status', axis = 1)
print('Unique reporters number:', len(df_dots_indicator['Reporter_ID'].unique()))
### Flow inversion: importing country becomes the Partner and its counterpart becomes the Reporter (exporter's point of view):
df_dots_indicator = df_dots_indicator.rename({'Reporter_ID': 'Partner', 'Partner_ID': 'Reporter'}, axis = 1)
### Data saving:
### NOTE(review): float16 keeps about 3 significant digits and turns values above 65504 into inf - confirm this is enough for the stored flows
ser_dots_import_inv = df_dots_indicator.set_index(['Date', 'Reporter', 'Partner'])['Value'].sort_index().astype('float16')
del df_dots_indicator
gc.collect()
ser_dots_import_inv.to_hdf(path_or_buf = str_path_imf_dots_dataset, key = str_key_imf_dots_import_inverted, mode = 'a', format = 'fixed')

In [8]:
### IMF DOTS: EXPORT & IMPORT DATA AGGREGATION: DATASETS LOADING

gc.collect()
### Both flows are stored in the same HDF5 file under separate keys:
ser_dots_export, ser_dots_import_inv = [pd.read_hdf(path_or_buf = str_path_imf_dots_dataset, key = iter_key)
                                        for iter_key in [str_key_imf_dots_export, str_key_imf_dots_import_inverted]]
### Side by side table: reported Export and inverted Import for each Date / Reporter / Partner:
list_dots_flows = [ser_dots_export, ser_dots_import_inv]
df_export_aug = pd.concat(list_dots_flows, axis = 1, names = 'Source Flow', keys = ['Export', 'Import'])

In [9]:
### IMF DOTS: CIF COEFFICIENTS CALCULATION

gc.collect()
### Bounds to filter bilateral Import to Export ratio before median calculation:
flo_lower_bound = 1.0
flo_upper_bound = 2.0
### Bilateral median calculation procedure:
def get_obs_median(df_comm):
    '''
    Calculates the median of Import to Export ratio for a table with 'Import' and 'Export' columns.
    Only the ratios lying within [flo_lower_bound, flo_upper_bound] (both ends included) take part in the calculation.
    Returns the median rounded to 2 digits (NaN when no ratio passes the filter).
    '''
    ### Import to Export ratio:
    ser_obs_ratio = df_comm['Import'].div(df_comm['Export'])
    ### Ratio filtering (missing ratios do not pass it too):
    ser_is_within_bounds = ser_obs_ratio.between(flo_lower_bound, flo_upper_bound)
    ### Filtered timeseries median as a result:
    return round(ser_obs_ratio[ser_is_within_bounds].median(), 2)

### Calculation of CIF coefficient for each Reporter / Partner pair:
ser_cif_median = df_export_aug.groupby(['Reporter', 'Partner']).apply(get_obs_median)
### Pairs without own coefficient receive the median of the calculated pair coefficients:
flo_cif_default = ser_cif_median.median()
ser_cif_median = ser_cif_median.fillna(flo_cif_default).rename('CIF_Coefficient')

In [12]:
### IMF DOTS: IMPORT DATA CORRECTION:

### Adding CIF coefficients to dataset (each Reporter / Partner coefficient is attached to all dates of the pair):
df_export_cif = pd.merge(df_export_aug, ser_cif_median, left_index = True, right_index = True)
del df_export_aug
gc.collect()
df_export_cif = df_export_cif.reorder_levels(['Date', 'Reporter', 'Partner'])
### Import correction (Import is divided by the pair coefficient, source columns are not needed after that):
ser_cif_divisor = df_export_cif['CIF_Coefficient'].astype('float16')
df_export_cif = df_export_cif.assign(Import_Corrected = df_export_cif['Import'] / ser_cif_divisor)
df_export_cif = df_export_cif.drop(['Import', 'CIF_Coefficient'], axis = 1)

In [None]:
### IMF DOTS: SIMPLE COMBINATION (OLD VERSION)

### Combining Export & Import data (reported Export has priority, corrected Import fills its gaps):
df_export_cif['Export_Augmented'] = df_export_cif['Export'].combine_first(df_export_cif['Import_Corrected'])
### CIF coefficients column is already dropped by the import correction cell, so its absence is not an error here:
df_export_cif.drop('CIF_Coefficient', axis = 1, inplace = True, errors = 'ignore')
df_export_cif = df_export_cif.sort_index()

In [21]:
### IMF DOTS: IMPORT DATA INCORPORATION: RESULTS SAVING (OLD VERSION)

### Dataset saving:
df_export_cif[['Export', 'Export_Augmented']].to_hdf(path_or_buf = str_path_imf_dots_combined, key = str_key_imf_dots_full, mode = 'w', format = 'fixed')
### df_export_cif is not deleted here: the quality ratios cells below read it, so the deletion broke a top-to-bottom run
gc.collect()

11

In [15]:
### IMF DOTS: DATA AGGREGATION: EXPORT QUALITY RATIOS

gc.collect()

### Defining similarity for exporters by date
def get_exporter_ratio(df_group):
    '''
    Measures the discrepancy between reported Export and corrected Import for a table
    with 'Export' and 'Import_Corrected' columns (missing flows are treated as zero flows).

    Absolute differences are capped by the maximal Export value, summed up,
    divided by the Export sum and by the number of rows.
    Returns NaN when the Export sum is not positive.
    The table passed in is not modified.
    '''
    ### Missing flows are replaced with zeros in a separate table to keep the source group untouched:
    df_filled = df_group[['Export', 'Import_Corrected']].fillna(0.0)
    flo_export_sum = df_filled['Export'].sum()
    if (flo_export_sum > 0.0):
        ### Each difference is limited by the biggest Export value of the group:
        ser_abs_diff = (df_filled['Export'] - df_filled['Import_Corrected']).abs().clip(upper = df_filled['Export'].max())
        flo_result = ser_abs_diff.sum() / flo_export_sum / len(df_filled)
    else:
        flo_result = np.nan
    return flo_result

### Similarity values calculation (one ratio for each exporter at each date):
ser_exporter_ratio = df_export_cif.groupby(['Date', 'Reporter']).apply(get_exporter_ratio).rename('Exporter_Ratio')

  # Remove the CWD from sys.path while we load stuff.


In [28]:
### IMF DOTS: DATA AGGREGATION: SIMILARITY TEST

### Extreme ratio values and the Date / Reporter pairs they belong to:
for flo_extreme, tup_extreme in [(ser_exporter_ratio.min(), ser_exporter_ratio.idxmin()),
                                 (ser_exporter_ratio.max(), ser_exporter_ratio.idxmax())]:
    print(round(flo_extreme, 4), '/', tup_extreme)

#display(df_export_cif.loc[('2000-06-30', 'US', All), :].dropna())
#display(df_export_cif.loc[('1960-01-29', 'MA', All), :])

0.0 / (Timestamp('2000-06-30 00:00:00'), 'US')
1.0 / (Timestamp('1960-01-29 00:00:00'), 'MA')


In [29]:
### IMF DOTS: DATA AGGREGATION: IMPORT QUALITY RATIOS

gc.collect()
### Defining similarity for importers by date
def get_importer_ratio(df_group):
    '''
    Measures the discrepancy between reported Export and corrected Import for a table
    with 'Export' and 'Import_Corrected' columns (missing flows are treated as zero flows).

    Absolute differences are capped by the maximal corrected Import value, summed up,
    divided by the corrected Import sum and by the number of rows.
    Returns NaN when the corrected Import sum is not positive.
    The table passed in is not modified.
    '''
    ### Missing flows are replaced with zeros in a separate table to keep the source group untouched:
    df_filled = df_group[['Export', 'Import_Corrected']].fillna(0.0)
    flo_import_sum = df_filled['Import_Corrected'].sum()
    if (flo_import_sum > 0.0):
        ### Each difference is limited by the biggest corrected Import value of the group:
        ser_abs_diff = (df_filled['Export'] - df_filled['Import_Corrected']).abs().clip(upper = df_filled['Import_Corrected'].max())
        flo_result = ser_abs_diff.sum() / flo_import_sum / len(df_filled)
    else:
        flo_result = np.nan
    return flo_result

### Similarity values calculation (one ratio for each importer at each date):
ser_importer_ratio = df_export_cif.groupby(['Date', 'Partner']).apply(get_importer_ratio).rename('Importer_Ratio')

  # Remove the CWD from sys.path while we load stuff.


In [35]:
### IMF DOTS: DATA AGGREGATION: SIMILARITY TEST

### Extreme ratio values and the Date / Partner pairs they belong to:
for flo_extreme, tup_extreme in [(ser_importer_ratio.min(), ser_importer_ratio.idxmin()),
                                 (ser_importer_ratio.max(), ser_importer_ratio.idxmax())]:
    print(round(flo_extreme, 4), '/', tup_extreme)

#display(df_export_cif.loc[('1981-04-30', All, 'KN'), :])
#display(df_export_cif.loc[('1966-09-30', All, 'NG'), :])

0.0 / (Timestamp('1981-04-30 00:00:00'), 'KN')
1.0007 / (Timestamp('1966-09-30 00:00:00'), 'NG')


In [37]:
### IMF DOTS: DATA AGGREGATION: ADDING RATIOS

gc.collect()

### Both quality ratios are attached to the flows table, the resulting column is prepared empty:
df_dots_to_augment = (df_export_cif.join(ser_exporter_ratio)
                                   .join(ser_importer_ratio)
                                   .assign(Export_Augmented = np.NaN)
                                   .reorder_levels(['Date', 'Reporter', 'Partner']))

In [None]:
### IMF DOTS: DATA AGGREGATION: CONDITIONAL REPLACING

gc.collect()
def augment_by_date(df_date, int_option = -1):
    '''
    Fills 'Export_Augmented' column of a bilateral flows table having 'Export', 'Import_Corrected',
    'Exporter_Ratio', 'Importer_Ratio' and 'Export_Augmented' columns. Options:
       -1 : Replace NaN or zero Export values with corrected Import values unconditionally
        0 : Replace NaN Export values when Exporter's Ratio is NaN or Exporter's Ratio > Importer's Ratio
        1 : Replace NaN or zero Export values when Exporter's Ratio is NaN or Exporter's Ratio > Importer's Ratio
        2 (and any other value) : Replace any Export values when Exporter's Ratio > Importer's Ratio
            (zero flows are treated as NaN; NaN ratios are replaced with 999.0 for exporter and 1000.0 for importer,
             so Export is taken when both ratios are missing)
    Returns the modified copy of the table (source columns changed during preparation are returned changed too).
    The table passed in is not modified.
    '''
    ### Working on a copy to keep the group passed by groupby untouched:
    df_result = df_date.copy()
    ### Replacing zero Export values with NaN (all the options except option 0):
    if (int_option != 0):
        df_result.loc[df_result['Export'] == 0.0, 'Export'] = np.nan
    if (int_option == -1):
        ### Export has priority, corrected Import fills its gaps:
        df_result['Export_Augmented'] = df_result['Export'].combine_first(df_result['Import_Corrected'])
    elif (int_option in (0, 1)):
        ser_no_export = df_result['Export'].isna()
        ### Fill resulting column with not NaN Export values:
        df_result.loc[~ser_no_export, 'Export_Augmented'] = df_result.loc[~ser_no_export, 'Export'].values
        ### Fill resulting column with Import value if Export value is NaN and Exporter Ratio is NaN or bigger than Importer Ratio
        ### (comparison with a NaN ratio is always False, so no separate not NaN checks are needed):
        ser_use_import = ser_no_export & (df_result['Exporter_Ratio'].isna() | (df_result['Exporter_Ratio'] > df_result['Importer_Ratio']))
        df_result.loc[ser_use_import, 'Export_Augmented'] = df_result.loc[ser_use_import, 'Import_Corrected'].values
    else:
        ### Replacing zero Import values with NaN:
        df_result.loc[df_result['Import_Corrected'] == 0.0, 'Import_Corrected'] = np.nan
        ### Ratios preparation (missing ratios are replaced with big numbers, the exporter's one is smaller to win the comparison):
        df_result['Exporter_Ratio'] = df_result['Exporter_Ratio'].fillna(999.0)
        df_result['Importer_Ratio'] = df_result['Importer_Ratio'].fillna(1000.0)
        ### Ratios comparison (no NaN ratios are left, so each row falls into exactly one of two groups):
        ser_use_import = df_result['Exporter_Ratio'] > df_result['Importer_Ratio']
        df_result.loc[~ser_use_import, 'Export_Augmented'] = df_result.loc[~ser_use_import, 'Export'].values
        df_result.loc[ser_use_import, 'Export_Augmented'] = df_result.loc[ser_use_import, 'Import_Corrected'].values
    return df_result

### Augmentation options to calculate (options 0 and 1 are supported by augment_by_date but are switched off here):
list_augment_options = [-1, 2]
dict_dots_augmented = {}
for iter_option in list_augment_options:
    dict_dots_augmented[iter_option] = df_dots_to_augment.groupby('Date').apply(augment_by_date, iter_option)

In [43]:
### IMF DOTS: TOTAL DATA AGGREGATION: RESULTS CONSOLIDATION TO DATAFRAME AND SAVING

gc.collect()

### Columns of the comparison table: reported Export without zero values and two augmentation results:
ser_exports_only = df_dots_to_augment['Export'].replace({0.0: np.NaN})
list_augmentation_series = [ser_exports_only, dict_dots_augmented[-1]['Export_Augmented'], dict_dots_augmented[2]['Export_Augmented']]
list_augmentation_keys = ['Exports_Only', 'Unconditional', 'Option_2']
df_augmentation_way = pd.concat(list_augmentation_series, axis = 1, keys = list_augmentation_keys, names = 'Augmentation_Way')
### Comparison table saving:
df_augmentation_way.to_hdf(path_or_buf = str_path_imf_dots_options, key = str_key_do_total_imf_dots_options, mode = 'w', format = 'fixed')