In [1]:
### COUNTRY RISKS SOURCE DATA EXPORTS

In [2]:
### MODULES IMPORT

import pandas as pd
import numpy as np
import openpyxl
from datetime import date, datetime
import math
import os

In [3]:
### GENERAL PARAMETERS

### Constants:
### Full-slice shortcut used in MultiIndex .loc selections across the notebook:
All = slice(None)
### Common folder of all source files and saved results (single place to edit when the data is relocated):
str_path_country_risks_dir = 'Data_Files/Source_Files/Country_Risks/'
### Universe path:
str_path_universe = str_path_country_risks_dir + 'acadian_universe.xlsx'
### PRS datasources paths:
str_path_prs_full = str_path_country_risks_dir + 'PRS_full.xlsx'
str_path_prs_pillars_only = str_path_country_risks_dir + 'PRS_pillars_only.xlsx'
str_path_prs_political_risk_pillar = str_path_country_risks_dir + 'PRS_political_risk_pillar.xlsx'
### PRS results saving (hdf file and keys inside it):
str_path_prs_hdf = str_path_country_risks_dir + 'PRS_loaded.h5'
str_key_prs_full_converted = 'prs_full_converted'
str_key_prs_pillars_only_converted = 'prs_pillars_only_converted'
str_key_prs_political_risk_pillar_converted = 'prs_political_risk_pillar_converted'
### Continuum datasources paths:
str_path_continuum_composite = str_path_country_risks_dir + 'Continuum_Composite_Indicators.xlsx'
str_path_continuum_gp_pillar = str_path_country_risks_dir + 'Continuum_Growth_Potential_pillar.xlsx'
str_path_continuum_si_pillar = str_path_country_risks_dir + 'Continuum_Social_Inclusion_pillar.xlsx'
str_path_continuum_politics = str_path_country_risks_dir + 'Continuum_Politics.xlsx'
### Continuum results saving (hdf file and keys inside it):
str_path_continuum_hdf = str_path_country_risks_dir + 'Continuum_loaded.h5'
str_key_continuum_composite_converted = 'continuum_composite_indicators_converted'
str_key_continuum_gp_pillar_converted = 'continuum_gp_pillar_converted'
str_key_continuum_si_pillar_converted = 'continuum_si_pillar_converted'
str_key_continuum_politics_converted = 'continuum_politics_converted'

In [4]:
### DEFINING UNIVERSE DATA EXTRACTION FROM MS EXCEL SOURCE

def ison_membership_converting(str_path_universe, date_end = None, bool_daily = False, int_backfill_months = 0):
    """
    Load the universe membership table from an MS Excel file and convert it to a Date / Country vector of market labels.

    Parameters:
        str_path_universe: path to the MS Excel file; its first sheet is read with the first two columns as
            (Country, Date) index, and the 'Region' column is used as the membership code;
        date_end: last date of the business-month-end range each country is extended (forward filled) to;
            None (default) means the moment of the call;
        bool_daily: if True, the monthly result is resampled to business days with forward filling inside each country;
        int_backfill_months: number of business-month-ends to prepend before the first date of each region; the prepended
            dates are filled with the region label for the countries that are members of the region on that first date.
    Returns:
        pd.Series named 'Market' indexed by ('Date', 'Country') with values 'DM' / 'EM' / 'FM' or NaN.
    """
    ### End date resolving at call time (a datetime.today() default would be frozen at the moment of function definition):
    if date_end is None:
        date_end = datetime.today()
    ### Defining business-month-end reindexation on country level:
    def country_modify(ser_raw_country, date_end):
        ser_res_country = ser_raw_country.droplevel(0).resample('MS').last().resample('BM').last()
        range_country = pd.date_range(ser_res_country.index[0], date_end, freq = 'BM')
        return ser_res_country.reindex(range_country).ffill()
    ### Markets encoding table (np.nan is used since the np.NaN alias is absent in NumPy 2.0+):
    dict_markets = {50 : 'DM', 57 : 'EM', 504 : 'FM', 0: np.nan}
    ### Loading source file:
    df_raw_universe = pd.read_excel(io = str_path_universe, sheet_name = 0, header = 0, parse_dates = True, index_col = [0, 1],
                                 na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', 
                                             '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null'], keep_default_na = False)
    ### Converting source file (missing codes are set to 0 to be translated to NaN by the encoding table later):
    df_raw_universe.index.names = ['Country', 'Date']
    ser_raw_universe = df_raw_universe['Region'].fillna(0).rename('Market')
    ### By country reindexation and translation:
    ser_res_universe = ser_raw_universe.groupby('Country').apply(country_modify, date_end)
    ser_res_universe.index.names = ['Country', 'Date']
    ser_res_universe = ser_res_universe.replace(dict_markets).reorder_levels([1, 0]).sort_index() 
    ### Expanding membership for primary regions members by backfilling:
    if int_backfill_months:
        ### List of regions:
        list_region = list(ser_res_universe.dropna().unique())
        ### Initialising of collection of series with backfilled data for each region:
        list_ison_backfill = []
        ### Regions looping:
        for iter_region in list_region:
            ### Defining start of region date:
            date_first_valid = ser_res_universe.loc[ser_res_universe == iter_region].first_valid_index()[0]
            ### Creating dates index to backfilling (the first valid date itself is excluded):
            idx_date_backfill = pd.date_range(end = date_first_valid, periods = int_backfill_months + 1, freq = 'BM')[: -1]
            ### Creating primary countries index to backfilling:            
            idx_region_backfill = ser_res_universe.loc[ser_res_universe == iter_region].loc[date_first_valid, All].index.get_level_values('Country')
            ### Creating full index:
            idx_ison_backfill = pd.MultiIndex.from_product([idx_date_backfill, idx_region_backfill])
            ### Series with backfilled data:
            list_ison_backfill.append(pd.Series(iter_region, index = idx_ison_backfill))
        ### Combination of backfilled series and original ISON data (original values have priority):    
        ser_res_universe = ser_res_universe.combine_first(pd.concat(list_ison_backfill, axis = 0)).sort_index()  
        ser_res_universe.index.names = ['Date', 'Country']
    ### Converting to daily frequency:
    if bool_daily:
        ser_res_universe = ser_res_universe.reset_index('Country').groupby('Country').resample('B').ffill()['Market'].swaplevel().sort_index()    
    ### Results output:
    ser_res_universe.name = 'Market'
    return ser_res_universe

In [5]:
### DEFINING DATE/COUNTRY DATA VECTOR DESCRIBER FUNCTION

def _draw_presence_heatmap(ser_region_data, str_title):
    """
    Draw a striped data presence heatmap (countries by rows, dates by columns) on a new figure.

    Parameters:
        ser_region_data: series indexed by ('Date', 'Country') with NaN for absent observations;
        str_title: title of the heatmap.
    """
    ### Plotting libraries are imported locally because the import cell of the notebook does not load them:
    import matplotlib.pyplot as plt
    import seaborn as sns
    ### Countries list for figure height defining and heatmap striping:
    idx_countries = ser_region_data.index.get_level_values('Country')
    list_countries = list(idx_countries.unique())
    ### Adding shade column for future heatmap striping (values 2 and 3 alternate by country):
    dict_countries = {iter_country: iter_num % 2 + 2 for iter_num, iter_country in enumerate(list_countries)}
    df_region_shades = ser_region_data.to_frame().assign(Shade = list(map(dict_countries.get, idx_countries)))
    df_region_shades.columns = ['Data', 'Shade']
    ### Figure creating (height is at least 1 to avoid a zero-height figure for less than five countries):
    plt.figure(figsize = (15, max(1, len(list_countries) // 5)))
    ### Data / Data gives 1.0 for existing non-zero observations and NaN otherwise, so only they are painted with country shade:
    df_region_data = (df_region_shades['Data'] / df_region_shades['Data'] * df_region_shades['Shade']).unstack('Date').sort_index()
    df_region_data.columns = df_region_data.columns.strftime('%d-%m-%Y')
    ### Heatmap drawing:
    ax_heatmap = sns.heatmap(df_region_data, cbar = False, annot = False, cmap = 'binary', xticklabels = 'auto', yticklabels = True, 
                             vmin = 0.0, vmax = 6.0)
    ax_heatmap.set_title(str_title)

def date_country_vector_describer(ser_data, ser_ison):
    """
    Print completeness statistics of a Date / Country data vector and show its data presence heatmaps.

    Parameters:
        ser_data: named series whose first index levels are 'Date' and 'Country' (optionally followed by 'Market');
        ser_ison: universe membership series named 'Market' and indexed by ('Date', 'Country').
    Output:
        Statistics are printed; one heatmap is drawn for the whole DM / EM / FM universe and one for each market
        having more than five countries. Nothing is returned.
    Note: the function relies on the notebook-level constant All = slice(None).
    """
    ### Plotting library is imported locally because the import cell of the notebook does not load it:
    import matplotlib.pyplot as plt
    ### ISON countries set:
    set_ison_countries = set(ser_ison.index.get_level_values(1).unique())
    ### Vector countries set (countries with at least one observation):    
    set_vector_countries = set(ser_data.dropna().index.get_level_values(1).unique())
    int_covered_countries = len(set_vector_countries.intersection(set_ison_countries))
    ### Vector completeness:
    print('Data vector name: {}'.format(ser_data.name))
    print('Data vector completeness: {:.2%}'.format(ser_data.count() / len(ser_data.index))) 
    print('ISON countries completeness: {:.2%} ({} / {})'.format(int_covered_countries / len(set_ison_countries),
                                                                 int_covered_countries,
                                                                 len(set_ison_countries)))
    print('Absent ISON countries: [{}]'.format(str(', '.join(sorted(list(set_ison_countries - set_vector_countries))))))
    ### ISON Universe binding (if needed):
    if not ('Market' in ser_data.index.names):
        ser_data = ser_data.to_frame().join(ser_ison, how = 'left').set_index('Market', append = True).squeeze()\
                                      .loc[All, All, ['DM', 'EM', 'FM']].sort_index(level = ['Date', 'Country'])
    ### Dates for heatmap x-axis labels:
    list_idx_dates = ser_data.index.get_level_values('Date').unique()
    ### Whole universe heatmap (dates reindexation adds NaN values for absent observations):
    ser_universe_data = ser_data.loc[All, All, ['DM', 'EM', 'FM']].droplevel('Market').unstack('Country').reindex(list_idx_dates)\
                                .stack('Country', dropna = False)
    _draw_presence_heatmap(ser_universe_data, 'ISON Universe')
    ### Visualizer heatmap plotting by markets:        
    for str_region_code, ser_market_data in ser_data.groupby('Market'):
        ### Dates reindexation (adding NaN values for absent observations):
        ser_region_data = ser_market_data.droplevel('Market').unstack('Country').reindex(list_idx_dates).stack('Country', dropna = False)   
        ### Countries number to decide if heatmap is worth drawing:        
        int_fig_height = len(ser_region_data.index.get_level_values('Country').unique())
        if (int_fig_height > 5):
            _draw_presence_heatmap(ser_region_data, str_region_code)
        else:
            print('Too few countries to show heatmap for', str_region_code, '(', int_fig_height, ')')
    ### Plots showing:
    plt.show()

In [6]:
### DEFINING ISO COUNTRY CODES EXTRACTION

def get_country_codes(use_local_copy = False):
    """
    Load the country codes table and return ISO codes indexed by uppercase country names.

    Parameters:
        use_local_copy: if truthy, the saved html page is parsed instead of the https://countrycode.org/ site.
    Returns:
        pd.DataFrame with 'ISO SHORT' and 'ISO LONG' columns, made by splitting the 'ISO CODES' column of the first
        html table by ' / ', and with uppercase values of the 'COUNTRY' column as index.
    """
    ### Importing module to keep the function self-sufficient:
    import pandas as pd
    ### Source choosing (local html copy or direct link):
    str_codes_source = 'Data_Files/Source_Files/Country_Risks/countrycode.html' if use_local_copy else 'https://countrycode.org/'
    ### First table of the page loading:
    df_codes_table = pd.read_html(str_codes_source, index_col = 'COUNTRY')[0]
    ### Combined codes splitting to separate columns:
    list_iso_columns = ['ISO SHORT', 'ISO LONG']
    df_codes_table[list_iso_columns] = df_codes_table['ISO CODES'].str.split(' / ', expand = True)
    ### ISO columns extracting and country names uppercasing:
    df_iso_codes = df_codes_table[list_iso_columns]
    df_iso_codes.index = df_iso_codes.index.str.upper()
    ### Results output:
    return df_iso_codes

In [7]:
### DEFINING LOADING PRS DATA FROM MS EXCEL FILE

def get_prs(str_path_prs, bool_convert = False, list_countries = All):
    """
    Load PRS data from an MS Excel file to a Variable / Date / Country vector.

    Parameters:
        str_path_prs: path to the MS Excel file (read by openpyxl) with the first two columns as index and dates as
            the remaining column headers;
        bool_convert: if True, the vector is prepared for research: countries are renamed and united, dates are
            converted to business-month-ends, country names are replaced with short ISO codes and values are forward
            filled inside each Country / Variable pair;
        list_countries: countries to keep in the result (all countries by default).
    Returns:
        pd.Series named 'PRS' indexed by ('Variable', 'Date', 'Country') and sorted by index.
    Note: in convert mode get_country_codes() is called with default arguments, so the codes table is loaded from
        the web.
    """
    ### Countries to rename:
    dict_change_country_names = {}
    dict_change_country_names['Côte d’Ivoire'] = 'Ivory Coast'
    dict_change_country_names['Korea, South'] = 'South Korea'
    dict_change_country_names['UAE'] = 'United Arab Emirates'
    ### Countries to unite data vectors (key data fills the gaps of value data):
    dict_unite_country_data = {}
    dict_unite_country_data['Germany, West'] = 'Germany'
    dict_unite_country_data['Serbia-Montenegro'] = 'Serbia'    
    ### Source loading:
    df_prs_source = pd.read_excel(io = str_path_prs, header = 0, parse_dates = True, index_col = [0, 1], engine = 'openpyxl', 
                                  na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', 
                                               '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null'], keep_default_na = False)
    ### Dates stacking:
    ser_prs_source = df_prs_source.stack(dropna = False).reorder_levels([1, 2, 0])
    ### Datasource naming:    
    ser_prs_source.index.names = ['Variable', 'Date', 'Country']
    ser_prs_source.name = 'PRS'
    ### Type converting
    ser_prs_source = ser_prs_source.round(2)
    ### Non-table rows killing (rows with too long text in place of country name):
    ser_prs_source = ser_prs_source.loc[ser_prs_source.index.get_level_values('Country').str.len() < 20]    
    ### Sequence letter killing (text before ' (' is kept; new labels are position-aligned with the old ones;
    ### index is reassigned since the inplace option of set_levels is absent in pandas 2.0+):
    idx_variable_names = ser_prs_source.index.levels[0].str.partition(' (').get_level_values(0)
    ser_prs_source.index = ser_prs_source.index.set_levels(idx_variable_names, level = 'Variable')
    ### Research mode convertion:
    if bool_convert:       
        ### Countries renaming:
        ser_prs_source = ser_prs_source.reset_index('Country').replace(dict_change_country_names).set_index('Country', append = True).squeeze().sort_index()
        ### Countries data uniting:
        for iter_name in dict_unite_country_data:
            ser_prs_source.loc[All, All, dict_unite_country_data[iter_name]] = ser_prs_source.loc[All, All, dict_unite_country_data[iter_name]]\
                                                                               .combine_first(ser_prs_source.loc[All, All, iter_name])
            ser_prs_source = ser_prs_source.drop(labels = iter_name, level = 'Country')        
        ### Month to dates converting and indexing:
        ser_prs_source.index = ser_prs_source.index.set_levels(pd.to_datetime(ser_prs_source.index.levels[1]), level = 'Date')
        ### Dates resampling to business-month-ends:
        ser_prs_source = ser_prs_source.groupby(['Variable', 'Country']).apply(lambda ser_grouped: ser_grouped.droplevel(['Variable', 'Country']).resample('BM').last())
        ### Uppercase for countries to further interaction with ISON codes vector (Country is the second level after grouping):
        ser_prs_source.index = ser_prs_source.index.set_levels(ser_prs_source.index.levels[1].str.upper(), level = 'Country')
        ### ISO country codes loading:
        df_iso_country = get_country_codes()  
        ### Replacing country names with country ISO codes:
        ser_prs_source = ser_prs_source.reset_index('Country').replace(dict(zip(df_iso_country.index, df_iso_country['ISO SHORT']))).set_index('Country', append = True)\
                                       .squeeze()   
        ### Forward filling:
        ser_prs_source = ser_prs_source.groupby(['Country', 'Variable']).ffill()
    ### Results filtering:
    ser_prs_res = ser_prs_source.loc[All, All, list_countries].sort_index()
    ### Results output:
    return ser_prs_res

In [8]:
### DEFINING LOADING CONTINUUM DATA FROM MS EXCEL FILE

def get_continuum(str_path_cont, bool_convert = False, list_countries = All):
    """
    Load Continuum data from an MS Excel file to a data vector with 'Date' and 'Country' as the last index levels.

    Parameters:
        str_path_cont: path to the MS Excel file (read by openpyxl) with the first fifteen columns as index and dates
            as the remaining column headers;
        bool_convert: if True, the vector is prepared for research: descriptive index levels are dropped, dates are
            converted to business-quarter-ends and values are forward filled inside each Country / Indicator pair;
        list_countries: countries to keep in the result (all countries by default).
    Returns:
        pd.Series named 'Continuum' sorted by index; in convert mode it is indexed by ('Indicator', 'Date', 'Country').
    """
    ### Source loading:
    df_cont_source = pd.read_excel(io = str_path_cont, header = 0, parse_dates = True, index_col = list(range(15)), engine = 'openpyxl', 
                                   na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', 
                                                '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null'], keep_default_na = False)
    ### Removing blank lines:
    df_cont_source = df_cont_source.dropna(how = 'all')
    ### Dates stacking (the first index level is moved to the end to become 'Country'):
    ser_cont_source = df_cont_source.stack(dropna = False).reorder_levels(list(range(1, 15)) + [15, 0])
    ### Datasource naming:    
    ser_cont_source.index.names = ser_cont_source.index.names[: -2] + ['Date', 'Country']
    ser_cont_source.name = 'Continuum'
    ### Non-table rows killing (only two-letter country codes are kept):
    ser_cont_source = ser_cont_source.loc[ser_cont_source.index.get_level_values('Country').str.len() == 2]
    ### Pillar numbers killing ('Indicator Name'): text before ' (' is kept; new labels are position-aligned with the old ones;
    ### index is reassigned since the inplace option of set_levels is absent in pandas 2.0+
    ### (NOTE: level number 4 is presumed to be 'Indicator Name' - to be verified against the source file):
    idx_indicator_names = ser_cont_source.index.levels[4].str.partition(' (').get_level_values(0)
    ser_cont_source.index = ser_cont_source.index.set_levels(idx_indicator_names, level = 'Indicator Name')
    ### Replacing spaces for further hdf format using:
    ser_cont_source.index.names = [iter_name.replace(' - ', '_').replace(' ', '_') for iter_name in ser_cont_source.index.names]    
    ### Research mode convertion:
    if bool_convert:           
        ### Drop extra indices:
        ser_cont_source = ser_cont_source.reset_index(['Country_Name', 'Country_RegionId', 'Country_ISO_Country_code', 'Indicator', 'Indicator_ParentName', 
                                                       'Indicator_Type', 'Indicator_Description', 'Indicator_Unit', 'Indicator_Source', 
                                                       'Indicator_Referenced_Data', 'Indicator_Date_Created', 'Scale', 'Units'], drop = True)
        ### Key index renaming:
        ser_cont_source.index.names = ['Indicator'] + ser_cont_source.index.names[1 : ]
        ### Converting quarter names to date:
        ser_cont_source.index = ser_cont_source.index.set_levels(pd.to_datetime(ser_cont_source.index.levels[1]), level = 'Date')
        ### Dates resampling to business-quarter-ends:
        ser_cont_source = ser_cont_source.groupby(['Indicator', 'Country']).apply(lambda ser_ind_country: ser_ind_country.droplevel(['Indicator', 'Country'])\
                                                                           .resample('QS').last().resample('BQ').last()).swaplevel('Country', 'Date')
        ### Forward filling:
        ser_cont_source = ser_cont_source.groupby(['Country', 'Indicator']).ffill()
    ### Results filtering:
    ser_cont_res = ser_cont_source.loc[All, All, list_countries].sort_index()        
    ### Results output:
    return ser_cont_res      

In [9]:
### PRS & CONTINUUM DATA LOADING

### Universe membership vector and sorted list of its countries:
ser_ison = ison_membership_converting(str_path_universe = str_path_universe)
list_ison_countries = sorted(ser_ison.index.levels[1].unique())

### PRS data vectors (research mode, universe countries only):
ser_prs_pillars_only_converted = get_prs(str_path_prs = str_path_prs_pillars_only, bool_convert = True, 
                                         list_countries = list_ison_countries)
ser_prs_political_risk_pillar_converted = get_prs(str_path_prs = str_path_prs_political_risk_pillar, bool_convert = True, 
                                                  list_countries = list_ison_countries)
ser_prs_full_converted = get_prs(str_path_prs = str_path_prs_full, bool_convert = True, 
                                 list_countries = list_ison_countries)
### Continuum data vectors (research mode, universe countries only):
ser_continuum_composite_converted = get_continuum(str_path_cont = str_path_continuum_composite, bool_convert = True, 
                                                  list_countries = list_ison_countries)
ser_continuum_gp_pillar_converted = get_continuum(str_path_cont = str_path_continuum_gp_pillar, bool_convert = True, 
                                                  list_countries = list_ison_countries)
ser_continuum_si_pillar_converted = get_continuum(str_path_cont = str_path_continuum_si_pillar, bool_convert = True, 
                                                  list_countries = list_ison_countries)
ser_continuum_politics_converted = get_continuum(str_path_cont = str_path_continuum_politics, bool_convert = True, 
                                                 list_countries = list_ison_countries)

In [11]:
### RESULTS SAVING

### Removing old hdf files before imported data saving (both files are deleted before any vector is written):
for iter_path_hdf in (str_path_prs_hdf, str_path_continuum_hdf):
    if os.path.exists(iter_path_hdf):
        os.remove(iter_path_hdf)

### Data vectors saving to hdf format (vector, file path and key inside the file; files are appended key by key):
list_vectors_to_save = [
    (ser_prs_pillars_only_converted, str_path_prs_hdf, str_key_prs_pillars_only_converted),
    (ser_prs_political_risk_pillar_converted, str_path_prs_hdf, str_key_prs_political_risk_pillar_converted),
    (ser_prs_full_converted, str_path_prs_hdf, str_key_prs_full_converted),
    (ser_continuum_composite_converted, str_path_continuum_hdf, str_key_continuum_composite_converted),
    (ser_continuum_gp_pillar_converted, str_path_continuum_hdf, str_key_continuum_gp_pillar_converted),
    (ser_continuum_si_pillar_converted, str_path_continuum_hdf, str_key_continuum_si_pillar_converted),
    (ser_continuum_politics_converted, str_path_continuum_hdf, str_key_continuum_politics_converted),
]
for iter_vector, iter_path_hdf, iter_key in list_vectors_to_save:
    iter_vector.to_hdf(iter_path_hdf, key = iter_key, mode = 'a')