In [1]:
### EER FACTORS CREATING

In [2]:
### INITIALIZATION

import pandas as pd
import numpy as np
from datetime import date, datetime
import statsmodels.api as sm
from scipy import stats as ss
import math     
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
    
%load_ext line_profiler

In [3]:
### GENERAL DATA PREPARATION
### Notebook-wide configuration: source/result paths, HDF keys and numeric parameters used by the cells below.

### Constants:
All = slice(None) # "Select everything" slicer for MultiIndex .loc lookups
### Universe path:
str_path_universe = 'Data_Files/Source_Files/acadian_universe.xlsx'
### Bloomberg structured data extraction parameters:
str_path_bb_hdf = 'Data_Files/Source_Files/Bloomberg_prepared.h5'
str_key_ret_daily = 'bb_ret_daily'
str_key_ret_monthly = 'bb_ret_monthly'
str_key_mmr = 'bb_mmr'
str_key_fx_country = 'bb_fx_country'
str_key_fx_demeaned = 'bb_fx_demeaned'
str_key_fx_currency = 'bb_fx_currency'
str_key_mcap = 'bb_mcap'
str_key_reer = 'bb_reer'
str_key_neer = 'bb_neer'
str_key_reer_sourced = 'bb_reer_sourced'
str_key_neer_sourced = 'bb_neer_sourced'
str_key_xcra = 'bb_xcra'
### NEER usage scheme:
bool_neer_raw = False
### Standardization parameters:
flo_elem_similarity = 5 * (10 ** (-8)) ### Boundary for excluding values close to one from the THA mean
flo_tha_ratio = 0.9 ### THA progression ratio
int_tha_length = 24 ### THA horizon length
list_truncate = [2.5, 2.0] # Standardization (truncation) boundaries, one per standardization step
bool_within_market = True # Standardization way (within each market or over the full universe)
### Factors parameters:
str_measure_date_start = '1996-08-01' # Start date for efficacy measures
str_measure_date_end = '2020-08-31' # End date for efficacy measures
idx_measure_date_range = pd.date_range(str_measure_date_start, str_measure_date_end, freq = 'BM')
str_source_date_start = '1992-01-01' # Start date for source vectors
idx_source_date_range = pd.date_range(str_source_date_start, str_measure_date_end, freq = 'B')
list_ison = ['DM', 'EM', 'FM']
list_filter = ['DM', 'EM', 'FM']
list_countries_to_exclude = ['VE'] # Countries excluded from the calculations
flo_returns_similarity = 0.0025 # Selecting countries with currencies bound to the USD
flo_returns_completeness = 1 / 3
int_concept_lag = 3 ### Lag for GDP-like concepts, months
int_concept_divider = 1000 # Divider to equalize concepts and GDP scales
int_concept_min = 0.0 # Minimal value to compare with log(1 + EXPORT/GDP)
int_concept_max = 0.3 # Maximal value to compare with log(1 + EXPORT/GDP)
int_eer_fill_limit = 260 * 50 # Forward fill limit for NEER and REER inside country vectors, days ### For the product version the value should be 66 days
int_regress_win = 60 # Regression window length for alternative sensitivity concept
int_regress_hl = 3 # Half-life period for alternative sensitivity concept, months
int_factor_addendum = 2.5 # Factor scaler (same value as list_truncate[0])
dict_numer_ma_win = {} # Moving average window length for factors numerators, days
dict_numer_ma_win['LONG_TERM'] = round(260 / 12 * 3)
dict_numer_ma_win['SHORT_TERM'] = round(260 / 12 / 2)
dict_denom_ma_win = {} # Moving average window length for factors denominators, days
dict_denom_ma_win['LONG_TERM'] = round(260 * 5)
dict_denom_ma_win['SHORT_TERM'] = round(260 / 12 * 6)
dict_numer_ma_min = {} # Moving average minimal notna count for factors numerators, days
dict_numer_ma_min['LONG_TERM'] = round(1)
dict_numer_ma_min['SHORT_TERM'] = round(1)
dict_denom_ma_min = {} # Moving average minimal notna count for factors denominators, days (half of the window)
dict_denom_ma_min['LONG_TERM'] = dict_denom_ma_win['LONG_TERM'] // 2
dict_denom_ma_min['SHORT_TERM'] = dict_denom_ma_win['SHORT_TERM'] // 2
int_mom_length = 5 # Length of momentum vector, years
dict_mom_min = {} # Minimal number of values for momentum factor calculation, days:
dict_mom_min['LONG_TERM'] = int(260 * 2.5)
dict_mom_min['SHORT_TERM'] = 260 // 4
dict_mom_hl = {} # Half-life period for momentum factor, months:
dict_mom_hl['LONG_TERM'] = 24
dict_mom_hl['SHORT_TERM'] = 3
### Factors options (one tuple of option labels per factor name):
dict_combinations = {}
dict_combinations['LONG_TERM_EER'] = ('LONG_TERM', 'MOMENTUM', 'EXP_GDP_rate', 'REER', 'HEDGED')
dict_combinations['SHORT_TERM_MIXED'] = ('SHORT_TERM', 'MOMENTUM', 'EXP_GDP_rate', 'NEER', 'HEDGED')
dict_combinations['LONG_TERM_EXPORT'] = ('LONG_TERM', 'MOMENTUM', 'EXP_GDP_rate', 'EXPORT', 'HEDGED')
dict_combinations['COMBO_DOUBLE'] = ('DOUBLE', 'COMBO', 'COMBO', 'COMBO', 'COMBO')
dict_combinations['COMBO_TRIPLE'] = ('TRIPLE', 'COMBO', 'COMBO', 'COMBO', 'COMBO')
### Factor averaging weights:
dict_factors_weights = {}
dict_factors_weights['LONG_TERM_EER'] = 1.0
dict_factors_weights['SHORT_TERM_MIXED'] = 1.0
dict_factors_weights['LONG_TERM_EXPORT'] = 0.75 # Alternative value: 1.0
### Factors signs:
dict_factors_signs = {}
dict_factors_signs['LONG_TERM_EER'] = -1.0
dict_factors_signs['SHORT_TERM_MIXED'] = -1.0
dict_factors_signs['LONG_TERM_EXPORT'] = 1.0
### FX Factor parameters:
int_short_diff = 21 * 3
list_extreme_boundaries = [-0.5, 2.0]
### Work periods (number of 'Frequency' units inside one 'Period'; every entry starts as 1 and is overridden below):
ser_work_periods = pd.Series(1 , index = pd.MultiIndex.from_product([['Year', 'Month'], ['Y', 'M', 'D']], names = ['Period', 'Frequency']))
ser_work_periods['Year', 'M'] = 12
ser_work_periods['Year', 'D'] = 260
ser_work_periods['Month', 'Y'] = 0
ser_work_periods['Month', 'D'] = 22
flo_exp_weight_month = ser_work_periods['Year', 'D'] / ser_work_periods['Year', 'M'] # Work days per month (260 / 12)
### Transitional results parameters:
str_path_trans_hdf = 'Data_Files/Test_Files/EER_factors_transitional.h5'
str_key_trans_ret = 'trans_ret'
str_key_trans_mcap = 'trans_mcap'
str_key_trans_factor = 'trans_factor'
### Measures parameters:
list_measure = ['fmb_weighted', 'qtl4'] # Efficacy measures list
list_back_period = [99, 10, 5] # Look back periods
int_horizon = 12 # Measure stats horizon
### Results paths and keys:
str_path_efficacy_hdf = 'Data_Files/Test_Files/EER_factors_stats.h5'
str_path_vectors_hdf = 'Data_Files/Test_Files/EER_factors_vectors.h5'
str_key_efficacy = 'fmb_weight'
str_path_efficacy_xlsx = 'Data_Files/Test_Files/EER_factors_stats.xlsx'
str_path_vectors_xlsx = 'Data_Files/Test_Files/EER_factors_vectors.xlsx'
str_path_factors_xlsx = 'Data_Files/Test_Files/EER_factors_source.xlsx'

In [4]:
### DEFINING DATE/COUNTRY DATA VECTOR DESCRIBER FUNCTION

def _draw_presence_heatmap(ser_market_data, list_idx_dates, str_title):
    """
    Draw one striped data-presence heatmap (countries as rows, dates as columns) on a new figure.

    Parameters:
        ser_market_data: Series with 'Date', 'Country' and 'Market' index levels.
        list_idx_dates: dates to show; dates absent from the data are added as empty cells.
        str_title: heatmap title.
    Returns:
        The axes object returned by seaborn.heatmap.
    """
    ### Dates reindexation (adding NaN values for absent observations):
    ser_region_data = ser_market_data.droplevel('Market').unstack('Country').reindex(list_idx_dates).stack('Country', dropna = False)
    ### Countries list drives both the figure height and the row striping:
    list_countries = list(ser_region_data.index.get_level_values('Country').unique())
    ### Figure height is at least 1, so a group with fewer than five countries does not get a zero-height figure:
    int_fig_height = max(len(list_countries) // 5, 1)
    ### Adding shade column for heatmap striping (values 2 and 3 alternate from country to country):
    dict_countries = {iter_country: iter_num % 2 + 2 for (iter_num, iter_country) in enumerate(list_countries)}
    df_region_shades = ser_region_data.to_frame().assign(Shade = list(map(dict_countries.get, ser_region_data.index.get_level_values('Country'))))
    df_region_shades.columns = ['Data', 'Shade']
    ### Heatmap drawing (Data / Data is 1.0 where a non-zero finite observation exists and NaN otherwise, so only such cells get their shade):
    fig_heatmap = plt.figure(figsize = (15, int_fig_height))
    df_region_data = (df_region_shades['Data'] / df_region_shades['Data'] * df_region_shades['Shade']).unstack('Date').sort_index()
    df_region_data.columns = df_region_data.columns.strftime('%d-%m-%Y')
    ax_heatmap = sns.heatmap(df_region_data, cbar = False, annot = False, cmap = 'binary', xticklabels = 'auto', yticklabels = True,
                             vmin = 0.0, vmax = 6.0)
    ax_heatmap.set_title(str_title)
    return ax_heatmap

def date_country_vector_describer(ser_data, ser_ison):
    """
    Print completeness statistics of a date/country data vector and plot its data-presence heatmaps.

    Parameters:
        ser_data: Series whose second index level holds country codes; a 'Market' level is added
                  from ser_ison when it is absent.
        ser_ison: universe Series whose second index level holds country codes; it is joined to
                  ser_data to supply the 'Market' column.
    Returns:
        None. One heatmap is drawn for the DM/EM/FM universe and one for each market found in the data.
    """
    ### Markets that form the ISON universe:
    list_markets = ['DM', 'EM', 'FM']
    ### ISON countries set:
    set_ison_countries = set(ser_ison.index.get_level_values(1).unique())
    ### Vector countries set (countries having at least one non-missing value):
    set_vector_countries = set(ser_data.dropna().index.get_level_values(1).unique())
    set_covered_countries = set_vector_countries.intersection(set_ison_countries)
    ### Vector completeness:
    print('Data vector name: {}'.format(ser_data.name))
    print('Data vector completeness: {:.2%}'.format(ser_data.count() / len(ser_data.index)))
    print('ISON countries completeness: {:.2%} ({} / {})'.format(len(set_covered_countries) / len(set_ison_countries),
                                                                 len(set_covered_countries),
                                                                 len(set_ison_countries)))
    print('Absent ISON countries: [{}]'.format(str(', '.join(sorted(list(set_ison_countries - set_vector_countries))))))
    ### ISON Universe binding (if needed):
    if 'Market' not in ser_data.index.names:
        ser_data = ser_data.to_frame().join(ser_ison, how = 'left').set_index('Market', append = True).squeeze()\
                                      .loc[All, All, list_markets].sort_index(level = ['Date', 'Country'])
    ### Dates for heatmap x-axis labels:
    list_idx_dates = ser_data.index.get_level_values('Date').unique()
    ### Full universe heatmap plotting:
    _draw_presence_heatmap(ser_data.loc[All, All, list_markets], list_idx_dates, 'ISON Universe')
    ### By market heatmaps plotting:
    for str_region_code, ser_region_data in ser_data.groupby('Market'):
        _draw_presence_heatmap(ser_region_data, list_idx_dates, str_region_code)
    ### Plots showing:
    plt.show()

In [5]:
### DEFINING MEAN MOMENTUM FUNCTION (TO CALCULATE FACTOR ONLY FOR MONTHENDS):

def rolling_cond_mean_momentum(ser_country_matrix, ser_full_source, int_numer_win, int_numer_min, int_denom_win, int_denom_min):
    """
    Fill a single-country matrix with log(short-window mean / long-window mean) of the source vector.

    Parameters:
        ser_country_matrix: Series to fill; its first index level holds the dates to calculate for and
                            the second element of its first index entry is taken as the country code.
        ser_full_source: source Series whose second index level holds country codes.
        int_numer_win, int_denom_win: lengths (in source rows) of the numerator and denominator windows,
                                      both ending at the calculated date.
        int_numer_min, int_denom_min: minimal non-missing counts required inside each window.
    Returns:
        ser_country_matrix, modified in place for every date that is found in the country source
        and has enough data.
    """
    ### Country saving:
    str_country = ser_country_matrix.index[0][1]
    ### Nothing to calculate when the country is absent from the source vector:
    if str_country not in ser_full_source.index.get_level_values(1):
        return ser_country_matrix
    ### Filtering country vector from source:
    ser_country_source = ser_full_source.loc[All, str_country]
    ### Looping over matrix index dates:
    for iter_bm_date in ser_country_matrix.index.get_level_values(0):
        try:
            ### Position right after the monthend date inside the source country vector:
            int_window_end = ser_country_source.index.get_loc(iter_bm_date) + 1
            ### Window starts, clipped at the beginning of the vector:
            int_numer_start = max(int_window_end - int_numer_win, 0)
            int_denom_start = max(int_window_end - int_denom_win, 0)
            ### Sign-flipped windows for numerator and denominator means calculation:
            ser_window_numer = -ser_country_source.iloc[int_numer_start : int_window_end]
            ser_window_denom = -ser_country_source.iloc[int_denom_start : int_window_end]
            ### Skipping the date when any window has too few observations:
            if (ser_window_numer.count() < int_numer_min) or (ser_window_denom.count() < int_denom_min):
                continue
            ### Mean momentum value calculation:
            flo_mean_ratio = ser_window_numer.mean() / ser_window_denom.mean()
            ser_country_matrix.loc[iter_bm_date, str_country] = np.log(flo_mean_ratio)
        except KeyError:
            ### Dates that are absent from the country source are left untouched:
            pass
    ### Resulting vector output:
    return ser_country_matrix

In [6]:
### DEFINING EXPONENTIAL WEIGHT

def exp_weight_single(halflife_len = 3, num_element = 0):
    """
    Exponential decay weight of the element that is num_element periods away.

    The weight halves every round(halflife_len) periods, so element 0 gets weight 1.0.
    """
    ### One-period decay factor derived from the rounded half-life:
    int_halflife = round(halflife_len)
    num_decay_rate = math.exp(math.log(0.5) / int_halflife)
    ### Compounding the one-period factor over the element distance:
    num_log_decay = math.log(num_decay_rate)
    return np.exp(num_log_decay * num_element)

In [7]:
### DEFINING GEOMETRICAL WEIGHT

def geom_weight_single(flo_ratio, flo_factor = 1, num_element = 0):
    """
    Element number num_element of a geometric progression: flo_factor * flo_ratio ** num_element.
    """
    ### Ratio raised to the element number:
    flo_ratio_power = flo_ratio ** num_element
    ### Results output:
    return flo_factor * flo_ratio_power

In [8]:
### DEFINING WEIGHTED AVERAGE

def weighted_average(ser_data, ser_weight = False, int_min_count = 0):
    """
    Average of the non-missing values of a Series, optionally weighted.

    Parameters:
        ser_data: Series of values; missing values are ignored.
        ser_weight: Series of weights that is looked up by the index labels of the non-missing
                    values; a bool (default False) or None means a simple average.
        int_min_count: the result is calculated only when the number of non-missing values
                       is strictly greater than this number.
    Returns:
        The average, or NaN when there are too few values or the weights sum to zero.
    """
    ### np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0:
    num_result = np.nan
    ### Checking for data presence (strictly more than the minimal count):
    if (ser_data.count() > int_min_count):
        ### Checking for weights dataset:
        if (ser_weight is None) or isinstance(ser_weight, bool):
            ### Calculating of simple average:
            num_result = np.nanmean(ser_data.values)
        else:
            ### Dropping missing observations once for both data and weights filtering:
            ser_valid = ser_data.dropna()
            ### Weights filtering:
            list_weight = ser_weight[ser_valid.index].values
            ### Checking for weights presence (zero or all-missing weights leave the NaN result):
            if np.nansum(list_weight):
                ### Weighted average calculating (pairs with a missing weight are left out of both sums):
                num_result = np.nansum(ser_valid.values * list_weight) / np.nansum(list_weight)
    ### Results output:
    return num_result

In [9]:
### DEFINING ROLLING CONDITIONAL WEIGHTED MEAN FUNCTION (TO CALCULATE FACTOR ONLY FOR MONTHENDS)

def rolling_cond_weighted_mean(ser_country_matrix, ser_full_source, int_mean_win, int_mean_min, list_weight = False, ser_full_cond = False):
    """
    Fill a single-country matrix with rolling (optionally weighted) means of the source vector.

    Parameters:
        ser_country_matrix: Series to fill; its first index level holds the dates to calculate for and
                            the second element of its first index entry is taken as the country code.
        ser_full_source: source Series whose second index level holds country codes.
        int_mean_win: window length in source rows; the window ends at the calculated date.
        int_mean_min: minimal count passed to weighted_average (strictly more values are required).
        list_weight: sequence of weights for the window elements; a bool means a simple mean.
        ser_full_cond: optional condition Series (same layout as the source); when given, the weights
                       are assigned to the window dates in the order of their condition values.
    Returns:
        ser_country_matrix, modified in place for every date that is found in the country source.
    """
    ### Defining conditional average calculator:
    def conditional_average(ser_source, list_weight, int_min_count = 0, ser_condition = False):
        ### Weight setting (weights are taken from the head of the list to match the window length):
        ser_weight = pd.Series(list_weight[ : len(ser_source.index)], ser_source.index)
        ### If we have condition we should resort the weight array:
        if not isinstance(ser_condition, bool):
            ### Window labels ordered by condition value, laid over the original label order:
            ser_condition_sorted = pd.Series(ser_condition.sort_values().index, ser_condition.index)
            ser_condition_sorted.name = 'Condition'
            ### The k-th weight is re-labelled with the date holding the k-th smallest condition value:
            ser_weight = pd.concat([ser_weight, ser_condition_sorted], axis = 1).reset_index(drop = True).set_index('Condition').squeeze().sort_index()
        ### Results output:
        return weighted_average(ser_source, ser_weight, int_min_count)
    ### Country saving:
    str_country = ser_country_matrix.index[0][1]
    ### Checking for country presence in source vector:
    if (str_country in ser_full_source.index.get_level_values(1)):
        ### Filtering country vector from source:
        ser_country_source = ser_full_source.loc[All, str_country]
        if not isinstance(ser_full_cond, bool):
            ser_country_cond = ser_full_cond.loc[All, str_country]
        ### Looping over matrix index dates:
        for iter_bm_date in ser_country_matrix.index.get_level_values(0):
            try:
                ### Defining monthend date number in source country vector:
                int_idx_num = ser_country_source.index.get_loc(iter_bm_date)
                ### Creating window vector for mean calculation; it is copied because its first element is
                ### overwritten below and a plain iloc slice may write through to the source vector:
                ser_rolled_source = ser_country_source.iloc[max((int_idx_num - int_mean_win + 1), 0) : int_idx_num + 1].copy()
                if not isinstance(ser_full_cond, bool):
                    ser_rolled_cond = ser_country_cond.loc[ser_rolled_source.index]
                else:
                    ser_rolled_cond = False
                ### Action for MatLab compatibility (the first window element is excluded from the mean):
                ser_rolled_source.iloc[0] = np.nan
                ### Simple mean calculation:
                if isinstance(list_weight, bool):
                    ser_country_matrix.loc[iter_bm_date, str_country] = weighted_average(ser_rolled_source, False, int_mean_min)
                else:
                    ### Weighted mean calculation:
                    ser_country_matrix.loc[iter_bm_date, str_country] = conditional_average(ser_rolled_source, list_weight, int_mean_min, ser_rolled_cond)
            except KeyError:
                ### Dates that are absent from the country vectors are left untouched:
                pass
    ### Resulting vector output:
    return ser_country_matrix

In [10]:
### DEFINING MULTI-STEP STANDARTIZATION FOR SEPARATE SERIES

def multistep_standartize(ser_data_source, arr_truncate, ser_weight = False, reuse_outliers = False, center_result = True, full_result = False):
    """
    Multi-step standardization (z-scoring with truncation) of a single Series.

    On every step the data is centered by its weighted mean, divided by its (unweighted) standard
    deviation and truncated at the step boundary.

    Parameters:
        ser_data_source: Series to standardize; missing values are dropped.
        arr_truncate: sequence of truncation boundaries, one per step.
        ser_weight: Series of weights for the mean calculation; a bool means equal weights.
        reuse_outliers: if True, truncated values stay in the sample for the next steps;
                        otherwise they are fixed at the boundary and excluded from the next steps.
        center_result: if True, the weighted mean of the final vector is subtracted from it.
        full_result: if True, the per-step means and standard deviations are returned as well.
    Returns:
        The standardized Series named '<source name>_standartized', or the tuple
        (Series, list of step means, list of step standard deviations) when full_result is True.
    """
    ### Arrays of iterations properties:
    arr_mean = []
    arr_std = []
    ### Adding equal weights, when weights are absent:
    if isinstance(ser_weight, bool):
        ser_weight = pd.Series(1, index = ser_data_source.index)
        ser_weight.name = 'Weight'
    ### Workhorse and resulting data vectors initialising (np.nan instead of the np.NaN alias removed in NumPy 2.0):
    ser_data_iter = ser_data_source.dropna()
    ser_weight_iter = ser_weight.copy()
    ser_data_full = pd.Series(np.nan, index = ser_data_iter.index)
    ### Looping by boundaries array:
    for num_bound_iter in arr_truncate:
        ### Properties calculating and saving:
        num_mean_iter = weighted_average(ser_data_iter, ser_weight_iter)
        num_std_iter = ser_data_iter.std()
        arr_mean.append(num_mean_iter)
        arr_std.append(num_std_iter)
        ser_data_iter = (ser_data_iter - num_mean_iter) / num_std_iter
        ### Standardizing:
        if reuse_outliers:
            ser_data_iter[ser_data_iter.abs() >= num_bound_iter] = np.sign(ser_data_iter) * num_bound_iter
        else:
            ### Saving to result and excluding from further calculations truncated values.
            ### NOTE(review): the condition is shorter than ser_data_full after the first step; the in-place form of
            ### where() is kept on purpose, as the copying form may treat labels absent from the condition differently - verify before refactoring:
            ser_data_full.where(ser_data_iter.abs() < num_bound_iter, np.sign(ser_data_iter) * num_bound_iter, inplace = True)
            ser_data_iter = ser_data_iter[ser_data_iter.abs() < num_bound_iter]
    ### Aggregating result:
    if (reuse_outliers):
        ser_data_full = ser_data_iter
    else:
        ### Values that survived all the steps are added to the already saved truncated ones:
        ser_data_full[ser_data_iter.index] = ser_data_iter
    ### Centering result:
    if (center_result):
        ser_result = ser_data_full - weighted_average(ser_data_full, ser_weight)
    else:
        ser_result = ser_data_full
    ### Result output:
    ser_result.name = str(ser_data_source.name) + '_standartized'
    if (full_result):
        return (ser_result, arr_mean, arr_std)
    else:
        return ser_result

In [11]:
### DEFINING MULTI-STEP STANDARTIZATION BY MARKET FOR CROSS-SECTION

def ison_standartize(ser_to_manage, arr_truncate, ser_weight = False, reuse_outliers = False, center_result = True, full_result = False, within_market = False):
    """
    Multi-step standardization of a cross-section, either over the full universe or market by market.

    All parameters except within_market are passed to multistep_standartize unchanged;
    within_market = True applies it separately to every group of the 'Market' index level.
    """
    ### Common arguments of the multi-step standardization:
    tup_step_args = (arr_truncate, ser_weight, reuse_outliers, center_result, full_result)
    ### Full universe standardizing:
    if not within_market:
        return multistep_standartize(ser_to_manage, *tup_step_args)
    ### Within market standardizing:
    grouped_by_market = ser_to_manage.groupby(by = 'Market', group_keys = False)
    return grouped_by_market.apply(multistep_standartize, *tup_step_args)

In [12]:
### DEFINING MULTI-STEP THA STANDARTIZATION BY MARKET FOR CROSS-SECTION

def tha_standartize(ser_to_manage, arr_truncate, ser_weight = False, reuse_outliers = False, center_result = True, full_result = False):
    """
    Two-pass standardization of a cross-section for the time-horizon adjusted (THA) scheme.

    The vector is standardized, mapped back with the saved step means and standard deviations,
    de-meaned within every 'Market' group and standardized once more.

    Parameters:
        ser_to_manage: Series with a 'Market' index level.
        arr_truncate, ser_weight, reuse_outliers, center_result: passed to multistep_standartize.
        full_result: not used by this function (the first pass always asks for the full result
                     and the second one never does); kept for signature compatibility.
    Returns:
        The Series returned by the second standardization pass.
    """
    ### First pass with step properties saving:
    (ser_reversed, list_mean, list_std) = multistep_standartize(ser_to_manage, arr_truncate, ser_weight, reuse_outliers, center_result, full_result = True)
    ### Reverting the steps scaling in the backward order:
    for iter_num in reversed(range(len(arr_truncate))):
        ser_reversed = (ser_reversed * list_std[iter_num] + list_mean[iter_num])
    ### De-meaning within markets; group_keys = False keeps the original index (pandas 2.x would otherwise prepend
    ### an extra 'Market' level to this index-preserving apply), as all the other by-group applies in this file do:
    ser_demeaned = ser_reversed.groupby('Market', group_keys = False).apply(lambda ser_region: ser_region - ser_region.mean())
    ### Second pass:
    ser_stand_z = multistep_standartize(ser_demeaned, arr_truncate, ser_weight, reuse_outliers, center_result, full_result = False)
    ### Results output:
    return ser_stand_z

In [13]:
### DEFINING UNIVERSAL AUTOCORRELATION FOR DATE-COUNTRY-UNIVERSE SERIES

def vector_autocorr(ser_source, int_shift):
    """
    Cross-sectional autocorrelation by date for a Series with 'Date', 'Country' and 'Market' index levels.

    For every date the values are correlated across countries with the values that stand
    int_shift rows later in the date range; the resulting by-date Series is then shifted
    forward by int_shift rows.

    Parameters:
        ser_source: Series indexed by 'Date' / 'Country' / 'Market' (the 'Market' level is dropped inside).
        int_shift: number of date rows between the two correlated cross-sections.
    Returns:
        Series of correlations produced by the by-'Date' grouping.
    """
    ### Defining adding full universe for each date:
    def universe_reindex(iter_group, idx_universe):
        df_iter_result = iter_group.unstack('Date').reindex(idx_universe).sort_index().stack('Date', dropna = False)
        ### Results output:
        return df_iter_result
    ### Defining adding full date range for each country and date index shifting:
    ### NOTE(review): num_shift is not used in the body - the shift is taken from the enclosing int_shift (the only call below passes the same value)
    def date_reindex(iter_group, idx_date_range, num_shift = 0):
        ser_iter_result = iter_group.unstack('Country').reindex(idx_date_range).sort_index().shift(-int_shift).stack('Country', dropna = False).squeeze()
        ### Results output:
        return ser_iter_result
    ### Defining by date correlation function:
    def corr_by_date(iter_group):
        num_iter_corr = iter_group['Corr_factor_minus'].corr(iter_group['Corr_factor_plus'])
        ### Results output:
        return num_iter_corr
    ### Preparing expanded universe for autocorrelation performing (every date gets every country, absent ones as NaN):
    idx_date_range = ser_source.index.get_level_values(0).unique()
    idx_universe = ser_source.index.get_level_values(1).unique()
    ser_source_full = ser_source.to_frame().reset_index('Market', drop = True).groupby('Date', group_keys = False).apply(universe_reindex, idx_universe)\
                                .swaplevel().squeeze()
    ### Autocorrelation preparing ("plus" drops the first row of every country, "minus" drops the last one):
    ser_source_plus = ser_source_full.groupby('Country', group_keys = False).apply(lambda iter_group: iter_group.iloc[1 : ])\
                                     .sort_index(level = ['Date', 'Country'])
    ser_source_minus = ser_source_full.groupby('Country', group_keys = False).apply(lambda iter_group: iter_group.iloc[: -1])\
                                      .sort_index(level = ['Date', 'Country'])
    ### Artificial series combining for indexes synchronization (the "plus" values are moved int_shift rows back):
    ser_source_plus_shifted = ser_source_plus.groupby('Country', group_keys = False).apply(date_reindex, idx_date_range, int_shift)
    df_to_corr = pd.concat([ser_source_minus, ser_source_plus_shifted], axis = 1)
    df_to_corr.columns = ['Corr_factor_minus', 'Corr_factor_plus']
    ### By date correlation, moved int_shift rows forward:
    ser_autocorr_vector = df_to_corr.groupby('Date').apply(corr_by_date).shift(int_shift)
    ### Results output:
    return ser_autocorr_vector

In [14]:
# DEFINING MULTI-STEP STANDARTIZATION BY MARKET FOR FULL FACTOR STACK

def single_factor_standartize(ser_factor, arr_truncate, ser_weight = False, reuse_outliers = False, center_result = True, within_market = False, 
                              flag_tha = False, flo_similarity = 5 * (10 ** (-8))):
    """
    Date-by-date multi-step standardization of a single factor, optionally time-horizon adjusted (THA).

    Parameters:
        ser_factor: factor Series; it is grouped by its 'Date' index level.
        arr_truncate: truncation boundaries passed to the standardization functions.
        ser_weight: weights Series joined to the factor; a bool means equal weights.
        reuse_outliers, center_result: passed to the standardization functions.
        within_market: passed to ison_standartize; it is not used in the THA branch.
        flag_tha: falsy value for simple standardization; otherwise it is used as a key of the local
                  power table, so it has to be one of 'monthly', 'quarterly', 'annual'.
        flo_similarity: autocorrelations that differ from 1 by no more than this value
                        are excluded from the cumulative mean.
    Returns:
        Simple mode: the standardized Series named as ser_factor.
        THA mode: tuple (THA-adjusted z-scores, z-scores, autocorrelation vector, THA coefficients,
        standard deviations by date/market together with 'Overall' rows by date).
    The THA branch reads the module-level flo_tha_ratio and int_tha_length.
    """
    ### Local constants (powers applied to the cumulative mean autocorrelation, by flag_tha value):
    dict_tha_pow = {}
    dict_tha_pow['monthly'] = 1
    dict_tha_pow['quarterly'] = 1 / 3
    dict_tha_pow['annual'] = 1 / 12
    ### Weights preparing:
    if isinstance(ser_weight, bool):
        ser_weight = pd.Series(1, index = ser_factor.index)
        ser_weight.name = 'Weight'
    ### Multi-step standardizing:
    df_factor = ser_factor.to_frame().join(ser_weight, how = 'left')
    df_factor.columns = ['Factor', 'Weight']
    ### Time-horizon adjusted standardization:
    if (flag_tha):
        ### Z-scored vector calculating:
        ser_stand_z = df_factor.groupby('Date', group_keys = False)\
                               .apply(lambda iter_df: tha_standartize(iter_df['Factor'], arr_truncate, iter_df['Weight'], reuse_outliers, center_result, False))
        ### Results output:
        ser_stand_z.name = ser_factor.name
        ### Autocorrelation vector calculating (by market, one row shift):
        ser_autocorr_vector = ser_stand_z.groupby('Market').apply(vector_autocorr, 1)
        ser_autocorr_vector.name = 'Autocorr'
        ### Expanding mean of autocorrelations by market (values within flo_similarity of 1, as well as NaN, are left out):
        ser_autocorr_cum_mean = ser_autocorr_vector.loc[np.abs(ser_autocorr_vector - 1) > flo_similarity].groupby('Market', group_keys = False).expanding().mean()
        ### THA-coefficient calculating (negative means are floored at zero before the power is applied):
        ser_tha_coeff = ser_autocorr_cum_mean.transform(lambda iter_mean: max(iter_mean, 0.0) ** dict_tha_pow[flag_tha])
        ### Half of the sum of the first int_tha_length terms of the geometric progression with ratio flo_tha_ratio * mean:
        ser_tha_coeff = ser_tha_coeff.transform(lambda iter_mean: 
                                                sum(map(lambda iter_num: geom_weight_single(flo_tha_ratio * iter_mean, 1, iter_num), range(int_tha_length))) / 2)
        ser_tha_coeff = ser_tha_coeff.swaplevel()
        ### Adding all the z-score dates to the coefficients (absent ones as NaN):
        ser_tha_coeff = ser_tha_coeff.unstack('Market').reindex(ser_stand_z.index.levels[0]).stack('Market', dropna = False).sort_index(level = ['Date', 'Market'])
        ### THA-adjusted z-score calculating:
#        ser_stand_s = (ser_stand_z * ser_tha_coeff)
        ### Artificial filling of missing coefficients with 0.5, e.g. for the first date of region appearance (not to lose observations):
        ser_stand_s = (ser_stand_z * ser_tha_coeff.fillna(0.5))
        ser_stand_s = ser_stand_s[ser_stand_s.index.dropna()].reorder_levels(['Date', 'Country', 'Market']).sort_index()
        ### Standard deviation for THA-adjusted z-score calculating (by date and market, plus 'Overall' by date):
        ser_region_std = ser_stand_s.groupby(['Date', 'Market']).std()
        ser_universe_std = ser_stand_s.groupby(['Date']).std()
        ser_universe_std = pd.concat([ser_universe_std], keys = ['Overall'], names = ['Market']).swaplevel()
        ser_std = pd.concat([ser_region_std, ser_universe_std], axis = 0).sort_index()
        ### Results output:
        return (ser_stand_s, ser_stand_z, ser_autocorr_vector, ser_tha_coeff, ser_std)
    ### Simple standardization:
    else:    
        ser_result = df_factor.groupby('Date', group_keys = False)\
                     .apply(lambda iter_df: ison_standartize(iter_df['Factor'], arr_truncate, iter_df['Weight'], reuse_outliers, center_result, False, within_market))
        ### Results output:
        ser_result.name = ser_factor.name
        return ser_result   

In [15]:
### DEFINING GROUP MULTI-STEP STANDARTIZATION BY MARKET FOR FULL FACTOR STACK FOR MULTIPLE FACTORS

def multi_factor_standartize(df_factor, arr_truncate, ser_weight = False, reuse_outliers = False, center_result = True, within_market = False, 
                             flag_tha = False, flo_similarity = 5 * (10 ** (-8))):
    """
    Apply single_factor_standartize to every column of a factors DataFrame.

    All parameters except df_factor are passed to single_factor_standartize unchanged.
    Returns:
        Simple mode (falsy flag_tha): DataFrame of standardized factors, one column per factor.
        THA mode: tuple of five DataFrames built from the corresponding elements of the
        single-factor results, in the order single_factor_standartize returns them.
    """
    ### Defining single column standardizer with the common arguments bound:
    def standartize_column(str_factor):
        return single_factor_standartize(df_factor[str_factor], arr_truncate, ser_weight, reuse_outliers, center_result, within_market, flag_tha, flo_similarity)
    ### Simple standardization:
    if not flag_tha:
        dict_standartized = {iter_factor: standartize_column(iter_factor) for iter_factor in df_factor.columns}
        ### Concatenating to dataframe:
        return pd.concat(dict_standartized, axis = 1)
    ### Time-horizon adjusted standardization (one collector per element of the single-factor result):
    list_collectors = [{}, {}, {}, {}, {}]
    for iter_factor in df_factor.columns:
        (ser_stand_s, ser_stand_z, ser_autocorr, ser_tha_coeff, ser_std) = standartize_column(iter_factor)
        for (dict_collector, ser_part) in zip(list_collectors, (ser_stand_s, ser_stand_z, ser_autocorr, ser_tha_coeff, ser_std)):
            dict_collector[iter_factor] = ser_part
    ### Concatenating to dataframes and results output:
    return tuple(pd.concat(dict_collector, axis = 1) for dict_collector in list_collectors)

In [16]:
### DEFINING EFFICACY MEASURES FOR SINGLE FACTOR

def single_factor_multiple_efficacy_measures(ser_factor, ser_return, ser_weight, arr_measure, return_shift = 0, arr_truncate = [2.5, 2.0]):
    """
    Calculate per-date time series of efficacy measures for a single factor.

    Factor, (optionally shifted) return and weight are joined into one frame; every requested
    measure is then evaluated cross-sectionally for each 'Date' group.

    Parameters
    ----------
    ser_factor, ser_return, ser_weight : pd.Series
        Series whose index carries 'Country' and 'Market' levels; index level 0 is used as the date
        axis and the joined frame is grouped by 'Date'. The 'Market' level is dropped before joining.
        If ser_weight contains no non-null values, unit weights on the ser_factor index are used.
    arr_measure : iterable of str
        Measure names. Handled names: 'ic_spearman', 'ic_pearson', 'fmb_eqw', 'fmb_weighted',
        'fmb_std_eqw', 'fmb_std_weighted', 'clp' and any name containing 'qtl' followed by the
        number of bins (e.g. 'qtl5').
    return_shift : int
        Within each country the return series is shifted by -return_shift rows, i.e. the row of a
        date receives the return located return_shift rows later.
    arr_truncate : list
        Passed to ison_standartize (defined elsewhere) when 'fmb_std_eqw' or 'fmb_std_weighted'
        is requested; not used otherwise.

    Returns
    -------
    pd.DataFrame
        One column per requested measure, one row per 'Date' group.

    Notes
    -----
    A date group with fewer than two complete (NaN-free) rows yields NaN for every measure.
    NOTE(review): a measure name that matches none of the branches leaves num_result unassigned,
    so `return num_result` raises UnboundLocalError for groups with sufficient data.
    """
    ### Declaring local constants & variables:
    All = slice(None)
    dict_measure = {}
    ### Measures that need the additional standartized factor column:
    set_std_needed = {'fmb_std_eqw', 'fmb_std_weighted'}
    num_precision = 5 # For quintile bins rounding and borders controlling
    ### Defining date index shifting function:
    def date_reindex(iter_group, idx_date_range, return_shift = 0):
        """Reindex one country's series to idx_date_range and shift it by -return_shift rows."""
        ser_iter_result = iter_group.unstack('Country').reindex(idx_date_range).sort_index().shift(-return_shift).stack('Country', dropna = False).squeeze()
        ### Results output:
        return ser_iter_result    
    ### Defining MatLab style prctile function:
    def prctile(ser_to_perc, p):
        """Interpolate the value at fraction p, placing the k-th of n sorted values at (k - 0.5) / n."""
        ### Sorted list preparing:
        list_to_perc = ser_to_perc.dropna().values
        list_sorted = np.sort(list_to_perc)
        ### Length calculating:
        num_len = len(list_to_perc)    
        ### Prctile calculating (p is a fraction here: the caller divides percents by 100 beforehand):
        num_result = np.interp(np.array(p), np.linspace(1 / (2 * num_len), (2 * num_len - 1) / (2 * num_len), num_len), list_sorted)
        ### Results output:
        return num_result
    ### Defining quintile bins distribution:
    def quintile_distribution(ser_iter_group, num_bins = 5, bool_right_included = True, bool_include_lowest = True, bool_populate_last = True):
        """Assign each value to one of num_bins bins (labels 0 .. num_bins - 1) cut at equally spaced percentiles."""
        ### Bins preparing (interior borders at equal percent steps, converted to fractions and then to values):
        list_bin = list(np.arange(0, 100, 100 / num_bins))[1 : ]
        list_bin = [round(iter_element / 100, 2) for iter_element in list_bin]
        list_bin = [prctile(ser_iter_group, iter_element) for iter_element in list_bin]
        ### Outer borders: min - |min| below and max + |max| above:
        list_bin = [ser_iter_group.min() - abs(ser_iter_group.min())] + list_bin + [ser_iter_group.max() + abs(ser_iter_group.max())]
        list_bin = [round(iter_element, num_precision) for iter_element in list_bin]
        ### Optionally lowering the last interior border by one rounding unit:
        if bool_populate_last:
            list_bin[-2] -= 10 ** (-num_precision)
        ### Bins distribution:
        ser_iter_distribution = pd.cut(ser_iter_group, bins = list_bin, labels = range(num_bins), 
                                       right = bool_right_included, include_lowest = bool_include_lowest, precision = num_precision)
        ### Results output:
        return ser_iter_distribution    
    ### Defining get_measure group level function:
    def get_measure(df_to_measure, iter_measure):
        """Evaluate a single measure on one date cross-section; NaN when fewer than two complete rows exist."""
        ### Checking data sufficiency (rows without NaN in any column of the group):
        if (len(df_to_measure.dropna().index) > 1):           
            ### Measure calculating:
            if (iter_measure == 'ic_spearman'):
                ### Spearman information coefficient (rank correlation of factor and return):
                list_factor = df_to_measure[['Factor', 'Return']].dropna()['Factor'].values
                list_return = df_to_measure[['Factor', 'Return']].dropna()['Return'].values
                num_result = ss.spearmanr(list_factor, list_return, nan_policy = 'omit').correlation
            if (iter_measure == 'ic_pearson'):
                ### Pearson information coefficient (linear correlation of factor and return):
                list_factor = df_to_measure[['Factor', 'Return']].dropna()['Factor'].values
                list_return = df_to_measure[['Factor', 'Return']].dropna()['Return'].values                
                num_result = ss.pearsonr(list_factor, list_return)[0]
            if (iter_measure == 'fmb_eqw'):
                ### Fama-MacBeth cross-sectional regression slope on the factor (OLS of Return on [Factor, Constant]):
                list_factor_added = df_to_measure[['Factor', 'Constant', 'Return']].dropna()[['Factor', 'Constant']].values
                list_return = df_to_measure[['Factor', 'Constant', 'Return']].dropna()['Return'].values
                wls_model = sm.OLS(endog = list_return, exog = list_factor_added, missing = 'drop', hasconst = False)
                wls_results = wls_model.fit()
                num_result = wls_results.params[0]
            if (iter_measure == 'fmb_weighted'):
                ### Fama-MacBeth cross-sectional regression slope on the factor (WLS with square root of Weight as weights):
                list_factor_added = df_to_measure[['Factor', 'Constant', 'Return', 'Weight']].dropna()[['Factor', 'Constant']].values
                list_return = df_to_measure[['Factor', 'Constant', 'Return', 'Weight']].dropna()['Return'].values                
                list_weight = df_to_measure[['Factor', 'Constant', 'Return', 'Weight']].dropna()['Weight'].values
                wls_model = sm.WLS(endog = list_return, exog = list_factor_added, weights = pow(list_weight, 1/2), missing = 'drop', hasconst = False)
                wls_results = wls_model.fit()
                num_result = wls_results.params[0]
            if (iter_measure == 'fmb_std_eqw'):             
                ### Fama-MacBeth cross-sectional regression slope on the standartized factor (OLS, no weights):
                list_factor_std_added = df_to_measure[['Factor_std', 'Constant', 'Return']].dropna()[['Factor_std', 'Constant']].values
                list_return = df_to_measure[['Factor_std', 'Constant', 'Return']].dropna()['Return'].values                
                wls_model = sm.OLS(endog = list_return, exog = list_factor_std_added, missing = 'drop', hasconst = False)
                wls_results = wls_model.fit()
                num_result = wls_results.params[0]                 
            if (iter_measure == 'fmb_std_weighted'):
                ### Fama-MacBeth cross-sectional regression slope on the standartized factor (WLS with square root of Weight as weights):
                list_factor_std_added = df_to_measure[['Factor_std', 'Constant', 'Return', 'Weight']].dropna()[['Factor_std', 'Constant']].values
                list_return = df_to_measure[['Factor_std', 'Constant', 'Return', 'Weight']].dropna()['Return'].values                
                list_weight = df_to_measure[['Factor_std', 'Constant', 'Return', 'Weight']].dropna()['Weight'].values                
                wls_model = sm.WLS(endog = list_return, exog = list_factor_std_added, weights = pow(list_weight, 1/2), missing = 'drop', hasconst = False)
                wls_results = wls_model.fit()
                num_result = wls_results.params[0]  
            if (iter_measure == 'clp'):
                ### Constant leverage portfolio: factor * sqrt(weight), negative side rescaled to sum to -1,
                ### positive side rescaled to sum to +1, result is the sum of position * return:
                ser_clp_weighted = df_to_measure[['Factor', 'Return', 'Weight']].dropna()['Factor']
                ser_clp_weighted = ser_clp_weighted * df_to_measure[['Factor', 'Return', 'Weight']].dropna()['Weight'].transform(np.sqrt)
                ser_clp_weighted.loc[ser_clp_weighted < 0] = -ser_clp_weighted / ser_clp_weighted[ser_clp_weighted < 0].sum()
                ser_clp_weighted.loc[ser_clp_weighted > 0] = ser_clp_weighted / ser_clp_weighted[ser_clp_weighted > 0].sum()
                num_result = (ser_clp_weighted * df_to_measure['Return']).sum()
            if ('qtl' in iter_measure):
                ### Inter-quantile spread: mean demeaned return of the top bin minus that of the bottom bin.
                ### Number of bins is parsed from the text following 'qtl' in the measure name:
                num_bins = int(iter_measure.split('qtl')[1])   
                df_to_measure = df_to_measure[['Factor', 'Return', 'Constant']].dropna()
                df_to_measure['Return'] = df_to_measure['Return'] - df_to_measure['Return'].mean()
                df_to_measure['Factor'] = df_to_measure['Factor'].round(num_precision)
                ### Distribution factor values between quintile bins:
                ser_qtl_bins = quintile_distribution(df_to_measure['Factor'], num_bins, bool_right_included = True, bool_include_lowest = True, bool_populate_last = True)
                ser_qtl_bins.name = 'Bin'
                ### Mean return for each bin calculating:
                df_to_measure = df_to_measure.join(ser_qtl_bins)
                df_qtl_rets = df_to_measure.loc[(All), ['Return', 'Bin']]
                df_qtl_rets.set_index('Bin', append = True, inplace = True)
                ser_qtl_rets = df_qtl_rets.unstack('Bin').mean(axis = 0).droplevel(0).squeeze()
                ### Last bin mean minus first bin mean:
                num_result = ser_qtl_rets.iloc[-1] - ser_qtl_rets.iloc[0]                                 
        else:                          
            num_result = np.NaN
        ### Preparing results (NOTE(review): num_result is unassigned here if no branch above matched iter_measure):
        return num_result
    ### Falling back to unit weights when no weight is available at all:
    if (ser_weight.count() == 0):
        ser_weight = pd.Series(1, index = ser_factor.index)
    ### Region filter dropping:
    ser_factor = ser_factor.reset_index('Market', drop = True)
    ser_return = ser_return.reset_index('Market', drop = True)
    ser_weight = ser_weight.reset_index('Market', drop = True)
    ### Preparing shifted returns (country by country, on the full date axis of the returns):
    idx_date_range = ser_return.index.get_level_values(0).unique()
    ser_return_shifted = ser_return.groupby('Country', group_keys = False).apply(date_reindex, idx_date_range, return_shift)
    ### Preparing combined vectors for measures calculating:
    df_to_measure = pd.concat([ser_factor, ser_return_shifted, ser_weight], axis = 1)
    df_to_measure.columns = ['Factor', 'Return', 'Weight']
    df_to_measure['Constant'] = 1
    ### Standartized factor is added only when a measure needing it is requested (computed on complete rows, per date):
    if set_std_needed.intersection(set(arr_measure)):
        ser_factor_std = df_to_measure.dropna()['Factor'].groupby('Date').apply(ison_standartize, arr_truncate = arr_truncate, within_market = False)
        df_to_measure['Factor_std'] = ser_factor_std.reindex(df_to_measure.index)
    ### Looping efficacy measures for calculating measures timeseries:
    for iter_measure in arr_measure:
        dict_measure[iter_measure] = df_to_measure.groupby('Date').apply(get_measure, iter_measure = iter_measure)
    ### Preparing results:
    return pd.concat(dict_measure, axis = 1)

In [17]:
### DEFINING MEASURE STATISTICS CALCULATOR

def measure_stats(df_measures, arr_back_period = [99]):
    """
    Summarize measure time series over trailing windows.

    Parameters
    ----------
    df_measures : pd.DataFrame
        One column per measure, indexed by dates. The trailing window is built on a
        business-month-end grid, so observations dated off that grid do not enter the window.
    arr_back_period : list of int
        Window lengths in years. Each window spans iter_back_period * 12 business month-ends
        ending at the last non-null observation of the measure. The list is only read.

    Returns
    -------
    pd.DataFrame
        Rows: 'count', 'min', 'max', 'mean', 'std', 'median', 'perc_25', 'perc_75', 'iq_range',
        'mean_abs', 't_stat' (float64). Columns: MultiIndex of (measure, back period).

    Notes
    -----
    Window dates absent from the measure history are treated as missing (NaN) rather than
    raising, so a window longer than the available history is allowed.
    A measure without any non-null observation gives count 0 and NaN for the other stats.
    """
    ### Declaring local constants & variables:
    dict_stats = {}
    ### Stats calculating:
    for iter_measure in df_measures.columns:
        ### Cleaning does not depend on the back period, so it is done once per measure:
        ser_measure_clean = df_measures[iter_measure].dropna()
        dict_period = {}
        for iter_back_period in arr_back_period:
            if ser_measure_clean.empty:
                ### No valid observations: there is no end date to anchor the window on
                ser_iter_measure = pd.Series(dtype = 'float64')
            else:
                ### Offset object is equivalent to the 'BM' alias and does not depend on alias naming:
                idx_iter_range = pd.date_range(end = ser_measure_clean.index[-1], periods = iter_back_period * 12, 
                                               freq = pd.offsets.BMonthEnd())
                ### Reindexing tolerates window dates missing from the history (label list selection does not):
                ser_iter_measure = ser_measure_clean.reindex(idx_iter_range)
            num_perc_25 = ser_iter_measure.quantile(0.25, interpolation = 'midpoint')
            num_perc_75 = ser_iter_measure.quantile(0.75, interpolation = 'midpoint')
            ### Explicit float dtype keeps the result numeric instead of object:
            dict_period[iter_back_period] = pd.Series({'count' : ser_iter_measure.count(),
                                                       'min' : ser_iter_measure.min(),
                                                       'max' : ser_iter_measure.max(),
                                                       'mean' : ser_iter_measure.mean(),
                                                       'std' : ser_iter_measure.std(),
                                                       'median' : ser_iter_measure.median(),
                                                       'perc_25' : num_perc_25,
                                                       'perc_75' : num_perc_75,
                                                       'iq_range' : num_perc_75 - num_perc_25,
                                                       'mean_abs' : ser_iter_measure.abs().mean(),
                                                       't_stat' : (ser_iter_measure.mean() / ser_iter_measure.std()) * np.sqrt(ser_iter_measure.count())}, 
                                                      dtype = 'float64')
        dict_stats[iter_measure] = pd.concat(dict_period, axis = 1)
    ### Preparing results:
    return pd.concat(dict_stats, axis = 1)

In [18]:
### DEFINING SPECIAL CLP STATS

def special_clp_stats(ser_factor, ser_return, ser_weight, return_shift = 0):
    """
    Build per-country constant leverage portfolio (CLP) contribution stats and a bin bias table.

    Parameters
    ----------
    ser_factor, ser_return, ser_weight : pd.Series
        Series whose index carries 'Country' and 'Market' levels; index level 0 is used as the date
        axis and the joined frame is grouped by 'Date'. The 'Market' level is dropped before joining.
        If ser_weight contains no non-null values, unit weights on the ser_factor index are used.
    return_shift : int
        Within each country the return series is shifted by -return_shift rows.

    Returns
    -------
    tuple (df_clp_stats, df_clp_bias)
        df_clp_stats : one row per country plus 'Sum' and 'Expected based on active weights =>' rows;
            columns 'Average Bias', 'Weights Sum', 'Average Return', 'Static Contribution',
            'Total Contribution', 'Dynamic Contribution'.
        df_clp_bias : per-country mean of bin membership indicators for 'Q1' .. 'Q5' plus 'Q5 - Q1'.

    Notes
    -----
    NOTE(review): the bias part calls `quartile_distribution`, which is not defined inside this
    function -- presumably a module-level function returning 'Q1' .. 'Q5' labels; verify.
    NOTE(review): the local `prctile` and `quintile_distribution` helpers are not called anywhere in
    this function, and `quintile_distribution` reads `num_precision`, which is not assigned here.
    """
    ### Declaring local constants & variables:    
    dict_clp_stats = {}
    ### Bin columns expected in the bias table:
    list_bin_labels = ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']
    ### Defining date index shifting function:
    def date_reindex(iter_group, idx_date_range, return_shift = 0):
        """Reindex one country's series to idx_date_range and shift it by -return_shift rows."""
        ser_iter_result = iter_group.unstack('Country').reindex(idx_date_range).sort_index().shift(-return_shift).stack('Country', dropna = False).squeeze()
        ### Results output:
        return ser_iter_result    
    ### Defining function for constant leverage portfolio normalizing:
    def get_normalized_clp(df_to_measure):
        """Return factor * sqrt(weight) with the negative side rescaled to sum to -1 and the positive side to +1."""
        ser_clp_weighted = df_to_measure['Factor'] * df_to_measure['Weight'].transform(np.sqrt)
        ### Checking data sufficiency:
        if (ser_clp_weighted.count() > 0):           
            ### Constant leverage portfolio signed normalized:
            ser_clp_weighted.loc[ser_clp_weighted < 0] = -ser_clp_weighted / ser_clp_weighted[ser_clp_weighted < 0].sum()
            ser_clp_weighted.loc[ser_clp_weighted > 0] = ser_clp_weighted / ser_clp_weighted[ser_clp_weighted > 0].sum()
            ser_result = ser_clp_weighted.copy()
        else:
            ser_result = pd.Series(np.NaN, index = ser_clp_weighted.index)
        ### Results output:
        return ser_result
    ### Defining function for raw factor signed normalizing (weights are used for the sufficiency check only):
    def get_normalized_factor(df_to_measure):
        """Return the factor with the negative side rescaled to sum to -1 and the positive side to +1."""
        ser_clp_weighted = df_to_measure['Factor'] * df_to_measure['Weight'].transform(np.sqrt)
        ### Checking data sufficiency:
        if (ser_clp_weighted.count() > 0):           
            ### Factor signed normalized (assigned through .loc on the column selected from the group, no copy is taken):
            ser_factor_normalized = df_to_measure['Factor']
            ser_factor_normalized.loc[ser_factor_normalized < 0] = -ser_factor_normalized / ser_factor_normalized[ser_factor_normalized < 0].sum()
            ser_factor_normalized.loc[ser_factor_normalized > 0] = ser_factor_normalized / ser_factor_normalized[ser_factor_normalized > 0].sum()
            ser_result = ser_factor_normalized
        else:
            ser_result = pd.Series(np.NaN, index = ser_clp_weighted.index) 
        ### Results output:
        return ser_result            
    ### Defining function for returns for constant leverage portfolio:
    def get_normalized_return(df_to_measure):
        """Return the 'Return' column unchanged when the group has any non-null CLP weight, else NaN."""
        ser_clp_weighted = df_to_measure['Factor'] * df_to_measure['Weight'].transform(np.sqrt)
        ### Checking data sufficiency:
        if (ser_clp_weighted.count() > 0):           
            ### Return is passed through as is:  
            ser_result = df_to_measure['Return']
        else:
            ser_result = pd.Series(np.NaN, index = ser_clp_weighted.index)            
        ### Results output:
        return ser_result  
    ### Defining MatLab style prctile function (NOTE(review): not called within this function):
    def prctile(ser_to_perc, p):
        """Interpolate the value at fraction p, placing the k-th of n sorted values at (k - 0.5) / n."""
        ### Sorted list preparing:
        list_to_perc = ser_to_perc.dropna().values
        list_sorted = np.sort(list_to_perc)
        ### Length calculating:
        num_len = len(list_to_perc)    
        ### Prctile calculating:
        num_result = np.interp(np.array(p), np.linspace(1 / (2 * num_len), (2 * num_len - 1) / (2 * num_len), num_len), list_sorted)
        ### Results output:
        return num_result
    ### Defining quintile bins distribution (NOTE(review): not called within this function; reads num_precision, which is not assigned here):
    def quintile_distribution(ser_iter_group, num_bins = 5, bool_right_included = True, bool_include_lowest = True, bool_populate_last = True):
        """Assign each value to one of num_bins bins (labels 0 .. num_bins - 1) cut at equally spaced percentiles."""
        ### Bins preparing:
        list_bin = list(np.arange(0, 100, 100 / num_bins))[1 : ]
        list_bin = [round(iter_element / 100, 2) for iter_element in list_bin]
        list_bin = [prctile(ser_iter_group, iter_element) for iter_element in list_bin]
        list_bin = [ser_iter_group.min() - abs(ser_iter_group.min())] + list_bin + [ser_iter_group.max() + abs(ser_iter_group.max())]
        list_bin = [round(iter_element, num_precision) for iter_element in list_bin]
        if bool_populate_last:
            list_bin[-2] -= 10 ** (-num_precision)
        ### Bins distribution:
        ser_iter_distribution = pd.cut(ser_iter_group, bins = list_bin, labels = range(num_bins), 
                                       right = bool_right_included, include_lowest = bool_include_lowest, precision = num_precision)
        ### Results output:
        return ser_iter_distribution 
    ### Falling back to unit weights when no weight is available at all:
    if (ser_weight.count() == 0):
        ser_weight = pd.Series(1, index = ser_factor.index)
    ### Region filter dropping:
    ser_factor = ser_factor.reset_index('Market', drop = True)
    ser_return = ser_return.reset_index('Market', drop = True)
    ser_weight = ser_weight.reset_index('Market', drop = True)      
    ### Preparing shifted returns:
    idx_date_range = ser_return.index.get_level_values(0).unique()
    ser_return_shifted = ser_return.groupby('Country', group_keys = False).apply(date_reindex, idx_date_range, return_shift)
    ### Preparing combined vectors for measures calculating (only complete rows enter the per-date normalizations):
    df_to_measure = pd.concat([ser_factor, ser_return_shifted, ser_weight], axis = 1)
    df_to_measure.columns = ['Factor', 'Return', 'Weight']
    ser_clp_normalized = df_to_measure.dropna().groupby('Date', group_keys = False).apply(get_normalized_clp)
    ser_factor_normalized = df_to_measure.dropna().groupby('Date', group_keys = False).apply(get_normalized_factor)
    ser_return_normalized = df_to_measure.dropna().groupby('Date', group_keys = False).apply(get_normalized_return)
    ### CLP stats calculating (Static = Weights Sum * Average Return; Dynamic = Total - Static):
    dict_clp_stats['Average Bias'] = ser_factor_normalized.groupby('Country').mean()
    dict_clp_stats['Weights Sum'] = ser_clp_normalized.groupby('Country').sum()
    dict_clp_stats['Average Return'] = ser_return_normalized.groupby('Country').mean()   
    dict_clp_stats['Static Contribution'] = dict_clp_stats['Weights Sum'] * dict_clp_stats['Average Return']
    dict_clp_stats['Total Contribution'] = (ser_clp_normalized * ser_return_normalized).groupby('Country').sum()
    dict_clp_stats['Dynamic Contribution'] = dict_clp_stats['Total Contribution'] - dict_clp_stats['Static Contribution'] 
    ### CLP active weights calculating (missing country-date positions are treated as zero weights):
    ser_clp_delta = ser_clp_normalized.unstack('Date').stack('Date', dropna = False).swaplevel().sort_index(level = ['Date', 'Country'])
    ser_clp_delta = ser_clp_delta.fillna(0)
    ### Sum over countries of absolute time-average weights:
    num_clp_mean = ser_clp_delta.groupby('Country').mean().abs().sum()
    ### Deviations of weights from each country's own time-average:
    ser_clp_delta = ser_clp_delta.groupby('Country').apply(lambda iter_group: iter_group - iter_group.mean())
    ### Average over dates of the share: sum |deviation| / (sum |deviation| + num_clp_mean):
    num_clp_delta = (ser_clp_delta.abs().groupby('Date').sum() / (ser_clp_delta.abs().groupby('Date').sum() + num_clp_mean)).mean()
    df_clp_stats = pd.concat(dict_clp_stats, axis = 1).reindex(df_to_measure.index.get_level_values('Country').unique()).sort_index()
    ### Preparing sum (contribution columns only, the first three columns are left empty):
    df_clp_sum = pd.DataFrame([[np.NaN, np.NaN, np.NaN, df_clp_stats['Static Contribution'].sum(), 
                               df_clp_stats['Total Contribution'].sum(), df_clp_stats['Dynamic Contribution'].sum()]], 
                              index = ['Sum'], columns = df_clp_stats.columns)
    ### Preparing expected based on active weights (total contribution split by num_clp_delta into static and dynamic parts):
    df_clp_expected = pd.DataFrame([[np.NaN, np.NaN, np.NaN, 
                                     df_clp_stats['Total Contribution'].sum() * (1 - num_clp_delta), np.NaN, df_clp_stats['Total Contribution'].sum() * num_clp_delta]], 
                              index = ['Expected based on active weights =>'], columns = df_clp_stats.columns)    
    ### Adding totals:
    df_clp_stats = pd.concat([df_clp_stats, df_clp_sum, df_clp_expected], axis = 0, join = 'inner')
    ### CLP Bias calculating (NOTE(review): quartile_distribution is not defined in this function -- presumably module-level; verify):
    ser_clp_quintile = ser_clp_normalized.groupby('Date', group_keys = False).apply(quartile_distribution)
    df_clp_bias = ser_clp_quintile.to_frame()  
    df_clp_bias.columns = ['Bin']
    ### Membership indicator: 1 in the column of the assigned bin, 0 elsewhere after unstacking:
    df_clp_bias['Quintile'] = 1
    df_clp_bias = df_clp_bias.set_index('Bin', append = True).unstack('Bin').fillna(0).droplevel(level = 0, axis = 1)
    df_clp_bias.columns = list(df_clp_bias.columns)  
    df_clp_bias = df_clp_bias[list_bin_labels]
    ### Share of dates each country spent in each bin:
    df_clp_bias = df_clp_bias.groupby('Country').mean()
    df_clp_bias.loc[:, 'Q5 - Q1'] = df_clp_bias['Q5'] - df_clp_bias['Q1']   
    df_clp_bias = df_clp_bias.reindex(df_to_measure.index.get_level_values('Country').unique()).sort_index()
    ### Self-assignment below has no effect:
    df_clp_bias = df_clp_bias
    ### Output results:
    return (df_clp_stats, df_clp_bias)

In [19]:
### DEFINING SPECIAL QTL STATS

def special_qtl_stats(ser_factor, ser_return, return_shift = 0, num_bins = 10):
    """
    Calculate, for every date, the number of observations and the mean demeaned return per factor bin.

    Parameters
    ----------
    ser_factor, ser_return : pd.Series
        Series whose index carries 'Country' and 'Market' levels; index level 0 is used as the date
        axis and the joined frame is grouped by 'Date'. The 'Market' level is dropped before joining.
    return_shift : int
        Within each country the return series is shifted by -return_shift rows.
    num_bins : int
        Number of percentile bins the factor values of each date are distributed into.

    Returns
    -------
    tuple (df_bins_distribution, df_bins_return_mean)
        The per-date stats frame split by column position: the first num_bins columns are returned
        as the distribution part, the remaining columns as the mean return part.

    Notes
    -----
    NOTE(review): the positional split assumes every date contributes num_bins entries to each part;
    confirm the behavior for dates where some bin receives no observations.
    """
    ### Declaring local constants & variables:
    All = slice(None)
    num_precision = 5 # For quintile bins rounding and borders controlling
    ### Defining date index shifting function:
    def date_reindex(iter_group, idx_date_range, return_shift = 0):
        """Reindex one country's series to idx_date_range and shift it by -return_shift rows."""
        ser_iter_result = iter_group.unstack('Country').reindex(idx_date_range).sort_index().shift(-return_shift).stack('Country', dropna = False).squeeze()
        ### Results output:
        return ser_iter_result    
    ### Defining MatLab style prctile function:
    def prctile(ser_to_perc, p):
        """Interpolate the value at fraction p, placing the k-th of n sorted values at (k - 0.5) / n."""
        ### Sorted list preparing:
        list_to_perc = ser_to_perc.dropna().values
        list_sorted = np.sort(list_to_perc)
        ### Length calculating:
        num_len = len(list_to_perc)    
        ### Prctile calculating (p is a fraction here: the caller divides percents by 100 beforehand):
        num_result = np.interp(np.array(p), np.linspace(1 / (2 * num_len), (2 * num_len - 1) / (2 * num_len), num_len), list_sorted)
        ### Results output:
        return num_result
    ### Defining quintile bins distribution:
    def quintile_distribution(ser_iter_group, num_bins = 5, bool_right_included = True, bool_include_lowest = True, bool_populate_last = True):
        """Assign each value to one of num_bins bins (labels 0 .. num_bins - 1) cut at equally spaced percentiles."""
        ### Bins preparing (interior borders at equal percent steps, converted to fractions and then to values):
        list_bin = list(np.arange(0, 100, 100 / num_bins))[1 : ]
        list_bin = [round(iter_element / 100, 2) for iter_element in list_bin]
        list_bin = [prctile(ser_iter_group, iter_element) for iter_element in list_bin]
        ### Outer borders: min - |min| below and max + |max| above:
        list_bin = [ser_iter_group.min() - abs(ser_iter_group.min())] + list_bin + [ser_iter_group.max() + abs(ser_iter_group.max())]
        list_bin = [round(iter_element, num_precision) for iter_element in list_bin]
        ### Optionally lowering the last interior border by one rounding unit:
        if bool_populate_last:
            list_bin[-2] -= 10 ** (-num_precision)
        ### Bins distribution:
        ser_iter_distribution = pd.cut(ser_iter_group, bins = list_bin, labels = range(num_bins), 
                                       right = bool_right_included, include_lowest = bool_include_lowest, precision = num_precision)
        ### Results output:
        return ser_iter_distribution    

    ### Defining qtl stats generator:
    def get_qtl_stats(df_to_measure):
        """Return bin counts followed by per-bin mean demeaned returns for one date cross-section."""
        df_to_measure = df_to_measure[['Factor', 'Return', 'Constant']].dropna()
        if len(df_to_measure.index):
            ### Returns are demeaned within the date, factor is rounded to the bin border precision:
            df_to_measure['Return'] = df_to_measure['Return'] - df_to_measure['Return'].mean()
            df_to_measure['Factor'] = df_to_measure['Factor'].round(num_precision)
            ### Distribution factor values between quintile bins:
            ser_qtl_bins = quintile_distribution(df_to_measure['Factor'], num_bins, bool_right_included = True, bool_include_lowest = True, bool_populate_last = True)
            ser_qtl_bins.name = 'Bin'
            df_to_measure = df_to_measure.join(ser_qtl_bins)        
            ### Bins distribution (sum of the unit 'Constant' column per bin, i.e. observations count):
            df_qtl_bins = df_to_measure.loc[(All), ['Constant', 'Bin']]
            df_qtl_bins.set_index('Bin', append = True, inplace = True)
            ser_qtl_bins = df_qtl_bins.unstack('Bin').sum(axis = 0).droplevel(0).squeeze()
            ### Return mean for each bin:
            df_qtl_rets = df_to_measure.loc[(All), ['Return', 'Bin']]
            df_qtl_rets.set_index('Bin', append = True, inplace = True)
            ser_qtl_rets = df_qtl_rets.unstack('Bin').mean(axis = 0).droplevel(0).squeeze()
        else:
            ### No complete rows for the date: NaN placeholders for every bin
            ser_qtl_bins = pd.Series(np.NaN, index = range(num_bins))
            ser_qtl_rets = pd.Series(np.NaN, index = range(num_bins))            
        ### Results output (counts first, mean returns second, concatenated into one series):
        ser_qtl_bins.name = 'Distribution'                
        ser_qtl_rets.name = 'Mean return'
        return pd.concat([ser_qtl_bins, ser_qtl_rets], axis = 0)      
        
    ### Region filter dropping:
    ser_factor = ser_factor.reset_index('Market', drop = True)
    ser_return = ser_return.reset_index('Market', drop = True)
    ### Preparing shifted returns:
    idx_date_range = ser_return.index.get_level_values(0).unique()
    ser_return_shifted = ser_return.groupby('Country', group_keys = False).apply(date_reindex, idx_date_range, return_shift)
    ### Preparing combined vectors for measures calculating:
    df_to_measure = pd.concat([ser_factor, ser_return_shifted], axis = 1)
    df_to_measure.columns = ['Factor', 'Return']
    df_to_measure['Constant'] = 1
    df_bins_stats = df_to_measure.groupby('Date', group_keys = False).apply(get_qtl_stats)
    ### Output results (split by column position: first num_bins columns vs the rest):
    df_bins_distribution = df_bins_stats.iloc[All, : num_bins]
    df_bins_return_mean = df_bins_stats.iloc[All, num_bins :]    
    return (df_bins_distribution, df_bins_return_mean)

In [20]:
### DEFINING SINGLE EFFICACY MEASURE FOR MULTIPLE FACTORS
    
def multiple_factor_single_efficacy_measure_stats(df_factors, ser_return, ser_weight, str_measure, num_back_period = 99, 
                                                  num_horizon = 12, list_region_xmo = ['DM', 'EM', 'FM']): 
    """
    Evaluate one efficacy measure for every factor column over several return horizons.

    For each factor the measure is calculated by single_factor_multiple_efficacy_measures for the
    1, 2, 3, 6, 9 and 12 month horizons not exceeding num_horizon (return shift = horizon - 1),
    summarized by measure_stats, and complemented with the average cross-sectional correlation
    between the factor and its next-date values.

    Parameters
    ----------
    df_factors : pd.DataFrame
        One column per factor; the index has three levels, filtered on the third one by
        list_region_xmo. Level 0 is used as the date axis, level 1 as the universe;
        'Date', 'Country' and 'Market' level names are referenced.
    ser_return, ser_weight : pd.Series
        Forwarded as is to single_factor_multiple_efficacy_measures.
    str_measure : str
        Name of the efficacy measure to calculate.
    num_back_period : int
        Back period forwarded to measure_stats.
    num_horizon : int
        Longest horizon (in months) to evaluate.
    list_region_xmo : list
        Market codes to keep (only read, never modified).

    Returns
    -------
    tuple (df_factors_result, df_factors_vectors)
        df_factors_result : one row per factor with 'autocorr', 'coeff_<horizon>' and 't_<horizon>' columns.
        df_factors_vectors : measure time series of all the factors, one column per horizon.

    Notes
    -----
    The module-level list_truncate is passed to single_factor_multiple_efficacy_measures.
    """
    ### Zero-based return shifts for the horizons that fit into num_horizon:
    list_shifts = [iter_month - 1 for iter_month in (1, 2, 3, 6, 9, 12) if iter_month <= num_horizon]
    ### Helper: expanding a single date cross-section to the full universe:
    def expand_to_universe(df_date_slice, idx_members):
        df_wide = df_date_slice.unstack('Date')
        df_wide = df_wide.reindex(idx_members).sort_index()
        return df_wide.stack('Date', dropna = False)
    ### Helper: reindexing one country's series to the full date axis and shifting it:
    def align_and_shift(ser_country, idx_dates, num_shift = 0):
        df_wide = ser_country.unstack('Country').reindex(idx_dates).sort_index()
        return df_wide.shift(-num_shift).stack('Country', dropna = False).squeeze()
    ### Helpers: dropping the first / the last observation of a country:
    def drop_first(ser_country):
        return ser_country.iloc[1 : ]
    def drop_last(ser_country):
        return ser_country.iloc[: -1]
    ### Helper: cross-sectional correlation of the paired columns for a single date:
    def paired_corr(df_date_slice):
        return df_date_slice['Corr_factor_minus'].corr(df_date_slice['Corr_factor_plus'])
    ### Region filtering and expanded universe preparing:
    df_region = df_factors.loc[pd.IndexSlice[:, :, list_region_xmo], :]
    idx_dates = df_region.index.get_level_values(0).unique()
    idx_members = df_region.index.get_level_values(1).unique()
    df_expanded = df_region.reset_index('Market', drop = True)
    df_expanded = df_expanded.groupby('Date', group_keys = False).apply(expand_to_universe, idx_members).swaplevel()
    ### Containers keyed by factor name:
    dict_stats_by_factor = {}
    dict_vectors_by_factor = {}
    dict_autocorr_by_factor = {}
    for iter_name in df_factors.columns:
        ### Measure vectors and their stats for each return shift:
        ser_region_factor = df_region[iter_name]
        dict_shift_stats = {}
        dict_shift_vectors = {}
        for iter_shift in list_shifts:
            df_shift_measure = single_factor_multiple_efficacy_measures(ser_region_factor, ser_return, ser_weight, [str_measure], iter_shift, list_truncate)
            df_shift_stats = measure_stats(df_shift_measure, [num_back_period])
            dict_shift_stats[iter_shift] = df_shift_stats.loc[['mean', 't_stat'], (str_measure, num_back_period)]
            dict_shift_vectors[iter_shift] = df_shift_measure
        ### Stats table: columns relabelled from shifts to horizons (shift + 1):
        df_stats = pd.concat(dict_shift_stats, axis = 1)
        df_stats.columns = df_stats.columns + 1
        dict_stats_by_factor[iter_name] = df_stats
        ### Vectors table: measure level dropped, columns relabelled to horizons:
        df_vectors = pd.concat(dict_shift_vectors, axis = 1)
        df_vectors.columns = df_vectors.columns.droplevel(1) + 1
        dict_vectors_by_factor[iter_name] = df_vectors
        ### Autocorrelation: pairing each observation with the next one of the same country:
        ser_full = df_expanded[iter_name]
        ser_lead = ser_full.groupby('Country', group_keys = False).apply(drop_first)
        ser_lead = ser_lead.sort_index(level = ['Date', 'Country'])
        ser_lag = ser_full.groupby('Country', group_keys = False).apply(drop_last)
        ser_lag = ser_lag.sort_index(level = ['Date', 'Country'])
        ### Shifting the leading series one step back to synchronize indices:
        ser_lead_shifted = ser_lead.groupby('Country', group_keys = False).apply(align_and_shift, idx_dates, 1)
        df_pairs = pd.concat([ser_lag, ser_lead_shifted], axis = 1)
        df_pairs.columns = ['Corr_factor_minus', 'Corr_factor_plus']
        num_autocorr = df_pairs.groupby('Date').apply(paired_corr).mean()
        dict_autocorr_by_factor[iter_name] = pd.Series(num_autocorr, index = ['autocorr'])
    ### Aggregating across factors:
    df_all_stats = pd.concat(dict_stats_by_factor, axis = 0)
    df_autocorr = pd.concat(dict_autocorr_by_factor, axis = 1).transpose()
    df_factors_vectors = pd.concat(dict_vectors_by_factor, axis = 0)
    ### Mean values table:
    df_coeff = df_all_stats.loc[pd.IndexSlice[:, 'mean'], :].reset_index(1, drop = True)
    df_coeff.columns = [f'coeff_{iter_column}' for iter_column in df_coeff.columns]
    ### T-statistics table:
    df_t_stat = df_all_stats.loc[pd.IndexSlice[:, 't_stat'], :].reset_index(1, drop = True)
    df_t_stat.columns = [f't_{iter_column}' for iter_column in df_t_stat.columns]
    ### Results output:
    df_factors_result = pd.concat([df_autocorr, df_coeff, df_t_stat], axis = 1)
    return (df_factors_result, df_factors_vectors)

In [21]:
### DEFINING EXTRACTION UNIVERSE DATA FROM MS EXCEL SOURCE

def ison_membership_converting(str_path_universe, date_end, bool_daily = False, int_backfill_months = 0):
    """
    Load the universe workbook and convert it to a (Date, Country) series of market labels.

    Parameters:
        str_path_universe: path to the MS Excel source; its first sheet is read, the first two columns form the index
            (renamed to 'Country', 'Date') and the 'Region' column carries the numeric market codes.
        date_end: last date of the business-month-end range every country is reindexed (and forward filled) to.
        bool_daily: if True, the monthly result is resampled to business days with forward filling for each country.
        int_backfill_months: if non-zero, for every region the countries that are members on the first date the region
            appears get that region label for this number of preceding business month ends (only where the original
            series has no value).

    Returns:
        pd.Series named 'Market' indexed by ('Date', 'Country') with values 'DM' / 'EM' / 'FM' or NaN
        (code 0 and missing codes are translated to NaN).
    """
    ### Defining business-month-end reindexation on country level:
    ### NOTE(review): the 'BM' alias is deprecated in recent pandas releases in favour of 'BME' - confirm the target pandas version before switching
    def country_modify(ser_raw_country, date_end):
        ser_res_country = ser_raw_country.droplevel(0).resample('MS').last().resample('BM').last()
        range_country = pd.date_range(ser_res_country.index[0], date_end, freq = 'BM')
        return ser_res_country.reindex(range_country).ffill()
    ### Markets encoding table (np.nan is used as the np.NaN alias no longer exists in NumPy 2.0):
    dict_markets = {50 : 'DM', 57 : 'EM', 504 : 'FM', 0: np.nan}
    ### Loading source file:
    df_raw_universe = pd.read_excel(io = str_path_universe, sheet_name = 0, header = 0, parse_dates = True, index_col = [0, 1],
                                 na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', 
                                             '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null'], keep_default_na = False)
    ### Converting source file:
    df_raw_universe.index.names = ['Country', 'Date']
    ### Missing codes are turned to 0 on a fresh series (no in-place filling of a column taken out of the source table):
    ser_raw_universe = df_raw_universe['Region'].fillna(0).rename('Market')
    ### By country reindexation and translation:
    ser_res_universe = ser_raw_universe.groupby('Country').apply(country_modify, date_end)
    ser_res_universe.index.names = ['Country', 'Date']
    ser_res_universe = ser_res_universe.replace(dict_markets).reorder_levels([1, 0]).sort_index() 
    ### Expanding membership for primary regions members by backfilling:
    if int_backfill_months:
        ### List of regions:
        list_region = list(ser_res_universe.dropna().unique())
        ### Initialising of collection of series with backfilled data for each region:
        list_ison_backfill = []
        ### Regions looping:
        for iter_region in list_region:
            ### Defining start of region date (index is sorted, so the first tuple carries the earliest date):
            date_first_valid = ser_res_universe.loc[ser_res_universe == iter_region].first_valid_index()[0]
            ### Creating dates index to backfilling (the start date itself is cut off):
            idx_date_backfill = pd.date_range(end = date_first_valid, periods = int_backfill_months + 1, freq = 'BM')[: -1]
            ### Creating primary countries index to backfilling:            
            idx_region_backfill = ser_res_universe.loc[ser_res_universe == iter_region].loc[date_first_valid, All].index.get_level_values('Country')
            ### Creating full index:
            idx_ison_backfill = pd.MultiIndex.from_product([idx_date_backfill, idx_region_backfill])
            ### Series with backfilled data:
            list_ison_backfill.append(pd.Series(iter_region, index = idx_ison_backfill))
        ### Combination of backfilled series and original ISON data (original values have the priority):    
        ser_res_universe = ser_res_universe.combine_first(pd.concat(list_ison_backfill, axis = 0)).sort_index()  
        ser_res_universe.index.names = ['Date', 'Country']
    ### Converting to daily frequency:
    if bool_daily:
        ser_res_universe = ser_res_universe.reset_index('Country').groupby('Country').resample('B').ffill()['Market'].swaplevel().sort_index()    
    ### Results output:
    ser_res_universe.name = 'Market'
    return ser_res_universe

In [22]:
### TESTING: CONSTANTS INITIALIZATION

### Selectors for the ad-hoc TESTING cells; the trailing comments keep the alternative values:
### Single date given as an ISO string:
str_test_date = '2008-02-29'
### Country codes to inspect (All = no country filter):
list_test_country = ['BH', 'OM'] # All
### Region selector; NOTE(review): currently holds the All slice rather than a string, despite the str_ prefix:
str_test_region = All # 'EM'
### Effective exchange rate option name:
str_test_eer = 'NEER'

In [23]:
### MAIN SCRIPT: BLOOMBERG STRUCTURED DATA & ISON MEMBERSHIP EXTRACTION (NO PRELIMINARY DATA USING)

### Every dataset is read from the same prepared HDF5 store; the keys are the str_key_* constants of the preparation cell:
### Monthly returns:
ser_returns = pd.read_hdf(str_path_bb_hdf, key = str_key_ret_monthly)
### Money market rates:
ser_mmr = pd.read_hdf(str_path_bb_hdf, key = str_key_mmr)
### FX rates (country level and demeaned versions):
ser_fx_country = pd.read_hdf(str_path_bb_hdf, key = str_key_fx_country)
ser_fx_rate_demeaned = pd.read_hdf(str_path_bb_hdf, key = str_key_fx_demeaned)
### Market capitalizations:
ser_mcap = pd.read_hdf(str_path_bb_hdf, key = str_key_mcap)
### Effective exchange rates (plain series and the versions stored under the *_sourced keys):
ser_reer = pd.read_hdf(str_path_bb_hdf, key = str_key_reer)
ser_neer = pd.read_hdf(str_path_bb_hdf, key = str_key_neer)
ser_reer_sourced = pd.read_hdf(str_path_bb_hdf, key = str_key_reer_sourced)
ser_neer_sourced = pd.read_hdf(str_path_bb_hdf, key = str_key_neer_sourced)
### Table stored under the XCRA key (its 'Exports' and 'GDP' columns are used later):
df_xcra_filled = pd.read_hdf(str_path_bb_hdf, key = str_key_xcra)
### ISON membership up to the measure end date (defaults are used: monthly frequency, no backfilling):
ser_ison = ison_membership_converting(str_path_universe, datetime.strptime(str_measure_date_end, '%Y-%m-%d'))

In [37]:
### MAIN SCRIPT: DATA PREPARING (NO PRELIMINARY DATA USING)

### List of countries with de-facto equal returns (to impact on hedged returns calculating):
### (mean absolute difference between LOC and USD returns is below the similarity threshold)
ser_ret_similarity_test = ser_returns.unstack('Currency').groupby('Country').apply(lambda df_country: (df_country['LOC'] - df_country['USD']).abs().mean())
set_ret_usd_only = set(ser_ret_similarity_test.loc[ser_ret_similarity_test < flo_returns_similarity].index)
### List of countries with insufficient data quantity (share of non-missing observations is below the threshold):
ser_ret_completeness_test = ser_returns.groupby('Country').apply(lambda ser_country: ser_country.count() / len(ser_country.index))
set_not_complete = set(ser_ret_completeness_test.loc[ser_ret_completeness_test < flo_returns_completeness].index)
### Filtering incomplete countries (labels are passed as a sorted list: pandas 2.x rejects sets as indexers):
if set_not_complete:
    ser_returns.loc[All, All, sorted(set_not_complete)] = np.nan
### Returns options preparing:
dict_ser_ret = {}
### Returns in local currency:
dict_ser_ret['LOC'] = ser_returns.loc['LOC', All, All].droplevel(0)
### Returns in USD:
dict_ser_ret['USD'] = ser_returns.loc['USD', All, All].droplevel(0)
### Hedged returns in local currency:
dict_ser_hedged = {}
### Filling data for countries with no MMR data (USD returns are taken as they are):
set_ison_countries = set(dict_ser_ret['LOC'].index.get_level_values(1).unique())
set_mmr_countries = set(ser_mmr.index.get_level_values(1).unique())
set_no_mmr_countries = (set_ison_countries - set_mmr_countries) | set_ret_usd_only
set_to_hedge_countries = set_mmr_countries - set_no_mmr_countries
dict_ser_hedged['No_MMR'] = dict_ser_ret['USD'].loc[All, sorted(set_no_mmr_countries)]
### Money Market rates shifting forward:
ser_mmr_shifted = ser_mmr.groupby('Country').shift(1)
### Filling data for other countries (countries having MMR but no returns are left out of the selection to avoid missing label errors):
df_ser_hedged = dict_ser_ret['LOC'].loc[All, sorted(set_to_hedge_countries & set_ison_countries)].to_frame('Returns LOC')
df_ser_hedged = df_ser_hedged.join(ser_mmr_shifted, how = 'left')
df_ser_hedged.columns = ['Returns LOC', 'MMR LOC']
### Hedged return = (1 + local return) * (1 + US rate / 12) / (1 + local rate / 12) - 1:
dict_ser_hedged['MMR_Based'] = df_ser_hedged.groupby('Country', group_keys = False)\
                               .apply(lambda df_country: (1 + df_country['Returns LOC']) * (1 + ser_mmr_shifted.loc[All, 'US'] / 12) / (1 + df_country['MMR LOC'] / 12) - 1)
#dict_ser_hedged['MMR_Based'] = df_ser_hedged.groupby('Country', group_keys = False)\
#                               .apply(lambda df_country: (1 + df_country['Returns LOC']) * (((1 + ser_mmr.loc[All, 'US']) / (1 + df_country['MMR LOC'])) ** (1 /12)) - 1)
### Aggregating hedged returns:
dict_ser_ret['HEDGED'] = pd.concat(dict_ser_hedged).droplevel(0).sort_index()
### Effective exchange rates options preparing:
### Common routine for all sources: pivoting by country, reindexing to the source date range, limited forward filling and stacking back:
def eer_source_fill(ser_eer_raw):
    """Return the series reindexed to idx_source_date_range with gaps forward filled up to int_eer_fill_limit rows."""
    df_eer_wide = ser_eer_raw.unstack('Country').reindex(idx_source_date_range)
    return df_eer_wide.ffill(limit = int_eer_fill_limit).stack('Country').sort_index()
### Sources forward filling and reindexing:
ser_reer_source = eer_source_fill(ser_reer)
ser_neer_source = eer_source_fill(ser_neer)
ser_fx_source = eer_source_fill(ser_fx_rate_demeaned)
ser_export_source = eer_source_fill(df_xcra_filled['Exports'])
### Options collection (REER goes first):
dict_ser_eer = {'REER': ser_reer_source}
if not bool_neer_raw:
    ### Alternative NEER usage - the option is combined from three sources:
    ### All ISON, REER and NEER countries:
    set_ison = set(ser_ison.dropna().index.get_level_values('Country'))
    set_reer_all = set(ser_reer.dropna().index.get_level_values('Country'))
    set_neer_all = set(ser_neer.dropna().index.get_level_values('Country'))
    ### Countries, where REER has monthly frequency (IMF or BIS sourced):
    set_reer_monthly = set(ser_reer_sourced.loc[All, All, ['IMF', 'BIS']].index.get_level_values(1))
    ### REER part: REER countries that are not monthly sourced:
    set_reer_st = set_reer_all - set_reer_monthly
    ### NEER part: monthly REER countries having NEER data (a set, despite the historical ser_ prefix):
    ser_neer_st = set_reer_monthly & set_neer_all
    ### FX part: the rest of ISON countries:
    set_fx_st = set_ison - (set_reer_st | ser_neer_st)
    ### Converting sets to sorted lists:
    list_reer_st = sorted(set_reer_st)
    list_neer_st = sorted(ser_neer_st)
    list_fx_st = sorted(set_fx_st)
    ### Stitching the parts together:
    list_neer_parts = [ser_reer_source.loc[All, list_reer_st], ser_neer_source.loc[All, list_neer_st], ser_fx_source.loc[All, list_fx_st]]
    dict_ser_eer['NEER'] = pd.concat(list_neer_parts).sort_index()
else:
    ### Simple NEER usage:
    dict_ser_eer['NEER'] = ser_neer_source
dict_ser_eer['EXPORT'] = ser_export_source    
### Concepts options preparing:
### XCRA concept data shifting:
df_xcra_shifted = df_xcra_filled.groupby('Country').shift(int_concept_lag)
### XCRA concepts calculating:
dict_ser_concept = {}
#dict_ser_concept['EXPIMP_GDP_rate'] = (df_xcra_shifted['Imports'] + df_xcra_shifted['Exports']) / df_xcra_shifted['GDP']
dict_ser_concept['EXP_GDP_rate'] = df_xcra_shifted['Exports'].div(df_xcra_shifted['GDP'])
#dict_ser_concept['CA_GDP_rate'] = df_xcra_shifted['Current Account'] / df_xcra_shifted['GDP']
### XCRA concepts adjusting: scaling, flooring before the logarithm and clipping of the logged values:
for iter_concept in dict_ser_concept:
    ser_iter_scaled = dict_ser_concept[iter_concept] / int_concept_divider
    ### Values at or below -1 are moved to -0.99 to keep the logarithm argument positive:
    ser_iter_scaled = ser_iter_scaled.mask(ser_iter_scaled <= -1, -0.99)
    ser_iter_logged = np.log(1 + ser_iter_scaled)
    dict_ser_concept[iter_concept] = np.maximum(int_concept_min, np.minimum(int_concept_max, ser_iter_logged))
### Neutral concept adding (unit multiplicator on the same index):
dict_ser_concept['NO_CONCEPT'] = pd.Series(1, index = dict_ser_concept['EXP_GDP_rate'].index)
### Concept series renaming:
for iter_concept in dict_ser_concept:
    dict_ser_concept[iter_concept].name = 'Multiplicator'

In [25]:
### TESTING: SOURCE DISTRIBUTION EXPORT:

#pd.concat([pd.Series('REER', index = list_reer_st), pd.Series('NEER', index = list_neer_st), pd.Series('FX', index = list_fx_st)], axis = 0).sort_index()\
#  .to_excel('Data_Files/Test_Files/ST_EER_Sources.xlsx', merge_cells = False)

#ser_reer_sourced.loc['2020-08-31', All, All].droplevel(0).reset_index('Source').drop('EER', axis = 1).squeeze()\
#  .to_excel('Data_Files/Test_Files/LT_EER_Sources.xlsx', merge_cells = False)

In [26]:
### TESTING: MINIMAL VALUE CONTROL:

### NOTE(review): despite the cell title, the share below is of observations sitting exactly at the upper cap int_concept_max
ser_concept_test = df_xcra_shifted['Exports'].div(df_xcra_shifted['GDP']).div(int_concept_divider)
ser_logged_test = np.minimum(int_concept_max, np.log(1 + ser_concept_test))
### Capped observations number divided by non-missing observations number:
ser_logged_test.eq(int_concept_max).sum() / ser_logged_test.count()

0.3544357469015003

In [27]:
### MAIN SCRIPT: FACTORS CALCULATING (NO PRELIMINARY DATA USING)

### Defining the way to mount the concept multiplicator to the basis factor:
def value_to_rank(ser_group, int_scale):
    """
    Rescale group values linearly to the [1, int_scale + 1] interval.

    The group minimum is mapped to 1 and the group maximum to int_scale + 1;
    a group with equal minimum and maximum gives missing values (zero range division).
    """
    flo_group_min = ser_group.min()
    flo_group_range = ser_group.max() - flo_group_min
    ser_scaled = (ser_group - flo_group_min) / flo_group_range * int_scale
    return np.maximum(0, ser_scaled) + 1
### Containers for preliminary data (transitional keys are built from the combination parameters):
dict_trans_ret_hdf = {}
dict_trans_mcap_hdf = {}
dict_trans_factor_hdf = {}
dict_test_factor_raw = {}
dict_test_factor_std = {}
### Factors looping (the last two combinations are the aggregated ones and are built in the combined factors cell):
for iter_factor in list(dict_combinations.keys())[ : -2]: # ['SHORT_TERM_MIXED']: # 
    ### Parameters loading:
    iter_term = dict_combinations[iter_factor][0]
    iter_algo = dict_combinations[iter_factor][1]
    iter_concept = dict_combinations[iter_factor][2]
    iter_eer = dict_combinations[iter_factor][3]
    iter_ret = dict_combinations[iter_factor][4]    
    print(f'{iter_term} / {iter_algo} / {iter_concept} / {iter_eer} / {iter_ret}')                
    ### Iteration data loading:
    ser_iter_ret = dict_ser_ret[iter_ret]
    ser_iter_concept = dict_ser_concept[iter_concept]
    ser_iter_eer = dict_ser_eer[iter_eer]
    ### Factor matrix creating (explicit dtype: the default dtype of a data-less series depends on the pandas version):
    ser_iter_factor = pd.Series(dtype = 'float64', 
                                index = pd.MultiIndex.from_product([idx_measure_date_range, ser_ison.index.get_level_values(1).unique()])).sort_index()
    ser_iter_factor.index.set_names(['Date', 'Country'], inplace = True)                
    ### Mean factor calculating:
    if (iter_algo == 'MEAN'):
        ### Mean momentum parameters:
        int_iter_numer_win = dict_numer_ma_win[iter_term]
        int_iter_denom_win = dict_denom_ma_win[iter_term]
        int_iter_numer_min = dict_numer_ma_min[iter_term]
        int_iter_denom_min = dict_denom_ma_min[iter_term]                    
        ### Mean factor calculation:
        ser_iter_factor = -ser_iter_factor.groupby('Country').transform(rolling_cond_mean_momentum, ser_iter_eer, 
                                                                        int_iter_numer_win, int_iter_numer_min, int_iter_denom_win, int_iter_denom_min)
    else:       
        ### Relative changes of the effective exchange rate within each country:
        ser_iter_delta = ser_iter_eer.groupby('Country').diff() / ser_iter_eer.groupby('Country').shift()   
        ser_iter_delta = ser_iter_delta.replace([np.inf, -np.inf], np.nan)
        ### Extremum FX returns zeroing in case of short-term factor:
        ### NOTE(review): missing deltas of FX sourced countries also fail the boundaries test and so become 0.0 - confirm it is intended
        ### NOTE(review): list_fx_st is defined only when bool_neer_raw is False
        if (iter_factor == 'SHORT_TERM_MIXED'):
            ser_iter_delta.loc[All, list_fx_st] = ser_iter_delta.loc[All, list_fx_st]\
                                                            .where(((ser_iter_delta >= list_extreme_boundaries[0]) & (ser_iter_delta <= list_extreme_boundaries[1])), 0.0)
        ### Momentum parameters:
        int_mom_hl = dict_mom_hl[iter_term] * flo_exp_weight_month
        int_mom_win = int_mom_length * ser_work_periods['Year', 'D']
        int_mom_min = dict_mom_min[iter_term]
        ### Weights array:
        list_weight = list(map(lambda iter_num: exp_weight_single(int_mom_hl, iter_num), range(int_mom_win)))[::-1]
        ### Momentum factor calculation:
        ser_iter_factor = ser_iter_factor.groupby('Country').transform(rolling_cond_weighted_mean, ser_iter_delta, int_mom_win, int_mom_min, list_weight, False)
    ### Factor ISONing (performed for both algorithms: regions clearing below needs the Market level in the factor index):
    ser_iter_factor = ser_iter_factor.to_frame().join(ser_ison, how = 'left').set_index('Market', append = True).squeeze()
    ser_iter_factor.name = 'Factor'
    ### Returns shifting (next period return is put against the current date) and ISONing:
    ser_iter_ret = ser_iter_ret.groupby('Country').shift(periods = -1).to_frame().join(ser_ison, how = 'left').set_index('Market', append = True).squeeze()
    ### Concept multiplicator ISONing:
    ser_iter_concept = ser_iter_concept.to_frame().join(ser_ison, how = 'left').set_index('Market', append = True).squeeze()
    ### Regions clearing:
    ser_iter_ret = ser_iter_ret.loc[idx_measure_date_range, All, list_ison]
    ser_iter_mcap = ser_mcap.loc[idx_measure_date_range, All, list_ison]
    ser_iter_factor = ser_iter_factor.loc[idx_measure_date_range, All, list_ison]
    ser_iter_concept = ser_iter_concept.loc[idx_measure_date_range, All, list_ison]
    ### Countries filtering:
    ser_iter_ret = ser_iter_ret.drop(list_countries_to_exclude, level = 'Country')
    ser_iter_mcap = ser_iter_mcap.drop(list_countries_to_exclude, level = 'Country')
    ser_iter_factor = ser_iter_factor.drop(list_countries_to_exclude, level = 'Country') 
    ser_iter_concept = ser_iter_concept.drop(list_countries_to_exclude, level = 'Country')
    ### Factor standartizing and multiplying by the concept multiplicator:
    dict_test_factor_raw[iter_factor] = ser_iter_factor
    dict_test_factor_std[iter_factor] = dict_factors_signs[iter_factor] * single_factor_standartize(ser_iter_factor, list_truncate, within_market = bool_within_market)
    ser_iter_factor_std = dict_factors_signs[iter_factor] \
                          * single_factor_standartize(ser_iter_factor, list_truncate, within_market = bool_within_market, flag_tha = 'monthly')[0]    
    ser_iter_factor_std.name = 'Factor'  
    if (iter_concept != 'NO_CONCEPT'):
#        ser_iter_concept_std = single_factor_standartize(ser_iter_concept, list_truncate, within_market = bool_within_market)
#        ser_iter_multiplied = ser_iter_factor_std * ser_iter_concept_std.groupby(['Date', 'Market']).transform(value_to_rank, 2 * list_truncate[-1])
        ser_iter_multiplied = ser_iter_factor_std * ser_iter_concept        
    else:
        ### Neutral concept: the standartized factor goes as is (otherwise the result of a previous iteration would be saved below):
        ser_iter_multiplied = ser_iter_factor_std
    ### Preliminary results saving:
    str_iter_key = '__'.join([iter_term, iter_algo, iter_concept, iter_eer, iter_ret])
    dict_trans_ret_hdf[str_iter_key] = ser_iter_ret
    dict_trans_mcap_hdf[str_iter_key] = ser_iter_mcap
    dict_trans_factor_hdf[str_iter_key] = ser_iter_multiplied                     

LONG_TERM / MOMENTUM / EXP_GDP_rate / REER / HEDGED
SHORT_TERM / MOMENTUM / EXP_GDP_rate / NEER / HEDGED
LONG_TERM / MOMENTUM / EXP_GDP_rate / EXPORT / HEDGED


In [89]:
### TESTING: CONTROL OF EXTREMUM VALUES CLEARING:

#idx_test = ser_iter_delta.loc[All, list_reer_st].loc[(ser_iter_delta < list_extreme_boundaries[0]) | (ser_iter_delta > list_extreme_boundaries[1])].index
#print(ser_iter_delta.loc[idx_test])
#ser_iter_delta.loc[All, list_reer_st] = ser_iter_delta.loc[All, list_reer_st]\
#                                                      .where(((ser_iter_delta >= list_extreme_boundaries[0]) & (ser_iter_delta <= list_extreme_boundaries[1])), 0.0)
#print(ser_iter_delta.loc[idx_test])
#ser_iter_delta[ser_iter_delta != 0]

            Country
1994-01-05  AR        -0.001018
            AU         0.001362
            BG        -0.021246
            BR         0.004516
            CA        -0.002680
                         ...   
2020-08-31  UG        -0.019174
            US        -0.000927
            VN        -0.010954
            ZA        -0.008353
            ZM        -0.007076
Length: 513969, dtype: float64

In [28]:
### TESTING: STANDALONE FACTORS EXPORT

### Standalone factors are put side by side (one column for each factor) and written to a test workbook.
### NOTE(review): the raw (commented) and the standartized variants target the same file path, so only one of them can be kept at a time:
#pd.concat(dict_test_factor_raw, axis = 1).to_excel('Data_Files/Test_Files/Revision_EER_factors.xlsx', merge_cells = False)
pd.concat(dict_test_factor_std, axis = 1).to_excel('Data_Files/Test_Files/Revision_EER_factors.xlsx', merge_cells = False)

In [29]:
### MAIN SCRIPT: COMBINED FACTORS CALCULATION

### Factor weightings initializing:
dict_weighted_factor = {}
### Multiplying standalone factors (all combinations except the last two) by averaging weights:
for iter_factor in list(dict_combinations.keys())[ : -2]: 
    str_iter_key = '__'.join(dict_combinations[iter_factor])
    dict_weighted_factor[iter_factor] = dict_trans_factor_hdf[str_iter_key] * dict_factors_weights[iter_factor]
### Taking first factor name to load Returns and Market Caps
str_standalone_key = list(dict_trans_factor_hdf.keys())[0]
### Aggregated factors looping (the last two combinations):
for iter_factor in list(dict_combinations.keys())[-2 : ]:
    ### Combination type is the last part of the factor name:
    str_iter_combo = iter_factor.split('_')[-1]
    ### Combo factor calculating (row-wise mean of the weighted factors):
    if (str_iter_combo == 'DOUBLE'):
        ser_combo_factor = pd.concat([dict_weighted_factor['LONG_TERM_EER'], dict_weighted_factor['SHORT_TERM_MIXED']], axis = 1).mean(axis = 1)
    elif (str_iter_combo == 'TRIPLE'):
        ser_combo_factor = pd.concat(dict_weighted_factor, axis = 1).mean(axis = 1)
    else:
        ### Unknown type must not silently reuse the combo factor of a previous iteration:
        raise ValueError('Unknown combined factor type: ' + str(iter_factor))
    ### Key constructing:
    str_iter_key = '__'.join([str_iter_combo] + ['COMBO'] * 4)
    ### Returns and Market Caps are the same as for the standalone factors:
    dict_trans_ret_hdf[str_iter_key] = dict_trans_ret_hdf[str_standalone_key]
    dict_trans_mcap_hdf[str_iter_key] = dict_trans_mcap_hdf[str_standalone_key]
    ### Combo factor standartizing:
    dict_trans_factor_hdf[str_iter_key] = single_factor_standartize(ser_combo_factor, list_truncate, within_market = bool_within_market)

In [None]:
### TESTING: COMBINED FACTORS EXPORT

#pd.concat(dict_trans_factor_hdf, axis = 1).to_excel('Data_Files/Test_Files/Revision_EER_factors.xlsx', merge_cells = False)
#pd.concat(dict_trans_factor_hdf, axis = 1).to_excel('Data_Files/Test_Files/Revision_EER_factors.xlsx', merge_cells = False)

In [30]:
### MAIN SCRIPT: SAVING TRANSITIONAL RESULTS (NO PRELIMINARY DATA USING)

### Each factor key gets three datasets (returns, market caps, factor values) appended to the transitional HDF5 file:
for iter_key in dict_trans_factor_hdf:
    list_iter_to_save = [(str_key_trans_ret, dict_trans_ret_hdf), 
                         (str_key_trans_mcap, dict_trans_mcap_hdf), 
                         (str_key_trans_factor, dict_trans_factor_hdf)]
    for str_iter_prefix, dict_iter_data in list_iter_to_save:
        dict_iter_data[iter_key].to_hdf(str_path_trans_hdf, key = '__'.join([str_iter_prefix, iter_key]), mode = 'a')

In [31]:
### TESTING: FACTOR SAVING:

### Factor values are reloaded from the transitional file (in-memory container is replaced by the saved data):
dict_trans_factor_hdf = {}
### Single factors looping:
for iter_factor in dict_combinations.keys(): # ['LONG_TERM']: # ['SHORT_TERM']: # 
    ### Parameters loading:
    iter_term = dict_combinations[iter_factor][0]
    iter_algo = dict_combinations[iter_factor][1]
    iter_concept = dict_combinations[iter_factor][2]
    iter_eer = dict_combinations[iter_factor][3]
    iter_ret = dict_combinations[iter_factor][4]    
    print(f'{iter_term} / {iter_algo} / {iter_concept} / {iter_eer} / {iter_ret}') 
    str_iter_key = '__'.join([iter_term, iter_algo, iter_concept, iter_eer, iter_ret])
    dict_trans_factor_hdf[str_iter_key] = pd.read_hdf(str_path_trans_hdf, key = str_key_trans_factor + '__' + str_iter_key) 
### Data aggregating:
df_factor_values = pd.concat(dict_trans_factor_hdf).to_frame()
df_factor_values.index.rename('KEY', level = 0, inplace = True)
df_factor_values.reset_index('KEY', inplace = True)
### Key parts labelling: the names follow the order the key is joined in (term, algorithm, concept, EER source, returns type)
### and match the labels of the efficacy results (previously the labels were shifted against the actual key parts):
list_factor_levels = ['Term', 'Algorythm', 'Interaction', 'Source', 'Returns']
ser_key_parts = df_factor_values['KEY'].str.split('__')
for iter_num, iter_level in enumerate(list_factor_levels):
    df_factor_values[iter_level] = ser_key_parts.str[iter_num]
ser_factor_values = df_factor_values.set_index(list_factor_levels, append = True).drop('KEY', axis = 1).squeeze()
ser_factor_values.name = 'Factor_Value'
ser_factor_values.to_excel(str_path_factors_xlsx, merge_cells = False)

LONG_TERM / MOMENTUM / EXP_GDP_rate / REER / HEDGED
SHORT_TERM / MOMENTUM / EXP_GDP_rate / NEER / HEDGED
LONG_TERM / MOMENTUM / EXP_GDP_rate / EXPORT / HEDGED
DOUBLE / COMBO / COMBO / COMBO / COMBO
TRIPLE / COMBO / COMBO / COMBO / COMBO


In [23]:
### FACTORS PERFORMING (PRELIMINARY DATA USED)

### Results containers (keyed by the transitional key extended with the regions label):
dict_measure_stats = {}
dict_measure_vectors = {}
### Labels of the result key parts in the order they are joined:
list_measure_levels = ['Term', 'Algorythm', 'Interaction', 'Source', 'Returns', 'Region']
### Regions sets to measure:
list_region_sets = [['DM', 'EM', 'FM'], ['DM', 'EM'], ['DM'], ['EM'], ['FM']] # [['DM', 'EM'], ['DM'], ['EM']]
### Common results post-processing: collecting, key splitting to separate index levels:
def measure_results_collect(dict_results, list_index_names, list_rest_levels):
    """Concatenate results, drop the container key level and replace the 'KEY' level with its '__' separated parts."""
    df_collected = pd.concat(dict_results).droplevel(0)
    df_collected.index.names = list_index_names
    df_collected = df_collected.reset_index()
    ser_key_parts = df_collected['KEY'].str.split('__')
    for iter_num, iter_level in enumerate(list_measure_levels):
        df_collected[iter_level] = ser_key_parts.str[iter_num]
    return df_collected.set_index(list_measure_levels + list_rest_levels).drop('KEY', axis = 1)
### Factors looping:
for iter_factor in dict_combinations.keys(): # ['LONG_TERM']: # ['SHORT_TERM']: # 
    ### Parameters loading:
    iter_term = dict_combinations[iter_factor][0]
    iter_algo = dict_combinations[iter_factor][1]
    iter_concept = dict_combinations[iter_factor][2]
    iter_eer = dict_combinations[iter_factor][3]
    iter_ret = dict_combinations[iter_factor][4] 
    ### Transitional key does not depend on the region:
    str_iter_key = '__'.join([iter_term, iter_algo, iter_concept, iter_eer, iter_ret]) 
    ### Regions looping:
    for iter_region in list_region_sets:
        str_iter_res = '__'.join([str_iter_key, ' & '.join(iter_region)])
        print(str_iter_res)
        ### Transitional data loading and filtering:
        ser_iter_ret = pd.read_hdf(str_path_trans_hdf, key = str_key_trans_ret + '__' + str_iter_key).loc[All, All, list_filter]
        ser_iter_mcap = pd.read_hdf(str_path_trans_hdf, key = str_key_trans_mcap + '__' + str_iter_key).loc[All, All, list_filter]
        ser_iter_factor = pd.read_hdf(str_path_trans_hdf, key = str_key_trans_factor + '__' + str_iter_key).loc[All, All, list_filter]
        ser_iter_factor.name = str_iter_res
        ### Measure calculating (returns are passed multiplied by 100):
        tup_iter_measure = multiple_factor_single_efficacy_measure_stats(ser_iter_factor.to_frame(), ser_iter_ret * 100, ser_iter_mcap,
                                                                         list_measure[0], list_back_period[0], int_horizon, iter_region)
        dict_measure_stats[str_iter_res], dict_measure_vectors[str_iter_res] = tup_iter_measure
### Results preparing (stats):
df_factor_measure_stats = measure_results_collect(dict_measure_stats, ['KEY'], [])
df_factor_measure_stats.to_hdf(str_path_efficacy_hdf, key = str_key_efficacy, mode = 'w')
### Results preparing (vectors):
df_measure_vectors = measure_results_collect(dict_measure_vectors, ['KEY', 'Date'], ['Date'])
df_measure_vectors.to_hdf(str_path_vectors_hdf, key = str_key_efficacy, mode = 'w')

LONG_TERM__MOMENTUM__EXP_GDP_rate__REER__HEDGED__DM & EM & FM
LONG_TERM__MOMENTUM__EXP_GDP_rate__REER__HEDGED__DM & EM
LONG_TERM__MOMENTUM__EXP_GDP_rate__REER__HEDGED__DM
LONG_TERM__MOMENTUM__EXP_GDP_rate__REER__HEDGED__EM
LONG_TERM__MOMENTUM__EXP_GDP_rate__REER__HEDGED__FM
SHORT_TERM__MOMENTUM__EXP_GDP_rate__NEER__HEDGED__DM & EM & FM
SHORT_TERM__MOMENTUM__EXP_GDP_rate__NEER__HEDGED__DM & EM
SHORT_TERM__MOMENTUM__EXP_GDP_rate__NEER__HEDGED__DM
SHORT_TERM__MOMENTUM__EXP_GDP_rate__NEER__HEDGED__EM
SHORT_TERM__MOMENTUM__EXP_GDP_rate__NEER__HEDGED__FM
LONG_TERM__MOMENTUM__EXP_GDP_rate__EXPORT__HEDGED__DM & EM & FM
LONG_TERM__MOMENTUM__EXP_GDP_rate__EXPORT__HEDGED__DM & EM
LONG_TERM__MOMENTUM__EXP_GDP_rate__EXPORT__HEDGED__DM
LONG_TERM__MOMENTUM__EXP_GDP_rate__EXPORT__HEDGED__EM
LONG_TERM__MOMENTUM__EXP_GDP_rate__EXPORT__HEDGED__FM
DOUBLE__COMBO__COMBO__COMBO__COMBO__DM & EM & FM
DOUBLE__COMBO__COMBO__COMBO__COMBO__DM & EM
DOUBLE__COMBO__COMBO__COMBO__COMBO__DM
DOUBLE__COMBO__COMBO__COMBO_

In [33]:
### TESTING: FACTORS EXTRACTING (PRELIMINARY DATA USED)

### Saved efficacy results are read back from the HDF5 files and exported to MS Excel without index cells merging.
### Measure vectors (both datasets are stored under the same key in the separate files):
df_factor_measure_vectors = pd.read_hdf(str_path_vectors_hdf, key = str_key_efficacy)
df_factor_measure_vectors.to_excel(str_path_vectors_xlsx, merge_cells = False)
### Measure statistics:
df_factor_measure_stats = pd.read_hdf(str_path_efficacy_hdf, key = str_key_efficacy)
df_factor_measure_stats.to_excel(str_path_efficacy_xlsx, merge_cells = False)

In [None]:
### TESTING: DATA PREPARATION

### Combination to inspect (alternatives are kept in the trailing comments):
iter_term = 'SHORT_TERM' # 'LONG_TERM' # 'COMBO' # 
iter_algo = 'MOMENTUM' # 'COMBO' # 'MEAN' # 
iter_concept = 'EXP_GDP_rate' # 'COMBO' # 'NO_CONCEPT' # 
iter_eer = 'NEER' # 'REER' # 'COMBO' # 
iter_ret = 'HEDGED' # 'COMBO' # 'LOC' # 'USD' #
iter_region = ['DM', 'EM', 'FM'] # ['DM'] # ['EM'] # ['FM'] #
### Transitional key constructing:
str_iter_key = '__'.join((iter_term, iter_algo, iter_concept, iter_eer, iter_ret))
print(str_iter_key)
### Transitional data loading, regions filtering and naming:
ser_iter_ret = pd.read_hdf(str_path_trans_hdf, key = '__'.join([str_key_trans_ret, str_iter_key])).loc[All, All, iter_region].rename('Returns')
ser_iter_mcap = pd.read_hdf(str_path_trans_hdf, key = '__'.join([str_key_trans_mcap, str_iter_key])).loc[All, All, iter_region].rename('Market Cap')
ser_iter_factor = pd.read_hdf(str_path_trans_hdf, key = '__'.join([str_key_trans_factor, str_iter_key])).loc[All, All, iter_region].rename('Factor')
### Converting to MatLab regions notification:
#dict_markets = {50 : 'DM', 57 : 'EM', 504 : 'FM'}
#dict_regions = dict(zip(dict_markets.values(), dict_markets.keys()))
#ser_iter_ret = ser_iter_ret.reset_index('Market').replace(dict_regions).set_index('Market', append = True).squeeze()
#ser_iter_mcap = ser_iter_mcap.reset_index('Market').replace(dict_regions).set_index('Market', append = True).squeeze()
#ser_iter_factor = ser_iter_factor.reset_index('Market').replace(dict_regions).set_index('Market', append = True).squeeze()
#ser_iter_ret.to_excel('Data_Files/Test_Files/Example_EER_Returns.xlsx', merge_cells = False)
#ser_iter_mcap.to_excel('Data_Files/Test_Files/Example_EER_Market_Caps.xlsx', merge_cells = False)
#ser_iter_factor.to_excel('Data_Files/Test_Files/Example_EER_Factor.xlsx', merge_cells = False)
### Multiple factors
#ser_mean_factor = ser_iter_factor
#ser_mom_factor = ser_iter_factor
#df_factors = pd.concat([ser_mean_factor,  ser_mom_factor], axis = 1)
#df_factors.columns = ['Mean', 'Momentum']
#df_factors.to_excel('Data_Files/Test_Files/Example_EER_Factors.xlsx', merge_cells = False)
#df_factors = ser_iter_ret.to_frame().join(ser_iter_mcap).join(df_factors).loc[All, ['Mean', 'Momentum']]
### Multiple returns:
#idx_data_short = pd.date_range('2000-01-01', '2000-04-01', freq = 'BM')
#dict_returns = {}
#for iter_ret in ['LOC', 'USD', 'HEDGED']:
#    ser_iter_ret = pd.read_hdf(str_path_trans_hdf, key = str_key_trans_ret + '__' + str_iter_key).loc[All, All, iter_region]
#    ser_iter_ret.name = iter_ret
#    dict_returns[iter_ret] = ser_iter_ret
#df_returns = pd.concat(dict_ret)    
#df_returns.index.names = ['Type'] + df_returns.index.names[1 : ]

#df_returns.loc[All, idx_data_short, All, All].to_excel('Data_Files/Test_Files/Example_EER_Returns_DF.xlsx', merge_cells = False)
#ser_iter_factor.name = 'Factor LT Mean No Concept'
#ser_iter_factor.loc[idx_data_short, All, All].to_excel('Data_Files/Test_Files/Example_EER_Factor_Filtered.xlsx', merge_cells = False)