In [1]:
### INITIALIZATION

### Importing standard modules and date-special modules:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
### DEFINING WEIGHTED AVERAGE

def weighted_average(ser_to_manage, ser_weight = None):
    """
    Weighted average of a series.

    Parameters:
        ser_to_manage: pd.Series with the values to average.
        ser_weight: optional pd.Series with the weights, matched to the values by index (left join);
            equal weights are used when it is None or empty.

    Returns:
        sum(value * weight) / sum(weight) over the rows where both value and weight are present,
        or NaN when there is no such row.
    """
    ### Adding equal weights, when weights are absent:
    if (ser_weight is None) or (len(ser_weight.index) == 0):
        ser_weight = pd.Series(1, index = ser_to_manage.index)
    ### Clearing and docking vectors (both series are renamed first, so the join does not depend on their own names):
    df_to_manage = ser_to_manage.rename('Data').to_frame().join(ser_weight.rename('Weight'), how = 'left').dropna()
    ### Result calculating:
    num_result = np.nan
    if (len(df_to_manage.index) > 0):
        num_result = df_to_manage['Data'].dot(df_to_manage['Weight']) / df_to_manage['Weight'].sum()

    return num_result

In [3]:
### DEFINING MULTI-STEP STANDARDIZATION FOR SEPARATE SERIES

def multistep_standartize(ser_to_manage, arr_truncate, ser_weight = None, reuse_outliers = False, center_result = True, full_result = False):
    """
    Standartize a series in several passes, truncating z-scores at the bound of each pass.

    On every pass the current data are demeaned with their weighted average and divided by their plain
    (unweighted) sample standard deviation; every value whose absolute z-score reaches the pass bound
    is set to +/- that bound.

    Parameters:
        ser_to_manage: pd.Series to standartize; NaN values are dropped before processing.
        arr_truncate: iterable of truncation bounds, one per pass, applied in the given order.
        ser_weight: optional pd.Series of weights matched to the data by index;
            equal weights are used when it is None or empty.
        reuse_outliers: when False, truncated values are saved to the result and excluded from the next passes;
            when True, they stay in the working vector and take part in the next passes.
        center_result: when True, the weighted average of the final vector is subtracted from it.
        full_result: when True, the lists of per-pass means and standard deviations are returned too.

    Returns:
        pd.Series named '<ser_to_manage.name>_standartized', or the tuple (series, arr_mean, arr_std)
        when full_result is True.
    """
    ### Arrays of iterations properties:
    arr_mean = []
    arr_std = []
    ### Adding equal weights, when weights are absent:
    if (ser_weight is None) or (len(ser_weight.index) == 0):
        ser_weight = pd.Series(1, index = ser_to_manage.index)
        ser_weight.name = 'Weight'
    ### Workhorse and resulting data vectors initialising:
    ser_data_iter = ser_to_manage.dropna().copy()
    ser_weight_iter = ser_weight.copy()
    ser_data_full = pd.Series(np.nan, index = ser_data_iter.index)
    ### Looping by boundaries array:
    for num_bound_iter in arr_truncate:
        ### Clearing and docking vectors:
        index_iter = ser_data_iter.index.intersection(ser_weight_iter.index)
        ser_data_iter = ser_data_iter[index_iter]
        ser_weight_iter = ser_weight_iter[index_iter]
        ### Properties calculating and saving (weighted mean, unweighted standard deviation):
        num_mean_iter = weighted_average(ser_data_iter, ser_weight_iter)
        num_std_iter = ser_data_iter.std()
        arr_mean.append(num_mean_iter)
        arr_std.append(num_std_iter)
        ser_data_iter = (ser_data_iter - num_mean_iter) / num_std_iter
        ### Standartizing (values at or beyond the bound are set to the signed bound):
        ser_data_iter[ser_data_iter.abs() >= num_bound_iter] = np.sign(ser_data_iter) * num_bound_iter
        if not (reuse_outliers):
            ### Saving to result and excluding from further calculations truncated values:
            ### NOTE(review): this relies on where(..., inplace = True) keeping the positions that are absent
            ### from the condition (values saved on earlier passes) - confirm when upgrading pandas.
            ser_data_full.where(ser_data_iter.abs() < num_bound_iter, np.sign(ser_data_iter) * num_bound_iter, inplace = True)
            ser_data_iter = ser_data_iter[ser_data_iter.abs() < num_bound_iter]
    ### Aggregating result:
    if (reuse_outliers):
        ser_data_full = ser_data_iter
    else:
        ser_data_full[ser_data_iter.index] = ser_data_iter
    ### Centering result:
    if (center_result):
        ser_result = ser_data_full - weighted_average(ser_data_full, ser_weight)
    else:
        ser_result = ser_data_full
    ### Result output:
    ser_result.name = str(ser_to_manage.name) + '_standartized'
    if (full_result):
        return (ser_result, arr_mean, arr_std)
    else:
        return ser_result

In [4]:
### DEFINING MULTI-STEP STANDARDIZATION BY MARKET FOR SEPARATE SERIES

def ison_standartize(ser_to_manage, arr_truncate, ser_weight = pd.Series(), reuse_outliers = False, center_result = True, full_result = False, within_market = False):
    """
    Multi-step standartization of a series, either over the whole universe or separately inside each market.

    Parameters:
        ser_to_manage: pd.Series to standartize; it needs a 'Market' index level when within_market is True.
        arr_truncate, ser_weight, reuse_outliers, center_result, full_result: passed to multistep_standartize unchanged.
        within_market: when True, multistep_standartize is applied to every 'Market' group separately.

    Returns:
        Whatever multistep_standartize returns for the full universe,
        or the groupby-apply combination of its per-market outputs.
    """
    ### Full universe standartizing:
    if not within_market:
        return multistep_standartize(ser_to_manage, arr_truncate, ser_weight, reuse_outliers, center_result, full_result)
    ### Within market standartizing:
    grouped_by_market = ser_to_manage.groupby(by = 'Market', group_keys = False)
    return grouped_by_market.apply(multistep_standartize, arr_truncate, ser_weight, reuse_outliers, center_result, full_result)

In [5]:
### DEFINING MULTI-STEP STANDARDIZATION BY MARKET FOR FULL FACTOR STACK

def single_factor_standartize(ser_factor, arr_truncate, ser_weight = None, reuse_outliers = False, center_result = True, within_market = False):
    """
    Standartize one factor date by date.

    Parameters:
        ser_factor: pd.Series of factor values with a 'Date' index level
            (and a 'Market' level when within_market is True).
        arr_truncate: truncation bounds passed to ison_standartize.
        ser_weight: optional pd.Series of weights joined to the factor by index (left join);
            equal weights are used when it is None or empty.
        reuse_outliers, center_result, within_market: passed to ison_standartize unchanged.

    Returns:
        pd.Series of standartized values carrying the name of ser_factor; every date group is
        reindexed to its original index, so rows without a result hold NaN.
    """
    ### Defining by date standartizing function:
    def by_date_standartize(df_date, arr_truncate, reuse_outliers, center_result, within_market):
        ### ISON standartizing:
        ser_date = ison_standartize(df_date['Factor'], arr_truncate, df_date['Weight'], reuse_outliers, center_result, False, within_market)
        ### Result output (restoring the rows dropped by the standartization):
        return ser_date.reindex(df_date.index)
    ### Weights preparing:
    if (ser_weight is None) or (len(ser_weight.index) == 0):
        ser_weight = pd.Series(1, index = ser_factor.index)
    ### Multi-step standartizing (both series are renamed first, so the join does not depend on their own names):
    df_factor = ser_factor.rename('Factor').to_frame().join(ser_weight.rename('Weight'), how = 'left')
    ser_result = df_factor.groupby('Date', group_keys = False).apply(by_date_standartize, arr_truncate, reuse_outliers, center_result, within_market)
    ### Results output:
    ser_result.name = ser_factor.name
    return ser_result

In [6]:
### DEFINING GROUP MULTI-STEP STANDARDIZATION BY MARKET FOR FULL FACTOR STACK FOR MULTIPLE FACTORS

def multi_factor_standartize(df_factor, arr_truncate, ser_weight = pd.Series(), reuse_outliers = False, center_result = True, within_market = False):
    """
    Standartize every column of a factor table with single_factor_standartize.

    Parameters:
        df_factor: pd.DataFrame with one factor per column.
        arr_truncate, ser_weight, reuse_outliers, center_result, within_market:
            passed to single_factor_standartize unchanged for every column.

    Returns:
        pd.DataFrame with the standartized factors concatenated column-wise under the original column names.
    """
    ### Single factor standartizing, keyed by the factor name:
    dict_standartized = {
        iter_factor : single_factor_standartize(df_factor[iter_factor], arr_truncate, ser_weight,
                                                reuse_outliers, center_result, within_market)
        for iter_factor in df_factor.columns
    }
    ### Concatenating to dataframe and results output:
    return pd.concat(dict_standartized, axis = 1)

In [7]:
### DEFINING EXPONENTIAL WEIGHT

def exp_weight_single(halflife_len = 3, num_element = 0):
    """
    Exponential decay weight of an element.

    Parameters:
        halflife_len: half-life in elements; it is rounded to an integer before use.
        num_element: distance of the element (a number, or anything numpy can exponentiate).

    Returns:
        The weight, equal to 1 for element 0 and halving every round(halflife_len) elements.
    """
    ### Importing standard modules:
    import math
    ### Decay multiplier of a single period for the rounded half-life:
    num_decay = math.exp(math.log(0.5) / round(halflife_len))
    ### Weight calculating and result output:
    return np.exp(num_element * math.log(num_decay))

In [8]:
### DEFINING EFFICACY MEASURES FOR SINGLE FACTOR

def single_factor_multiple_efficacy_measures(ser_factor, ser_return, ser_weight, arr_measure, return_shift = 0, arr_truncate = [2.5, 2.0]):
    """
    Calculate time series of cross-sectional efficacy measures for one factor.

    Parameters:
        ser_factor, ser_return, ser_weight: pd.Series with 'Date', 'Country' and 'Market' index levels
            (the dates are read from index level 0); the 'Market' level is dropped here.
            Unit weights are used when ser_weight has no non-NaN values.
        arr_measure: iterable of measure names:
            'ic_spearman', 'ic_pearson' - rank / linear correlation of factor and return;
            'fmb_eqw', 'fmb_weighted' - slope of the cross-sectional regression of return on the factor and a constant
                (OLS / WLS with square root of the weight as regression weight);
            'fmb_std_eqw', 'fmb_std_weighted' - the same for the factor standartized by date;
            'clp' - return of the constant leverage portfolio built from factor * sqrt(weight).
        return_shift: number of periods the returns are moved back in time inside each country.
        arr_truncate: truncation bounds for the by-date factor standartization.

    Returns:
        pd.DataFrame with one column per measure and one row per date; dates with fewer than
        two complete observations hold NaN.

    Raises:
        ValueError: for a measure name that is not listed above (once a date with sufficient data is met).
    """
    ### Importing data-special modules (they are not imported on the notebook level):
    import statsmodels.api as sm
    from scipy import stats as ss
    ### Declaring local constants & variables:
    dict_measure = {}
    ### Defining date index shifting function:
    def date_reindex(iter_group, idx_date_range, return_shift = 0):
        df_iter_wide = iter_group.unstack('Country').reindex(idx_date_range).sort_index().shift(-return_shift)
        ### Back to the long (Date, Country) form keeping empty dates; equivalent of stack('Country', dropna = False)
        ### without the deprecated dropna argument (a Series is returned even for a single date):
        return df_iter_wide.T.unstack()
    ### Defining get_measure group level function:
    def get_measure(df_to_measure, iter_measure):
        ### Checking data sufficiency:
        if (len(df_to_measure.dropna().index) <= 1):
            return np.nan
        ### Measure calculating:
        if (iter_measure == 'ic_spearman'):
            ### Spearman information coefficient:
            df_iter = df_to_measure[['Factor', 'Return']].dropna()
            num_result = ss.spearmanr(df_iter['Factor'].values, df_iter['Return'].values, nan_policy = 'omit').correlation
        elif (iter_measure == 'ic_pearson'):
            ### Pearson information coefficient:
            df_iter = df_to_measure[['Factor', 'Return']].dropna()
            num_result = ss.pearsonr(df_iter['Factor'].values, df_iter['Return'].values)[0]
        elif (iter_measure in ('fmb_eqw', 'fmb_std_eqw')):
            ### Fama-McBeth cross-sectional regression beta coefficient (equal weighted residuals):
            str_factor = 'Factor' if (iter_measure == 'fmb_eqw') else 'Factor_std'
            df_iter = df_to_measure[[str_factor, 'Constant', 'Return']].dropna()
            ols_model = sm.OLS(endog = df_iter['Return'].values, exog = df_iter[[str_factor, 'Constant']].values,
                               missing = 'drop', hasconst = False)
            num_result = ols_model.fit().params[0]
        elif (iter_measure in ('fmb_weighted', 'fmb_std_weighted')):
            ### Fama-McBeth cross-sectional regression beta coefficient (residuals weighted by square root of the weight):
            str_factor = 'Factor' if (iter_measure == 'fmb_weighted') else 'Factor_std'
            df_iter = df_to_measure[[str_factor, 'Constant', 'Return', 'Weight']].dropna()
            wls_model = sm.WLS(endog = df_iter['Return'].values, exog = df_iter[[str_factor, 'Constant']].values,
                               weights = pow(df_iter['Weight'].values, 1/2), missing = 'drop', hasconst = False)
            num_result = wls_model.fit().params[0]
        elif (iter_measure == 'clp'):
            ### Constant leverage portfolio signed normalized multiplication sum:
            df_iter = df_to_measure[['Factor', 'Return', 'Weight']].dropna()
            ser_clp_weighted = df_iter['Factor'] * df_iter['Weight'].transform(np.sqrt)
            ### Negative side is scaled to sum to -1 first, then positive side is scaled to sum to 1:
            ser_clp_weighted.loc[ser_clp_weighted < 0] = -ser_clp_weighted / ser_clp_weighted[ser_clp_weighted < 0].sum()
            ser_clp_weighted.loc[ser_clp_weighted > 0] = ser_clp_weighted / ser_clp_weighted[ser_clp_weighted > 0].sum()
            num_result = (ser_clp_weighted * df_to_measure['Return']).sum()
        else:
            raise ValueError('Unknown efficacy measure: ' + str(iter_measure))
        ### Preparing results:
        return num_result
    ### Adding unit weights, when weights are absent:
    if (ser_weight.count() == 0):
        ser_weight = pd.Series(1, index = ser_factor.index)
    ### Region filter dropping:
    ser_factor = ser_factor.reset_index('Market', drop = True)
    ser_return = ser_return.reset_index('Market', drop = True)
    ser_weight = ser_weight.reset_index('Market', drop = True)
    ### Preparing shifted returns:
    idx_date_range = ser_return.index.get_level_values(0).unique()
    ser_return_shifted = ser_return.groupby('Country', group_keys = False).apply(date_reindex, idx_date_range, return_shift)
    ### Preparing combined vectors for measures calculating:
    df_to_measure = pd.concat([ser_factor, ser_return_shifted, ser_weight], axis = 1)
    df_to_measure.columns = ['Factor', 'Return', 'Weight']
    df_to_measure['Constant'] = 1
    ### group_keys = False keeps the (Date, Country) index of the groups, so the result can be reindexed below:
    ser_factor_std = df_to_measure.dropna()['Factor'].groupby('Date', group_keys = False).apply(ison_standartize, arr_truncate = arr_truncate, within_market = False)
    df_to_measure['Factor_std'] = ser_factor_std.reindex(df_to_measure.index)
    ### Looping efficacy measures for calculating measures timeseries:
    for iter_measure in arr_measure:
        dict_measure[iter_measure] = df_to_measure.groupby('Date').apply(get_measure, iter_measure = iter_measure)
    ### Preparing results:
    return pd.concat(dict_measure, axis = 1)

In [9]:
### DEFINING MEASURE STATISTICS CALCULATOR

def measure_stats(df_measures, arr_back_period = [99]):
    """
    Descriptive statistics of measure time series over trailing windows.

    Parameters:
        df_measures: pd.DataFrame with a date index and one measure per column.
        arr_back_period: iterable of window lengths in years; every window is the grid of
            iter_back_period * 12 business month ends finishing at the last non-NaN date of the measure.

    Returns:
        pd.DataFrame with the statistics ('count', 'min', 'max', 'mean', 'std', 'median', 'perc_25', 'perc_75',
        'iq_range', 'mean_abs', 't_stat') as rows and (measure, back period) pairs as columns.
        Window dates missing in the measure are ignored; a measure without any value gets count 0 and NaN statistics.
    """
    ### Declaring local constants & variables:
    dict_stats = {}
    ### Stats calculating:
    for iter_measure in df_measures.columns:
        ser_measure = df_measures[iter_measure].dropna()
        dict_period = {}
        for iter_back_period in arr_back_period:
            if (len(ser_measure.index) > 0):
                ### Offset object is used instead of the 'BM' alias, which newer pandas versions deprecate:
                idx_iter_range = pd.date_range(end = ser_measure.index[-1], periods = iter_back_period * 12, freq = pd.offsets.BMonthEnd())
                ### Reindexing instead of label selection: window dates absent in the measure become NaN instead of raising KeyError:
                ser_iter_measure = ser_measure.reindex(idx_iter_range)
            else:
                ser_iter_measure = pd.Series(dtype = 'float64')
            num_count = ser_iter_measure.count()
            num_mean = ser_iter_measure.mean()
            num_std = ser_iter_measure.std()
            num_perc_25 = ser_iter_measure.quantile(0.25, interpolation = 'midpoint')
            num_perc_75 = ser_iter_measure.quantile(0.75, interpolation = 'midpoint')
            dict_period[iter_back_period] = pd.Series({
                'count' : num_count,
                'min' : ser_iter_measure.min(),
                'max' : ser_iter_measure.max(),
                'mean' : num_mean,
                'std' : num_std,
                'median' : ser_iter_measure.median(),
                'perc_25' : num_perc_25,
                'perc_75' : num_perc_75,
                'iq_range' : num_perc_75 - num_perc_25,
                'mean_abs' : ser_iter_measure.abs().mean(),
                't_stat' : (num_mean / num_std) * np.sqrt(num_count)
            })
        dict_stats[iter_measure] = pd.concat(dict_period, axis = 1)
    ### Preparing results:
    return pd.concat(dict_stats, axis = 1)

In [10]:
### DEFINING SPECIAL CLP STATS

def special_clp_stats(ser_factor, ser_return, ser_weight, return_shift = 0):
    """
    Country level statistics of the constant leverage portfolio (CLP) built from a factor.

    On every date the CLP weights are factor * sqrt(weight), with the negative side scaled to sum to -1
    and the positive side scaled to sum to 1.

    Parameters:
        ser_factor, ser_return, ser_weight: pd.Series with 'Date', 'Country' and 'Market' index levels
            (the dates are read from index level 0); the 'Market' level is dropped here.
            Unit weights are used when ser_weight has no non-NaN values.
        return_shift: number of periods the returns are moved back in time inside each country.

    Returns:
        Tuple (df_clp_stats, df_clp_bias):
        df_clp_stats - per country 'Average Bias', 'Weights Sum', 'Average Return', 'Static Contribution',
            'Total Contribution', 'Dynamic Contribution', followed by the rows 'Sum' and
            'Expected based on active weights =>';
        df_clp_bias - per country share of dates on which its CLP weight fell into each by-date quintile
            ('Q1' ... 'Q5') and the difference 'Q5 - Q1'.
    """
    ### Declaring local constants & variables:
    dict_clp_stats = {}
    list_bin_labels = ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']
    ### Defining date index shifting function:
    def date_reindex(iter_group, idx_date_range, return_shift = 0):
        df_iter_wide = iter_group.unstack('Country').reindex(idx_date_range).sort_index().shift(-return_shift)
        ### Back to the long (Date, Country) form keeping empty dates; equivalent of stack('Country', dropna = False)
        ### without the deprecated dropna argument (a Series is returned even for a single date):
        return df_iter_wide.T.unstack()
    ### Defining function for constant leverage portfolio normalizing:
    def get_normalized_clp(df_to_measure):
        ser_clp_weighted = df_to_measure['Factor'] * df_to_measure['Weight'].transform(np.sqrt)
        ### Checking data sufficiency:
        if (ser_clp_weighted.count() > 0):
            ### Constant leverage portfolio signed normalized (negative side first, then positive side):
            ser_clp_weighted.loc[ser_clp_weighted < 0] = -ser_clp_weighted / ser_clp_weighted[ser_clp_weighted < 0].sum()
            ser_clp_weighted.loc[ser_clp_weighted > 0] = ser_clp_weighted / ser_clp_weighted[ser_clp_weighted > 0].sum()
            ser_result = ser_clp_weighted.copy()
        else:
            ser_result = pd.Series(np.nan, index = ser_clp_weighted.index)
        ### Results output:
        return ser_result
    ### Defining function for factor normalizing:
    def get_normalized_factor(df_to_measure):
        ser_clp_weighted = df_to_measure['Factor'] * df_to_measure['Weight'].transform(np.sqrt)
        ### Checking data sufficiency:
        if (ser_clp_weighted.count() > 0):
            ### Factor signed normalized (working on a copy, so the group data are not modified):
            ser_factor_normalized = df_to_measure['Factor'].copy()
            ser_factor_normalized.loc[ser_factor_normalized < 0] = -ser_factor_normalized / ser_factor_normalized[ser_factor_normalized < 0].sum()
            ser_factor_normalized.loc[ser_factor_normalized > 0] = ser_factor_normalized / ser_factor_normalized[ser_factor_normalized > 0].sum()
            ser_result = ser_factor_normalized
        else:
            ser_result = pd.Series(np.nan, index = ser_clp_weighted.index)
        ### Results output:
        return ser_result
    ### Defining function for returns for constant leverage portfolio:
    def get_normalized_return(df_to_measure):
        ser_clp_weighted = df_to_measure['Factor'] * df_to_measure['Weight'].transform(np.sqrt)
        ### Checking data sufficiency:
        if (ser_clp_weighted.count() > 0):
            ### Returns are passed through unchanged:
            ser_result = df_to_measure['Return']
        else:
            ser_result = pd.Series(np.nan, index = ser_clp_weighted.index)
        ### Results output:
        return ser_result
    ### Defining MatLab style prctile function:
    def prctile(ser_to_perc, p):
        ### Sorted list preparing:
        list_to_perc = ser_to_perc.dropna().values
        list_sorted = np.sort(list_to_perc)
        ### Length calculating:
        num_len = len(list_to_perc)
        ### Prctile calculating (sorted values are placed at the probabilities (k - 0.5) / n and linearly interpolated):
        num_result = np.interp(np.array(p), np.linspace(1 / (2 * num_len), (2 * num_len - 1) / (2 * num_len), num_len), list_sorted)
        ### Results output:
        return num_result
    ### Defining quintile bins distribution:
    def quintile_distribution(ser_iter_group):
        ### Bins preparing (inner edges at 20%, 40%, 60%, 80%; outer edges are pushed beyond the data range):
        list_bin = list(np.arange(0, 100, 20))[1 : ]
        list_bin = [round(iter_element / 100, 2) for iter_element in list_bin]
        list_bin = [prctile(ser_iter_group, iter_element) for iter_element in list_bin]
        list_bin = [ser_iter_group.min() - abs(ser_iter_group.min())] + list_bin + [ser_iter_group.max() + abs(ser_iter_group.max())]
        ### Bins distribution:
        ser_iter_distribution = pd.cut(ser_iter_group, bins = list_bin, labels = list_bin_labels)
        ### Results output:
        return ser_iter_distribution
    ### Adding unit weights, when weights are absent:
    if (ser_weight.count() == 0):
        ser_weight = pd.Series(1, index = ser_factor.index)
    ### Region filter dropping:
    ser_factor = ser_factor.reset_index('Market', drop = True)
    ser_return = ser_return.reset_index('Market', drop = True)
    ser_weight = ser_weight.reset_index('Market', drop = True)
    ### Preparing shifted returns:
    idx_date_range = ser_return.index.get_level_values(0).unique()
    ser_return_shifted = ser_return.groupby('Country', group_keys = False).apply(date_reindex, idx_date_range, return_shift)
    ### Preparing combined vectors for measures calculating:
    df_to_measure = pd.concat([ser_factor, ser_return_shifted, ser_weight], axis = 1)
    df_to_measure.columns = ['Factor', 'Return', 'Weight']
    df_to_measure_clean = df_to_measure.dropna()
    ser_clp_normalized = df_to_measure_clean.groupby('Date', group_keys = False).apply(get_normalized_clp)
    ser_factor_normalized = df_to_measure_clean.groupby('Date', group_keys = False).apply(get_normalized_factor)
    ser_return_normalized = df_to_measure_clean.groupby('Date', group_keys = False).apply(get_normalized_return)
    ### CLP stats calculating:
    dict_clp_stats['Average Bias'] = ser_factor_normalized.groupby('Country').mean()
    dict_clp_stats['Weights Sum'] = ser_clp_normalized.groupby('Country').sum()
    dict_clp_stats['Average Return'] = ser_return_normalized.groupby('Country').mean()
    dict_clp_stats['Static Contribution'] = dict_clp_stats['Weights Sum'] * dict_clp_stats['Average Return']
    dict_clp_stats['Total Contribution'] = (ser_clp_normalized * ser_return_normalized).groupby('Country').sum()
    dict_clp_stats['Dynamic Contribution'] = dict_clp_stats['Total Contribution'] - dict_clp_stats['Static Contribution']
    ### CLP active weights calculating (weights are expanded to the full Date x Country grid, gaps are zero weights):
    ser_clp_delta = ser_clp_normalized.unstack('Date').T.unstack().swaplevel().sort_index(level = ['Date', 'Country'])
    ser_clp_delta = ser_clp_delta.fillna(0)
    num_clp_mean = ser_clp_delta.groupby('Country').mean().abs().sum()
    ### Deviation of each weight from the country average weight (transform keeps the original index):
    ser_clp_delta = ser_clp_delta - ser_clp_delta.groupby('Country').transform('mean')
    ser_clp_delta_abs = ser_clp_delta.abs().groupby('Date').sum()
    num_clp_delta = (ser_clp_delta_abs / (ser_clp_delta_abs + num_clp_mean)).mean()
    df_clp_stats = pd.concat(dict_clp_stats, axis = 1).reindex(df_to_measure.index.get_level_values('Country').unique()).sort_index()
    ### Preparing sum:
    num_total_sum = df_clp_stats['Total Contribution'].sum()
    df_clp_sum = pd.DataFrame([[np.nan, np.nan, np.nan, df_clp_stats['Static Contribution'].sum(),
                               num_total_sum, df_clp_stats['Dynamic Contribution'].sum()]],
                              index = ['Sum'], columns = df_clp_stats.columns)
    ### Preparing expected based on active weights:
    df_clp_expected = pd.DataFrame([[np.nan, np.nan, np.nan,
                                     num_total_sum * (1 - num_clp_delta), np.nan, num_total_sum * num_clp_delta]],
                              index = ['Expected based on active weights =>'], columns = df_clp_stats.columns)
    ### Adding totals:
    df_clp_stats = pd.concat([df_clp_stats, df_clp_sum, df_clp_expected], axis = 0, join = 'inner')
    ### CLP Bias calculating:
    ser_clp_quintile = ser_clp_normalized.groupby('Date', group_keys = False).apply(quintile_distribution)
    df_clp_bias = ser_clp_quintile.to_frame()
    df_clp_bias.columns = ['Bin']
    df_clp_bias['Quintile'] = 1
    df_clp_bias = df_clp_bias.set_index('Bin', append = True).unstack('Bin').fillna(0).droplevel(level = 0, axis = 1)
    ### Plain labels instead of categorical ones are needed to add the 'Q5 - Q1' column below:
    df_clp_bias.columns = list(df_clp_bias.columns)
    df_clp_bias = df_clp_bias[list_bin_labels]
    df_clp_bias = df_clp_bias.groupby('Country').mean()
    df_clp_bias.loc[:, 'Q5 - Q1'] = df_clp_bias['Q5'] - df_clp_bias['Q1']
    df_clp_bias = df_clp_bias.reindex(df_to_measure.index.get_level_values('Country').unique()).sort_index()
    ### Output results:
    return (df_clp_stats, df_clp_bias)

In [11]:
### DEFINING SINGLE EFFICACY MEASURE FOR MULTIPLE FACTORS
    
def multiple_factor_single_efficacy_measure_stats(df_factors, ser_return, ser_weight, str_measure, num_back_period = 99, 
                                                  num_horizon = 12, list_region_xmo = ['DM', 'EM', 'FM'], arr_truncate = None): 
    """
    Mean and t-statistic of one efficacy measure at several return horizons, plus factor autocorrelation, for many factors.

    Parameters:
        df_factors: pd.DataFrame with one factor per column and index levels ordered as Date, Country, Market.
        ser_return, ser_weight: pd.Series passed to single_factor_multiple_efficacy_measures.
        str_measure: name of the efficacy measure to calculate.
        num_back_period: trailing window in years passed to measure_stats.
        num_horizon: number of return shifts to evaluate (shifts 0 ... num_horizon - 1, reported as 1 ... num_horizon).
        list_region_xmo: markets to keep in the factor data.
        arr_truncate: truncation bounds for the factor standartization; when None, the notebook level
            list_truncate is used if it is defined (previous behaviour), otherwise [2.5, 2.0].

    Returns:
        Tuple (df_factors_result, df_factors_measures_stats):
        df_factors_result - per factor 'autocorr', 'coeff_<month>' and 't_<month>' columns for the months
            1, 2, 3, 6, 9, 12 that do not exceed num_horizon;
        df_factors_measures_stats - 'mean' and 't_stat' rows per factor for every horizon.
    """
    ### Declaring local constants & variables:
    All = slice(None)
    ### Only the horizons that are actually calculated can be reported:
    list_months = [iter_month for iter_month in [1, 2, 3, 6, 9, 12] if (iter_month <= num_horizon)]
    ### Truncate boundaries resolving:
    if (arr_truncate is None):
        try:
            arr_truncate = list_truncate
        except NameError:
            arr_truncate = [2.5, 2.0]
    ### Defining full universe expanding for date:
    def universe_reindex(iter_group, idx_universe):
        ### NOTE(review): the dropna argument of stack is deprecated in recent pandas versions - confirm before upgrading.
        df_iter_result = iter_group.unstack('Date').reindex(idx_universe).sort_index().stack('Date', dropna = False)
        ### Results output:
        return df_iter_result   
    ### Defining date index shifting function:
    def date_reindex(iter_group, idx_date_range, return_shift = 0):
        df_iter_wide = iter_group.unstack('Country').reindex(idx_date_range).sort_index().shift(-return_shift)
        ### Back to the long (Date, Country) form keeping empty dates; equivalent of stack('Country', dropna = False)
        ### without the deprecated dropna argument (a Series is returned even for a single date):
        return df_iter_wide.T.unstack()
    ### Defining by date correlation function:
    def corr_by_date(iter_group):
        num_iter_corr = iter_group['Corr_factor_minus'].corr(iter_group['Corr_factor_plus'])
        ### Results output:
        return num_iter_corr       
    ### Preparing expanded universe for autocorrelation performing:
    df_factors_region = df_factors.loc[(All, All, list_region_xmo), :]
    idx_date_range = df_factors_region.index.get_level_values(0).unique()
    idx_universe = df_factors_region.index.get_level_values(1).unique()
    df_factors_full = df_factors_region.reset_index('Market', drop = True).groupby('Date', group_keys = False).apply(universe_reindex, idx_universe).swaplevel()
    ### Factors looping:
    dict_factors_measures = {} ### Container for all factor stats
    dict_factors_autocorr = {} ### Container for autocorrelation results
    for iter_factor in df_factors.columns:
        ### Shifts looping for factors measures stats:
        ser_factor_region = df_factors_region[iter_factor]
        dict_factor_stats = {} ### Container for iterated factor stats
        for iter_shift in range(num_horizon):
            df_iter_shift_measure = single_factor_multiple_efficacy_measures(ser_factor_region, ser_return, ser_weight, [str_measure], iter_shift, arr_truncate)
            df_iter_shift_stats = measure_stats(df_iter_shift_measure, [num_back_period])
            dict_factor_stats[iter_shift] = df_iter_shift_stats.loc[['mean', 't_stat'], (str_measure, num_back_period)]
        df_iter_factor_stats = pd.concat(dict_factor_stats, axis = 1)
        ### Zero-based shifts are reported as one-based horizons:
        df_iter_factor_stats.columns = df_iter_factor_stats.columns + 1
        dict_factors_measures[iter_factor] = df_iter_factor_stats
        ### Autocorrelation calculation (each country series without its first / last observation):
        ser_iter_factor = df_factors_full[iter_factor]
        ser_iter_factor_plus = ser_iter_factor.groupby('Country', group_keys = False).apply(lambda iter_group: iter_group.iloc[1 : ]).\
                                sort_index(level = ['Date', 'Country'])
        ser_iter_factor_minus = ser_iter_factor.groupby('Country', group_keys = False).apply(lambda iter_group: iter_group.iloc[: -1]).\
                                sort_index(level = ['Date', 'Country'])
        ### Artificial series combining for indexes synchronization:        
        ser_iter_factor_plus_shifted = ser_iter_factor_plus.groupby('Country', group_keys = False).apply(date_reindex, idx_date_range, 1)
        df_iter_factor_to_corr = pd.concat([ser_iter_factor_minus, ser_iter_factor_plus_shifted], axis = 1)
        df_iter_factor_to_corr.columns = ['Corr_factor_minus', 'Corr_factor_plus']
        dict_factors_autocorr[iter_factor] = pd.Series(df_iter_factor_to_corr.groupby('Date').apply(corr_by_date).mean(), index = ['autocorr'])
    ### Results output:
    df_factors_measures_stats = pd.concat(dict_factors_measures, axis = 0)
    df_factors_autocorr =  pd.concat(dict_factors_autocorr, axis = 1).transpose()
    df_factors_coeff = df_factors_measures_stats.loc[(All, 'mean'), list_months].reset_index(1, drop = True)
    df_factors_coeff.columns = [('coeff_' + str(iter_column)) for iter_column in df_factors_coeff.columns]
    df_factors_t_stat = df_factors_measures_stats.loc[(All, 't_stat'), list_months].reset_index(1, drop = True)
    df_factors_t_stat.columns = [('t_' + str(iter_column)) for iter_column in df_factors_t_stat.columns]
    df_factors_result = pd.concat([df_factors_autocorr, df_factors_coeff, df_factors_t_stat], axis = 1)    
    return (df_factors_result, df_factors_measures_stats)

In [12]:
### DEFINING EXTRACTION UNIVERSE DATA FROM GENERAL MS EXCEL SOURCE

def get_market_membership_from_excel():
    """
    Read the market membership of countries by date from the universe workbook.

    The sheet 'universe_joined' of 'Data_Files/Source_Files/sample_universe.xlsx' is read, its 'dates', 'region'
    and 'ctry' columns are kept, and the region codes 50 / 57 / 504 are replaced by 'DM' / 'EM' / 'FM'.

    Returns:
        pd.Series of market names indexed by (Date, Country), sorted by index and limited to the
        'DM', 'FM' and 'EM' markets.
    """
    ### Declaring local constants & variables:
    str_source_path = 'Data_Files/Source_Files/sample_universe.xlsx' ### Path for membership source
    str_source_tab = 'universe_joined'
    list_markets_kept = ['DM', 'FM', 'EM']
    dict_code_to_market = {50 : 'DM', 57 : 'EM', 504 : 'FM'}
    list_na_markers = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND',
                       '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null']
    ### Extracting universe data:
    df_source = pd.read_excel(io = str_source_path, sheet_name = str_source_tab, skiprows = [0, 2], header = 0,
                              parse_dates = True, na_values = list_na_markers, keep_default_na = False)
    ### Keeping and renaming the needed columns:
    df_universe = df_source.loc[slice(None), ['dates', 'region', 'ctry']]
    df_universe.columns = ['Date', 'Market', 'Country']
    ### Converting to the sorted series of market names:
    ser_universe = df_universe.set_index(['Date', 'Country']).squeeze()
    ser_universe = ser_universe.sort_index(level = [0, 1])
    ser_universe = ser_universe.replace(dict_code_to_market)
    ### Results output:
    return ser_universe[ser_universe.isin(list_markets_kept)]

In [19]:
### INITIALIZATION

### Declaring global constants & variables:
### (HDF5 source files and the keys of the objects stored inside them)
path_cds_result_hdf = 'Data_Files/Source_Files/CDS_Result.h5' ### Path to blended CDS index
key_cds_result = 'cds_blended' ### Blended CDS object
path_market_cap = 'Data_Files/Source_Files/Market_Cap.h5'
key_market_cap = 'mcap'
path_return = 'Data_Files/Source_Files/Returns_Integrated.h5'
key_return = 'returns'
path_growth_potential_hdf = 'Data_Files/Source_Files/OD_Growth_Potential.h5'
key_growth_potential = 'growth_potential'
float_zero_tolerance = 0.001 ### For float comparison with 0
All = slice(None) ### No-slice for MultiIndex level for using in loc method
tup_plt_size = (20, 5) ### Plot sizes
list_truncate = [2.5, 2.0] ### Truncate boundaries for standartization
list_back_period = [99, 10, 5]
### Using transitional results:
### NOTE(review): key_return is assigned a second time below with the same value 'returns'.
path_transitional_results = 'Data_Files/Test_Files/Factor_transitionals.h5'
key_factors = 'factors'
key_return = 'returns'
key_mcap = 'mcaps'
### Reading the transitional factors, returns and market caps from the transitional results file:
df_factor_filtered = pd.read_hdf(path_transitional_results, key_factors)
ser_return_filtered = pd.read_hdf(path_transitional_results, key_return)
ser_mcap_filtered = pd.read_hdf(path_transitional_results, key_mcap)
### CDS with ISON and market caps and earlier calculated data reading:
### NOTE(review): the loaders below are commented out, so this cell does not define ser_return, ser_mcap
### and df_growth_potential, while the following preparation cells read these names - on a fresh kernel
### those cells fail unless the lines are restored.
#ser_cds_res = pd.read_hdf(path_cds_result_hdf, key_cds_result)
#ser_market_membership = get_market_membership_from_excel()
#ser_mcap = pd.read_hdf(path_market_cap, key_market_cap)
#ser_return = pd.read_hdf(path_return, key_return)
#df_growth_potential = pd.read_hdf(path_growth_potential_hdf, key_growth_potential).iloc[All, : 6]

In [None]:
### COMPARING WITH MATLAB: EXCEL FILES PREPARING

### NOTE(review): ser_return, ser_mcap and df_growth_potential are expected from the loaders that are
### commented out in the initialization cell - restore them before running this cell.
### Region names and the numeric codes used for them in the exported files:
list_ison_region = ['DM', 'EM', 'FM']
list_ison_code = [50, 57, 504]
### Defining region names to numeric codes converter:
def market_to_code(data_ison):
    """Return data_ison with the region names in its 'Market' index level replaced by their numeric codes."""
    return data_ison.reset_index('Market').replace(list_ison_region, list_ison_code).set_index('Market', append = True)
### Returns shifting & saving (each country gets its next period return):
ser_return_shifted = ser_return.groupby('Country').shift(periods = -1)
ser_return_shifted.name = 'Return_shifted'
ser_return_shifted_ison = market_to_code(ser_return_shifted.loc[All, All, list_ison_region])
#ser_return_shifted_ison.to_excel('Data_Files/Test_Files/Example_Returns.xlsx')
### Market Caps saving:
ser_mcap_ison = market_to_code(ser_mcap.loc[All, All, list_ison_region])
#ser_mcap_ison.to_excel('Data_Files/Test_Files/Example_Market_Caps.xlsx')
### Factors saving:
df_growth_potential_ison = market_to_code(df_growth_potential.loc[(All, All, list_ison_region), All])
#df_growth_potential_ison.to_excel('Data_Files/Test_Files/Example_OD_Growth_Potential.xlsx')

In [None]:
### DATA PREPARING

### NOTE(review): this cell needs ser_return, ser_mcap and df_growth_potential, whose loading
### is commented out in the initialization cell -- confirm they are defined before running.

### Parameters:
list_region = ['DM', 'EM', 'FM'] ### Regions list (alternatives: ['DM'] or ['DM', 'EM'])
list_countries_to_exclude = ['VE'] ### Countries not to play the game (alternative: ['GR', 'UA', 'VE'])
bool_within_market = True ### Standartization way
str_date_start = '2007-01-01' ### Start date to filter returns, market caps and factors
### 999 business month ends counted from the start date:
idx_date_range = pd.date_range(start = str_date_start, periods = 999, freq = 'BM')
### Returns: shifting inside each country -> region and date selection -> excluded countries dropping:
ser_return_prepared = (
    ser_return.groupby('Country').shift(periods = -1)
    .loc[idx_date_range, All, list_region]
    .drop(list_countries_to_exclude, level = 'Country')
)
### Market caps: region and date selection -> excluded countries dropping:
ser_mcap_prepared = (
    ser_mcap.loc[idx_date_range, All, list_region]
    .drop(list_countries_to_exclude, level = 'Country')
)
### Factors: region and date selection -> excluded countries dropping:
df_factor_prepared = (
    df_growth_potential.loc[(idx_date_range, All, list_region), All]
    .drop(list_countries_to_exclude, level = 'Country')
)

In [15]:
### MULTI FACTORS STANDARTIZING

### Standartizing the prepared factors with the global truncate boundaries;
### bool_within_market is forwarded as the within_market option:
df_factor_std = multi_factor_standartize(
    df_factor_prepared,
    list_truncate,
    within_market = bool_within_market
)

In [16]:
### COMPARING WITH MATLAB: MULTI FACTORS STANDARTIZING

### Exporting the standartized 'Education' factor: 'Market' level dropped, dates moved to columns:
(
    df_factor_std['Education']
    .reset_index('Market', drop = True)
    .unstack('Date')
    .to_excel('Data_Files/Test_Files/Test_PY_education_std.xlsx')
)

In [17]:
### STANDARTIZED FACTORS FILTERING

### Regions to keep (alternative: ['DM', 'EM', 'FM']):
list_region_filter = ['DM']
### Keeping only the rows of the chosen regions (third index level) in factors, returns and market caps.
### The results replace the objects of the same names loaded in the initialization cell:
df_factor_filtered = df_factor_std.loc[(All, All, list_region_filter), All]
ser_return_filtered = ser_return_prepared.loc[All, All, list_region_filter]
ser_mcap_filtered = ser_mcap_prepared.loc[All, All, list_region_filter]

In [31]:
### COMPARING WITH MATLAB: MULTI FACTORS FILTERING

### Exporting the filtered 'High-Tech Export License Fee' factor:
### 'Market' level dropped, dates moved to columns, rows sorted by country:
(
    df_factor_filtered['High-Tech Export License Fee']
    .reset_index('Market', drop = True)
    .unstack('Date')
    .sort_index(level = 'Country')
    .to_excel('Data_Files/Test_Files/Test_PY_high_tech_filtered.xlsx')
)

In [18]:
### SAVING TRANSITIONAL RESULTS

#df_factor_filtered.to_hdf(path_transitional_results, key_factors, 'w')
#ser_return_filtered.to_hdf(path_transitional_results, key_return)
#ser_mcap_filtered.to_hdf(path_transitional_results, key_mcap)

In [None]:
###########################################################################################################################################################################

In [15]:
### ADDITIONAL FILTERING FOR TESTING PURPOSES

### Regions to keep for the test:
list_region_filter = ['EM', 'FM']
### Narrowing the already filtered factors, returns and market caps to these regions (third index level).
### NOTE(review): each object is re-filtered from itself, so the result depends on what the
### *_filtered objects held before this cell -- confirm they still contain the 'EM' and 'FM' rows.
df_factor_filtered = df_factor_filtered.loc[(All, All, list_region_filter), All]
ser_return_filtered = ser_return_filtered.loc[All, All, list_region_filter]
ser_mcap_filtered = ser_mcap_filtered.loc[All, All, list_region_filter]

In [16]:
### COMPARING WITH MATLAB: PERFORMANCE TEST

### Measures to calculate:
arr_measure = [
    'ic_pearson', 'ic_spearman',
    'fmb_eqw', 'fmb_weighted',
    'fmb_std_eqw', 'fmb_std_weighted',
    'clp'
]
### Factors taking part in the multiple factors comparison:
list_factor_compare = ['Education', 'High-Tech Export License Fee', 'Human Resource Potential']

### Measures comparing ('Education' factor against filtered returns and market caps):
df_test_perf_measure = single_factor_multiple_efficacy_measures(
    df_factor_filtered['Education'], ser_return_filtered, ser_mcap_filtered,
    arr_measure, return_shift = 0, arr_truncate = list_truncate
)
df_test_perf_measure.to_excel('Data_Files/Test_Files/Test_PY_edu_measures.xlsx')

### Measure stats comparing (statistics of the measures above for the back periods list):
df_test_perf_stats = measure_stats(df_test_perf_measure, list_back_period)
df_test_perf_stats.to_excel('Data_Files/Test_Files/Test_PY_edu_stats.xlsx')

### CLP tables comparing (the function returns a pair: stats table and bias table):
df_test_clp_stats, df_test_clp_bias = special_clp_stats(
    df_factor_filtered['Education'], ser_return_filtered, ser_mcap_filtered, return_shift = 0
)
df_test_clp_stats.to_excel('Data_Files/Test_Files/Test_PY_edu_clp_stats.xlsx')
df_test_clp_bias.to_excel('Data_Files/Test_Files/Test_PY_edu_clp_bias.xlsx')

### Multiple factors comparing:
df_factors_result, df_factors_measures_stats = multiple_factor_single_efficacy_measure_stats(
    df_factor_filtered[list_factor_compare],
    ser_return_filtered, ser_mcap_filtered, 'fmb_std_weighted', list_back_period[0], 12
)
df_factors_result.to_excel('Data_Files/Test_Files/Test_PY_multiple_result.xlsx')
df_factors_measures_stats.to_excel('Data_Files/Test_Files/Test_PY_multiple_fmb_std.xlsx')

### Multiple xmo factors comparing (same call with an extra regions list as the last argument;
### the two result names from the previous step are reused, so their earlier content is replaced):
list_region_xmo = ['FM'] ### Regions filter
df_factors_result, df_factors_measures_stats = multiple_factor_single_efficacy_measure_stats(
    df_factor_filtered[list_factor_compare],
    ser_return_filtered, ser_mcap_filtered, 'fmb_std_weighted', list_back_period[0], 12, list_region_xmo
)
df_factors_result.to_excel('Data_Files/Test_Files/Test_PY_multiple_xmo_result.xlsx')
df_factors_measures_stats.to_excel('Data_Files/Test_Files/Test_PY_multiple_xmo_fmb_std.xlsx')

In [17]:
### CONTROL TEST



Date        Country  Market
2007-01-31  AE       FM        0.023936
2007-02-28  AE       FM       -0.062659
2007-03-30  AE       FM        0.046285
2007-04-30  AE       FM        0.170291
2007-05-31  AE       FM       -0.022307
                                 ...   
2019-05-31  ZM       FM             NaN
2019-06-28  ZM       FM             NaN
2019-07-31  ZM       FM             NaN
2019-08-30  ZM       FM             NaN
2019-09-30  ZM       FM             NaN
Name: Return, Length: 12659, dtype: float64

In [19]:
### TEMP 



Unnamed: 0_level_0,Unnamed: 1_level_0,Market,Return
Date,Country,Unnamed: 2_level_1,Unnamed: 3_level_1


In [20]:
### TEMP



In [21]:
### TEMP



In [64]:
### TEMP

