In [1]:
### THA STANDARTIZE PLAYGROUND ACADIAN MODE

In [2]:
### MODULES IMPORT (PART OF THE PRODUCT CODE)

import pandas as pd
import numpy as np
from datetime import date, datetime
import math
import matplotlib.pyplot as plt

In [3]:
## VERSION CONTROL

### Print the pandas and Python interpreter versions, so that saved results can be tied to a concrete environment:
from platform import python_version
print('pandas version: ', pd.__version__)
print('python version: ', python_version())

pandas version:  0.25.3
python version:  3.7.4


In [4]:
### INTERNAL PARAMETERS INITIALIZATION (TO BE IGNORED IN PRODUCT CODE)

import os ### To work with csv files

### Universe path:
str_path_universe = 'Data_Files/Source_Files/acadian_universe.xlsx'
### PRS data:
str_path_prs_hdf = 'Data_Files/Source_Files/Country_Risks/PRS_loaded.h5'
str_key_prs_pillars_only_converted = 'prs_pillars_only_converted'
### Continuum data:
str_path_continuum_hdf = 'Data_Files/Source_Files/Country_Risks/Continuum_loaded.h5'
str_key_continuum_politics_converted = 'continuum_politics_converted'
### Test Sources weights:
str_path_weights_xlsx = 'Data_Files/Test_Files/Test_Weights.xlsx'
### General daily-mode ranges parameters:
str_source_date_start = '1992-01-01' ### Start date for source vectors
str_measure_date_start = '1996-08-01' ### Start date for efficacy measures
str_ison_date_start = '1994-01-31' ### Start date for ISON Universe
str_measure_date_end = '2020-08-31' ### End date for efficacy measures
### Business-month-end ranges are built from the BMonthEnd offset object instead of the 'BM' string alias:
### the alias is deprecated in recent pandas versions, while the offset object produces the same index in every version.
idx_source_date_range = pd.date_range(str_source_date_start, str_measure_date_end, freq = 'B') ### Business-daily range for source data filtering
idx_test_monthly_date_range = pd.date_range(str_ison_date_start, str_measure_date_end, freq = pd.tseries.offsets.BMonthEnd()) ### Business-month-end range of test dates
idx_test_daily_date_range = pd.date_range(str_ison_date_start, str_measure_date_end, freq = 'B') ### Business-daily range of test dates
idx_factor_date_range = pd.date_range(str_source_date_start, str_measure_date_end, freq = pd.tseries.offsets.BMonthEnd()) ### Business-month-end range for factor data filtering
idx_measure_date_range = pd.date_range(str_measure_date_start, str_measure_date_end, freq = pd.tseries.offsets.BMonthEnd()) ### Business-month-end range for measures calculation
### Results saving:
str_test_factor_full_csv = 'Data_Files/Test_Files/acadian_mode_test_factor_full.csv'
str_test_autocorr_csv = 'Data_Files/Test_Files/acadian_mode_test_autocorr.csv'
str_test_factor_source_csv = 'Data_Files/Test_Files/acadian_mode_test_factor_source.csv'
str_test_factor_agg_csv = 'Data_Files/Test_Files/acadian_mode_test_factor_agg.csv'
str_test_factor_res_xlsx = 'Data_Files/Test_Files/acadian_mode_test_factor_res.xlsx'

In [5]:
### GENERAL PARAMETERS INITIALIZATION (PART OF THE PRODUCT CODE)

### Common constants:
### Full slice, used as the "take everything on this level" placeholder in MultiIndex .loc lookups:
All = slice(None)

### ISON filtering options:
list_ison = ['DM', 'EM', 'FM'] ### Market codes to keep; rows whose market is NaN or outside this list are filtered out
list_countries_to_exclude = ['VE'] ### Country codes dropped from the factor tables before standardization

In [6]:
### DEFINING EXTRACTION UNIVERSE DATA FROM MS EXCEL SOURCE (TO BE IGNORED IN PRODUCT CODE)

def ison_membership_converting(str_path_universe, date_end, bool_daily = False, int_backfill_months = 0):
    """
    Load the universe membership table from an MS Excel file and convert it to a market membership series.

    The first sheet of the workbook is read with its first two columns as index (renamed to 'Country' / 'Date');
    the 'Region' column holds numeric market codes that are translated to 'DM' / 'EM' / 'FM' (0 and missing -> NaN).

    Parameters:
        str_path_universe: path to the Excel workbook;
        date_end: last date up to which each country's membership is forward filled;
        bool_daily: if True, the business-month-end result is expanded to business-daily frequency;
        int_backfill_months: number of business month ends by which the initial members of each market are
            backfilled before the first date that market appears (0 -> no backfilling).

    Returns:
        pd.Series named 'Market' with a ('Date', 'Country') MultiIndex.
    """
    ### Business-month-end offset object (used instead of the 'BM' string alias, which is deprecated in recent pandas versions):
    offset_bus_month_end = pd.tseries.offsets.BMonthEnd()
    ### Defining business-month-end reindexation on country level:
    def country_modify(ser_raw_country, date_end):
        ### Month-start resampling first, so that calendar month-end observations do not leak into the next business month:
        ser_res_country = ser_raw_country.droplevel(0).resample('MS').last().resample(offset_bus_month_end).last()
        range_country = pd.date_range(ser_res_country.index[0], date_end, freq = offset_bus_month_end)
        return ser_res_country.reindex(range_country).ffill()
    ### Markets encoding table (np.nan is used: the np.NaN alias does not exist in NumPy 2.0 and later):
    dict_markets = {50 : 'DM', 57 : 'EM', 504 : 'FM', 0: np.nan}
    ### Loading source file:
    df_raw_universe = pd.read_excel(io = str_path_universe, sheet_name = 0, header = 0, parse_dates = True, index_col = [0, 1],
                                 na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND',
                                             '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null'], keep_default_na = False)
    ### Converting source file:
    df_raw_universe.index.names = ['Country', 'Date']
    ### Missing codes are replaced by 0 on a new series (no in-place modification of a column selected from the frame):
    ser_raw_universe = df_raw_universe['Region'].fillna(0)
    ser_raw_universe.name = 'Market'
    ### By country reindexation and translation:
    ser_res_universe = ser_raw_universe.groupby('Country').apply(country_modify, date_end)
    ser_res_universe.index.names = ['Country', 'Date']
    ser_res_universe = ser_res_universe.replace(dict_markets).reorder_levels([1, 0]).sort_index()
    ### Expanding membership for primary regions members by backfilling:
    if int_backfill_months:
        ### List of regions:
        list_region = list(ser_res_universe.dropna().unique())
        ### Initialising of collection of series with backfilled data for each region:
        list_ison_backfill = []
        ### Regions looping:
        for iter_region in list_region:
            ### Defining start of region date:
            date_first_valid = ser_res_universe.loc[ser_res_universe == iter_region].first_valid_index()[0]
            ### Creating dates index to backfilling (the first valid date itself is excluded):
            idx_date_backfill = pd.date_range(end = date_first_valid, periods = int_backfill_months + 1, freq = offset_bus_month_end)[: -1]
            ### Creating primary countries index to backfilling:
            idx_region_backfill = ser_res_universe.loc[ser_res_universe == iter_region].loc[date_first_valid, All].index.get_level_values('Country')
            ### Creating full index:
            idx_ison_backfill = pd.MultiIndex.from_product([idx_date_backfill, idx_region_backfill])
            ### Series with backfilled data:
            list_ison_backfill.append(pd.Series(iter_region, index = idx_ison_backfill))
        ### Combination of backfilled series and original ISON data (original values take precedence):
        ser_res_universe = ser_res_universe.combine_first(pd.concat(list_ison_backfill, axis = 0)).sort_index()
        ser_res_universe.index.names = ['Date', 'Country']
    ### Converting to daily frequency:
    if bool_daily:
        ser_res_universe = ser_res_universe.reset_index('Country').groupby('Country').resample('B').ffill()['Market'].swaplevel().sort_index()
    ### Results output:
    ser_res_universe.name = 'Market'
    return ser_res_universe

In [7]:
### DEFINING EXPONENTIAL WEIGHT (PART OF THE PRODUCT CODE)

def exp_weight_single(halflife_len = 3, num_element = 0):
    """
    Exponential decay weight of a single element.

    Parameters:
        halflife_len: half-life length (rounded to the nearest integer before use);
        num_element: distance of the element from the most recent one (0 -> weight 1.0).

    Returns:
        Weight equal to 0.5 ** (num_element / round(halflife_len)).
    """
    ### Half-life in whole periods:
    int_halflife = round(halflife_len)
    ### Decay factor of a single period:
    num_decay = math.exp(math.log(0.5) / int_halflife)
    ### Decay accumulated over the requested number of periods:
    return math.exp(num_element * math.log(num_decay))

In [8]:
### DEFINING WEIGHTED AVERAGE (PART OF THE PRODUCT CODE)

def weighted_average(ser_data, ser_weight = False, int_min_count = 0):
    """
    Average of the non-NaN values of a series, optionally weighted.

    Parameters:
        ser_data: pd.Series of values (NaN values are ignored);
        ser_weight: pd.Series of weights addressable by the index labels of ser_data,
            or a bool (default False) to request a simple average;
        int_min_count: the result is NaN unless the number of non-NaN values is strictly greater than this.

    Returns:
        The (weighted) mean, or NaN when there are too few observations or the weights sum to zero.

    Raises:
        KeyError: when ser_weight does not contain the labels of the non-NaN data values.
    """
    ### Checking for data presence:
    if not (ser_data.count() > int_min_count):
        return np.nan
    ### Simple average when no weights dataset is given:
    if isinstance(ser_weight, bool):
        return np.nanmean(ser_data.values)
    ### Data filtering (done once and reused for both weights and values):
    ser_valid = ser_data.dropna()
    ### Weights filtering:
    list_weight = ser_weight[ser_valid.index].values
    ### Checking for weights presence:
    flo_weight_sum = np.nansum(list_weight)
    if not flo_weight_sum:
        return np.nan
    ### Weighted average calculating:
    return np.nansum(ser_valid.values * list_weight) / flo_weight_sum

In [9]:
### DEFINING WEIGHTED AVERAGE FOR DATAFRAME COLUMNS (PART OF THE PRODUCT CODE)

def columns_average(df_series, list_weights = False):
    """
    Row-wise weighted mean of the dataframe columns, ignoring NaN cells.

    Parameters:
        df_series: pd.DataFrame whose columns are to be averaged;
        list_weights: one weight per column, or a bool (default False) to weight all columns equally.

    Returns:
        pd.Series indexed like df_series; a row with no valid cells (zero total weight) yields NaN.
    """
    ### Equal weights when no weights are given:
    if isinstance(list_weights, bool):
        list_weights = [1] * len(df_series.columns)
    ### Every row gets the same weights vector:
    df_full_weights = pd.DataFrame([list_weights] * len(df_series.index), index = df_series.index, columns = df_series.columns)
    ### Weights of missing cells are zeroed, so they do not take part in the denominator:
    df_active_weights = df_full_weights.mask(df_series.isna(), 0)
    ### Weighted sum of the values and sum of the active weights for each row:
    ser_numerator = (df_series * df_active_weights).sum(axis = 1)
    ser_denominator = df_active_weights.sum(axis = 1)
    ### Results output:
    return ser_numerator / ser_denominator

In [10]:
### DEFINING ACADIAN STYLE TWO-STEP FACTOR VECTOR STANDARTIZATION FOR REGION WITHIN CROSS-SECTION "SINGLE DATE / SINGLE REGION" PDF VERSION (TO BE IGNORED IN PRODUCT CODE)

def td_two_stage_standardize(ser_factor):
    """
    Two-stage standardization (z-scoring with winsorization) of a factor vector.

    Stage 1: z-score the vector and truncate the scores to +/-2.5.
    Stage 2 (only when stage 1 truncated something): the values under the limit are z-scored again among themselves
    and truncated to +/-2.0, the stage 1 truncated values keep +/-2.5, and the combined vector is demeaned.

    Parameters:
        ser_factor: pd.Series of raw factor values (NaN values are ignored in the statistics and stay NaN).

    Returns:
        pd.Series of scores indexed like ser_factor; a constant vector gives zeros. The input is not modified.
    """
    ### Limits definition:
    flo_trunc_limit_1 = 2.5
    flo_trunc_limit_2 = 2.0
    ### Preliminary statistics calculation:
    flo_std = np.nanstd(ser_factor, axis = 0, ddof = 1)
    flo_mean = np.nanmean(ser_factor, axis = 0)
    ### Constant vector checking is done before z-scoring, so a zero deviation never reaches the division:
    if np.isclose(flo_std, 0.0):
        ser_score = ser_factor - ser_factor
    else:
        ### Preliminary z-scoring:
        ser_score = (ser_factor - flo_mean) / flo_std
    ### First winsorization step (NaN values are left untouched by clipping):
    ser_score = ser_score.clip(lower = -flo_trunc_limit_1, upper = flo_trunc_limit_1)
    ### First limit presence marker:
    ser_on_limit = (ser_score.abs() == flo_trunc_limit_1)
    ### Check if first step did some truncation:
    if ser_on_limit.any():
        ### Separating truncated values (everything else, NaN included, is zeroed in this vector):
        ser_score_trunc_1 = ser_score.where(ser_on_limit, 0.0)
        ### Dropping truncated values to perform further transformations with under the limit values only
        ### (np.nan semantics; the np.NaN alias does not exist in NumPy 2.0 and later):
        ser_score = ser_score.mask(ser_on_limit)
        ### Repeated statistics calculation:
        flo_std = np.nanstd(ser_score, axis = 0, ddof = 1)
        flo_mean = np.nanmean(ser_score, axis = 0)
        ### Constant vector checking:
        if np.isclose(flo_std, 0.0):
            ser_score = ser_score - ser_score
        else:
            ### Second z-scoring:
            ser_score = (ser_score - flo_mean) / flo_std
        ### Second winsorization step (truncated positions are still NaN here and are not affected):
        ser_score = ser_score.clip(lower = -flo_trunc_limit_2, upper = flo_trunc_limit_2)
        ### Vectors union: truncated positions are zeroed and then receive their stage 1 values:
        ser_score = ser_score.mask(ser_on_limit, 0.0) + ser_score_trunc_1
        ### Final demean:
        ser_score = ser_score - np.nanmean(ser_score, axis = 0)
    ### Results output:
    return ser_score

In [11]:
### DEFINING ACADIAN STYLE TWO-STEP FACTOR VECTOR STANDARTIZATION FOR REGION WITHIN CROSS-SECTION "SINGLE DATE / SINGLE REGION" EXTENDED (TO BE IGNORED IN PRODUCT CODE)

def td_two_stage_standardize_extended(ser_factor):
    """
    Two-stage standardization of a factor vector that also returns the statistics used on each stage.

    Stage 1: z-score the vector and truncate the scores to +/-2.5.
    Stage 2 (always performed): the values under the limit are z-scored again among themselves and truncated
    to +/-2.0, the stage 1 truncated values keep +/-2.5, and the combined vector is demeaned.

    Parameters:
        ser_factor: pd.Series of raw factor values (NaN values are ignored in the statistics and stay NaN).

    Returns:
        Tuple (ser_score, flo_std_1, flo_mean_1, flo_std_2, flo_mean_2): the scores indexed like ser_factor,
        followed by the standard deviation / mean of stage 1 and of stage 2. The input is not modified.
    """
    ### Limits definition:
    flo_trunc_limit_1 = 2.5
    flo_trunc_limit_2 = 2.0
    ### Preliminary statistics calculation:
    flo_std_1 = np.nanstd(ser_factor, axis = 0, ddof = 1)
    flo_mean_1 = np.nanmean(ser_factor, axis = 0)
    ### Constant vector checking:
    if np.isclose(flo_std_1, 0.0):
        ser_score = ser_factor - ser_factor
    else:
        ### First z-scoring:
        ser_score = (ser_factor - flo_mean_1) / flo_std_1
    ### First winsorization step (NaN values are left untouched by clipping):
    ser_score = ser_score.clip(lower = -flo_trunc_limit_1, upper = flo_trunc_limit_1)
    ### First limit presence marker:
    ser_on_limit = (ser_score.abs() == flo_trunc_limit_1)
    ### Separating truncated values (everything else, NaN included, is zeroed in this vector):
    ser_score_trunc_1 = ser_score.where(ser_on_limit, 0.0)
    ### Dropping truncated values to perform further transformations with under the limit values only
    ### (np.nan semantics; the np.NaN alias does not exist in NumPy 2.0 and later):
    ser_score = ser_score.mask(ser_on_limit)
    ### Repeated statistics calculation:
    flo_std_2 = np.nanstd(ser_score, axis = 0, ddof = 1)
    flo_mean_2 = np.nanmean(ser_score, axis = 0)
    ### Constant vector checking:
    if np.isclose(flo_std_2, 0.0):
        ser_score = ser_score - ser_score
    else:
        ### Second z-scoring:
        ser_score = (ser_score - flo_mean_2) / flo_std_2
    ### Second winsorization step (truncated positions are still NaN here and are not affected):
    ser_score = ser_score.clip(lower = -flo_trunc_limit_2, upper = flo_trunc_limit_2)
    ### Vectors union: truncated positions are zeroed and then receive their stage 1 values:
    ser_score = ser_score.mask(ser_on_limit, 0.0) + ser_score_trunc_1
    ### Final demean:
    ser_score = ser_score - np.nanmean(ser_score, axis = 0)
    ### Results output:
    return ser_score, flo_std_1, flo_mean_1, flo_std_2, flo_mean_2

In [12]:
### DEFINING PRELIMINARY DATA EXTRACTION (SHOULKD BE SUBSTITUTED BY SQL QUERY)

def get_history_window(iter_date, ser_source_raw, int_extended_win):
    """
    Cut the history window that ends at the given date out of the source vector.

    Parameters:
        iter_date: last date of the window;
        ser_source_raw: source pd.Series whose first index level holds dates;
        int_extended_win: window length in business days.

    Returns:
        Part of ser_source_raw between max(iter_date - int_extended_win business days, 1992-01-01) and iter_date.
    """
    ### Earliest date the source vectors may start from:
    date_floor = pd.to_datetime('1992-01-01')
    ### Window start, not earlier than the floor date:
    date_window_back = iter_date - pd.tseries.offsets.BDay(int_extended_win)
    date_start_loc = np.maximum(date_window_back, date_floor)
    ### Datasource for particular date (should be substituted by SQL query):
    return ser_source_raw.loc[date_start_loc : iter_date, All]

In [13]:
### DEFINING FILTERING DATE INTERVAL, REINDEXING FILTERED VECTOR TO BUSINESS DATES/MONTHS FREQUENCY AND FILLING DATA (PART OF THE PRODUCT CODE)

def get_country_interval(ser_filtered, date_start, date_end, int_fill_limit = 1):
    """
    Reindex a single country vector to business-daily frequency over a date interval with limited forward filling.

    Parameters:
        ser_filtered: pd.Series for one country with a 'Country' index level and a datetime level;
        date_start, date_end: bounds of the resulting business-day index;
        int_fill_limit: maximal number of consecutive missing values to forward fill.

    Returns:
        pd.Series indexed by business days from date_start to date_end (index named 'Date').
    """
    ### Drop level to avoid stack/unstack manipulations:
    ser_filtered = ser_filtered.droplevel('Country')
    ### Business day filter:
    idx_date_business = pd.date_range(start = date_start, end = date_end, freq = 'B')
    try:
        ### Frequency checker:
        date_first = ser_filtered.first_valid_index()
        date_last = ser_filtered.last_valid_index()
        ### Resampling to business month when valid observations are on average more than 3 calendar days apart
        ### (BMonthEnd offset object is used instead of the 'BM' string alias deprecated in recent pandas versions):
        if ((date_last - date_first).days / len(ser_filtered.dropna().index) > 3.0):
            ser_filtered = ser_filtered.resample('MS').last().resample(pd.tseries.offsets.BMonthEnd()).last()
    except (TypeError, ZeroDivisionError):
        ### Vector without valid observations (first / last valid index is None): frequency check is skipped on purpose
        pass
    ### Reindexation and forward filling (Series.ffill replaces the deprecated fillna(method = 'ffill') call):
    ser_reindexed = ser_filtered.resample('B').ffill().ffill(limit = int_fill_limit).reindex(idx_date_business).ffill(limit = int_fill_limit)
    ### Results output:
    ser_reindexed.index.names = ['Date']
    return ser_reindexed

In [14]:
### DEFINING FILTERING DATE INTERVAL, REINDEXING FILTERED VECTOR TO BUSINESS DATES/MONTHS FREQUENCY AND FILLING DATA (PART OF THE PRODUCT CODE)

def get_monthly_interval(ser_filtered, date_start, date_end, int_fill_limit = 1):
    """
    Reindex a single country vector to business-month-end frequency over a date interval with limited forward filling.

    Parameters:
        ser_filtered: pd.Series for one country with a 'Country' index level and a datetime level;
        date_start, date_end: bounds of the resulting business-month-end index;
        int_fill_limit: maximal number of consecutive missing month ends to forward fill.

    Returns:
        pd.Series indexed by business month ends from date_start to date_end (index named 'Date').
    """
    ### Drop level to avoid stack/unstack manipulations:
    ser_filtered = ser_filtered.droplevel('Country')
    ### Business-month-end offset object (used instead of the 'BM' string alias deprecated in recent pandas versions):
    offset_bus_month_end = pd.tseries.offsets.BMonthEnd()
    ### Business month end filter:
    idx_date_business = pd.date_range(start = date_start, end = date_end, freq = offset_bus_month_end)
    try:
        ### Resampling to business month (via month start, so the last observation of each calendar month is taken):
        ser_filtered = ser_filtered.resample('MS').last().resample(offset_bus_month_end).last()
    except TypeError:
        ### Vector that can not be resampled (index is not datetime-like) is reindexed as is
        pass
    ### Reindexation and forward filling:
    ser_reindexed = ser_filtered.reindex(idx_date_business).ffill(limit = int_fill_limit)
    ### Results output:
    ser_reindexed.index.names = ['Date']
    return ser_reindexed

In [15]:
### DEFINING MEAN MOMENTUM FUNCTION (PART OF THE PRODUCT CODE)

def mean_momentum(ser_country_source, list_weight, int_mean_min):
    """
    Weighted mean of a country vector, where the most recent observations get the trailing weights.

    Parameters:
        ser_country_source: pd.Series of observations for one country;
        list_weight: list of weights, the last len(ser_country_source) of which are aligned with the observations;
        int_mean_min: the result is NaN unless the number of non-NaN observations is strictly greater than this.

    Returns:
        Weighted mean (see weighted_average), or NaN for an empty vector or when weights lookup raises KeyError.
    """
    int_length = len(ser_country_source.index)
    ### Empty vector: list_weight[-0 : ] would return the whole list and break the series construction
    if (int_length == 0):
        return np.nan
    try:
        ### Weight setting
        ser_weight = pd.Series(list_weight[-int_length : ], ser_country_source.index)
        ### Weighted mean calculation:
        return weighted_average(ser_country_source, ser_weight, int_mean_min)
    except KeyError:
        return np.nan

In [16]:
def get_level_factor(ser_source_adopted):
    """
    Level factor: the last non-empty value of the source vector for each country.

    Parameters:
        ser_source_adopted: pd.Series with a 'Country' index level.

    Returns:
        pd.Series indexed by country (countries without any non-NaN value are absent).
    """
    ### Picking the final element of a country group:
    def last_observation(ser_group):
        return ser_group.values[-1]
    ### Empty values are removed first, so the final element of each group is its last known value:
    ser_known = ser_source_adopted.dropna()
    ### Results output:
    return ser_known.groupby('Country').apply(last_observation)

In [17]:
### DEFINING CHANGE FACTOR FACTOR CREATING FUNCTION (PART OF THE PRODUCT CODE)

def get_change_factor(ser_source_adopted, int_mom_win, int_mom_min, int_mom_hl, iter_date = None):
    """
    Change (momentum) factor: exponentially weighted mean of relative daily changes for each country.

    Parameters:
        ser_source_adopted: source pd.Series with a 'Country' index level and a datetime level;
        int_mom_win: rolling window length in business days;
        int_mom_min: the factor is NaN unless a country has strictly more non-NaN changes than this;
        int_mom_hl: half-life of the exponential weights;
        iter_date: last date of the window. When omitted, the module-level variable iter_date is used
            (legacy behaviour of this function, kept for backward compatibility).

    Returns:
        pd.Series of factor values indexed by country.

    Raises:
        NameError: when iter_date is neither passed nor defined at module level.
    """
    ### Calculation date: explicit argument is preferred, module-level variable is the legacy fallback:
    if iter_date is None:
        try:
            iter_date = globals()['iter_date']
        except KeyError:
            raise NameError('iter_date is not defined: pass it to get_change_factor explicitly')
    ### Forward filling limitation:
    int_fill_limit = 66
    ### Start date for source vectors:
    date_source_start = pd.to_datetime('1991-12-31')
    ### Start date for window defining:
    date_start_win = np.maximum(iter_date - pd.tseries.offsets.BDay(int_mom_win - 1), date_source_start)
    ### Weights array (reversed, so the biggest weight is the last one):
    list_weight = [exp_weight_single(int_mom_hl, iter_num) for iter_num in range(int_mom_win)][::-1]
    ### Data source resampling:
    ser_data = ser_source_adopted.groupby('Country').apply(get_country_interval, date_start_win, iter_date, int_fill_limit).swaplevel().sort_index()
    ### Source performing (relative change to the previous value of the same country):
    ser_delta = ser_data.groupby('Country').diff() / ser_data.groupby('Country').shift()
    ### Infinite changes (division by zero) are treated as missing (np.nan: the np.NaN alias does not exist in NumPy 2.0 and later):
    ser_delta = ser_delta.replace([np.inf, -np.inf], np.nan)
    ser_delta.index.names = ['Date', 'Country']
    ### Momentum factor calculation:
    ser_factor = ser_delta.groupby('Country').apply(mean_momentum, list_weight, int_mom_min)
    ### Results output:
    return ser_factor

In [18]:
### DEFINING CHANGE FACTOR FACTOR CREATING FUNCTION (PART OF THE PRODUCT CODE)

def get_change_factor_only(ser_data, list_weight, int_mom_min):
    """
    Apply mean_momentum to each country of an already prepared data vector.

    Parameters:
        ser_data: pd.Series with a 'Country' index level;
        list_weight: list of weights passed to mean_momentum;
        int_mom_min: minimal observations threshold passed to mean_momentum.

    Returns:
        pd.Series of factor values indexed by country.
    """
    ### Countries are processed independently:
    grouped_by_country = ser_data.groupby('Country')
    ### Momentum factor calculation and results output:
    return grouped_by_country.apply(mean_momentum, list_weight, int_mom_min)

In [19]:
def std_recover_demean_std(ser_factor_raw):
    """
    Standardize -> recover -> demean by market -> standardize again.

    The raw vector is two-stage standardized, the clipped scores are converted back with the statistics of both
    stages, the recovered values are demeaned within each market, and the result is two-stage standardized again.

    Parameters:
        ser_factor_raw: pd.Series of raw factor values with a 'Market' index level.

    Returns:
        pd.Series of standardized values; a vector without any non-NaN value is returned as is.
    """
    ### If factor is all NaN:
    if not (ser_factor_raw.count() > 0):
        return ser_factor_raw
    ### Outliers clipping through the two step standardizing and recovering:
    (ser_factor_raw_std_prelim, flo_std_1, flo_mean_1, flo_std_2, flo_mean_2) = td_two_stage_standardize_extended(ser_factor_raw)
    ser_factor_rec = (ser_factor_raw_std_prelim * flo_std_2 + flo_mean_2) * flo_std_1 + flo_mean_1
    ### By market demeaning for clipped data vector (transform keeps the original index in every pandas version,
    ### while groupby.apply prepends the group keys to the index starting from pandas 2.0):
    ser_factor_rec = ser_factor_rec - ser_factor_rec.groupby('Market').transform('mean')
    ### Winsorized & demeaned by market data vector standardizing:
    ser_factor_std = td_two_stage_standardize_extended(ser_factor_rec)[0]
    ### Results output:
    return ser_factor_std

In [32]:
### DATA LOADING (TO BE IGNORED IN PRODUCT CODE)

### Business-daily market membership of every country up to the end of the measures period:
ser_ison_daily = ison_membership_converting(str_path_universe, datetime.strptime(str_measure_date_end, '%Y-%m-%d'), bool_daily = True) ### ISON universe, bus-daily vector
### Raw source vectors, keyed by source name ('PRS', 'Continuum'); the test loop iterates over these keys:
dict_source_raw = {}
### PRS pillars table to use as a factor data source:
dict_source_raw['PRS'] = pd.read_hdf(str_path_prs_hdf, key = str_key_prs_pillars_only_converted)\
                           .loc[['Economic Risk Rating', 'Financial Risk Rating', 'Political Risk Rating'], All, All]
### First index level is renamed, so both sources expose the same 'Pillar' level name:
dict_source_raw['PRS'].index.set_names('Pillar', level = 'Variable', inplace = True)
### Continuum pillars table to use as a factor data source:
dict_source_raw['Continuum'] = pd.read_hdf(str_path_continuum_hdf, key = str_key_continuum_politics_converted)\
                                 .loc[['External Adjustment Capacity', 'Institutional Robustness', 'Medium-Term Growth Potential', 'Social Inclusion'], All, All]
dict_source_raw['Continuum'].index.set_names('Pillar', level = 'Indicator', inplace = True)
### Weights loading (three index columns; the test loop addresses them as source / pillar / market):
ser_weights = pd.read_excel(io = str_path_weights_xlsx, sheet_name = 0, header = 0, index_col = [0, 1, 2],
                                 na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', 
                                             '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null'], keep_default_na = False).astype(float).squeeze().sort_index()

In [21]:
### TESTING: PERFORMING FACTOR FOR DATE RANGE (TO BE IGNORED IN PRODUCT CODE)

### Removing csv files before loop running:
### (the loop appends to these files and uses their existence to decide whether a header has to be written)
if (os.path.exists(str_test_autocorr_csv)):
    os.remove(str_test_autocorr_csv)   
if (os.path.exists(str_test_factor_full_csv)):
    os.remove(str_test_factor_full_csv) 
if (os.path.exists(str_test_factor_source_csv)):
    os.remove(str_test_factor_source_csv)     
if (os.path.exists(str_test_factor_agg_csv)):
    os.remove(str_test_factor_agg_csv)
### THA-calculation constants:
flo_similarity = 5 * (10 ** (-6))   ### Autocorrelations not lower than (1 - flo_similarity) are treated as repeated values
flo_tha_ratio = 0.9 ### THA progression ratio
int_tha_length = 24 ### THA horizon length  
### Change factor parameters:
### NOTE(review): the values after '#' look like the business-daily equivalents of the monthly values in use - confirm
int_extended_win = 260 * 6 ### Extended window length, business days
int_fill_limit = 3 # 66 ### Forward filling limitation
int_mom_win = 60 # 1300 ### Rolling window length
int_mom_hl_short = 3 # 66 ### Half-life and Minimal rolling window length for short factor
int_mom_hl_long = 24 # 520 ### Half-life and Minimal rolling window length for long factor
date_source_start = pd.to_datetime('1992-01-01') ### Start date for source vectors
### Exponential weights in reversed order, so the last (most recent) element gets weight 1.0:
list_weight_short = list(map(lambda iter_num: exp_weight_single(int_mom_hl_short, iter_num), range(int_mom_win)))[::-1] ### Weights for short change
list_weight_long = list(map(lambda iter_num: exp_weight_single(int_mom_hl_long, iter_num), range(int_mom_win)))[::-1] ### Weights for long change
### Local testing parameters:
int_interval = 10 ### Interval of progress displaying
date_start = datetime.utcnow() ### Start time of calculations
date_control = datetime.utcnow() ### Control time to display
### Dates to process: only the first month end is active, alternative ranges are kept after '#':
idx_test_date_range = idx_test_monthly_date_range[0 : 1] # idx_test_monthly_date_range # idx_test_monthly_date_range[0 : 10] # idx_test_monthly_date_range[0 : 60] # 
#idx_test_date_range = idx_test_daily_date_range[0: 100]
### Test performing:
print('Start time:', date_start)
for iter_num, iter_date in enumerate(idx_test_date_range):
    ### Progress printing:
    if not (divmod(iter_num, int_interval)[1]):
        if iter_num:
            print('Counter marker:', iter_num, '/', len(idx_test_date_range))
            timedelta_interval = datetime.utcnow() - date_control
            print('Time interval since last marker:', datetime.utcnow() - date_control)            
            print('Average interval for single date:', str(timedelta_interval / int_interval))
        date_control = datetime.utcnow()
    ### ISON daily vector loading:
    ser_ison_iter_date = ser_ison_daily.loc[iter_date, All].droplevel('Date')
    ### Start date for window defining:    
    date_start_win = np.maximum(iter_date - pd.tseries.offsets.BMonthEnd(int_mom_win - 1), date_source_start)  
    ### Sources looping:
    for iter_source in dict_source_raw:
        ### Source data portion loading:
        ser_iter_adopted = dict_source_raw[iter_source].groupby('Pillar')\
                                                      .apply(lambda ser_pillar: get_history_window(iter_date, ser_pillar.droplevel('Pillar'), int_extended_win))
        ### Check for not empty source vector:
        if (len(ser_iter_adopted) > 0):
            ### Data source resampling:
            ser_iter_interval = ser_iter_adopted.groupby(['Pillar', 'Country'])\
                                            .apply(lambda ser_country: get_monthly_interval(ser_country.droplevel('Pillar'), date_start_win, iter_date, int_fill_limit))\
                                            .swaplevel().sort_index()
            ### Raw factors calculating:
            ser_iter_level_factor = ser_iter_adopted.groupby('Pillar').apply(lambda ser_pillar: get_level_factor(ser_pillar.droplevel('Pillar')))
            ser_iter_level_factor.name = 'Level'
            ser_iter_long_factor = ser_iter_interval.groupby('Pillar')\
                                                    .apply(lambda ser_pillar: get_change_factor_only(ser_pillar.droplevel('Pillar'), list_weight_long, int_mom_hl_long))
            ser_iter_long_factor = ser_iter_level_factor - ser_iter_long_factor
            ser_iter_long_factor.name = 'Long'            
            ser_iter_short_factor = ser_iter_interval.groupby('Pillar')\
                                                     .apply(lambda ser_pillar: get_change_factor_only(ser_pillar.droplevel('Pillar'), list_weight_short, int_mom_hl_short))
            ser_iter_short_factor = ser_iter_level_factor - ser_iter_short_factor            
            ser_iter_short_factor.name = 'Short' 
            ### Deleting change factors for all Continuum pillars:
            if (iter_source == 'Continuum'):
                ser_iter_long_factor.loc[All] = np.NaN
                ser_iter_short_factor.loc[All] = np.NaN                
            ### Adding region information:
            df_iter_factor_raw = pd.concat([ser_iter_level_factor, ser_iter_short_factor, ser_iter_long_factor], axis = 1).join(ser_ison_iter_date, how = 'left')\
                                                                                                                          .set_index('Market', append = True)
            ### Regions clearing:
            df_iter_factor_raw = df_iter_factor_raw.loc[(All, All, list_ison), All]
            ### Countries filtering:
            df_iter_factor_raw = df_iter_factor_raw.drop(list_countries_to_exclude, level = 'Country')
            ### Preparing for two step standardizing:
            ser_iter_factor_raw = df_iter_factor_raw.stack(dropna = False)
            ser_iter_factor_raw.index.set_names('Factor', level = -1, inplace = True)
            ser_iter_factor_raw = ser_iter_factor_raw.reorder_levels(['Pillar', 'Factor', 'Country', 'Market'])
            ser_iter_factor_raw.name = 'Raw'
            ### Standardize -> Recover -> Demean by Region -> Standardize again:
            ser_iter_factor_std = ser_iter_factor_raw.groupby(['Pillar', 'Factor'])\
                                                     .apply(lambda ser_factor_raw: std_recover_demean_std(ser_factor_raw.droplevel(['Pillar', 'Factor'])))
            ser_iter_factor_std.name = 'Std'
            ### Object to save:
            df_iter_factor_to_save = pd.concat([ser_iter_factor_raw, ser_iter_factor_std], axis = 1)
            df_iter_factor_to_save = pd.concat({iter_source: pd.concat({iter_date: df_iter_factor_to_save}, names = ['Date'])}, names = ['Source'])\
                                       .reorder_levels(['Source', 'Pillar', 'Factor', 'Date', 'Country', 'Market'])         
            ### Saving source factors data to the table:
            df_iter_factor_to_save.to_csv(str_test_factor_full_csv, sep = ';', mode = 'a', header = not os.path.exists(str_test_factor_full_csv))
            ### Autocorrection calculating for business-end-of-month:
            if (iter_date == (iter_date + pd.tseries.offsets.BMonthEnd(0))):
                date_prev = iter_date - pd.tseries.offsets.BMonthEnd(1)
                ### Extract previous tha-factor vector from csv file (should be substituted by SQL query):
                if (os.path.exists(str_test_autocorr_csv)):
                    ser_iter_factor_std_prev = pd.read_csv(str_test_factor_full_csv, sep = ';', index_col = list(range(6)), header = 0, squeeze = True, parse_dates = [3],
                                                           na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', 
                                                                        '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null'], keep_default_na = False)\
                                                 .loc[([iter_source], All, All, [date_prev], All, All), 'Std'].droplevel(['Source', 'Date'])
                else:
                    ser_iter_factor_std_prev = pd.Series(np.NaN, index = ser_iter_factor_std.index)
                ser_iter_factor_std.name = 'Current'
                ser_iter_factor_std_prev.name = 'Previous'    
                ### Autocorrelation calculating:
                df_iter_history = pd.concat([ser_iter_factor_std, ser_iter_factor_std_prev], axis = 1)
                ser_iter_autocorr = df_iter_history.groupby(['Pillar', 'Factor', 'Market'], group_keys = False)\
                                                  .apply(lambda df_market: df_market['Current'].corr(df_market['Previous']))
                ### Autocorrelation coefficients calculating and saving (should be substituted by SQL query):
                ser_iter_autocorr_dated = pd.concat({iter_source: pd.concat({iter_date: ser_iter_autocorr}, names = ['Date'])}, names = ['Source'])\
                                            .reorder_levels(['Source', 'Pillar', 'Factor', 'Date', 'Market'])            
                ser_iter_autocorr_dated.name = 'Autocorr'
                ser_iter_autocorr_dated.to_csv(str_test_autocorr_csv, sep = ';', mode = 'a', header = not os.path.exists(str_test_autocorr_csv))
            ### Extract autocorr data (should be substituted by SQL query):
            ser_autocorr_vector = pd.read_csv(str_test_autocorr_csv, sep = ';', index_col = [0, 1, 2, 3, 4], header = 0, squeeze = True, parse_dates = [3],
                                              na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', 
                                                           '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null'], keep_default_na = False)\
                                       .loc[[iter_source], All, All, All, All].droplevel('Source')
            ### First month-end check:            
            if (ser_autocorr_vector.count() > 0):
                ### Excluding ones (repeated values):
                ser_autocorr_vector.loc[ser_autocorr_vector >= (1.0 - flo_similarity)] = np.NaN                
                ### Autocorrelation mean:
                ser_autocorr_cum_mean = ser_autocorr_vector.groupby(['Pillar', 'Factor', 'Market'], group_keys = False).mean().clip(lower = 0.0)
                ### Quarterly mean converting to monthly:
                if (iter_source == 'Continuum'):
                    ser_autocorr_cum_mean = ser_autocorr_cum_mean ** (1 / 3)
                ### THA-coefficient calculating:                       
                ser_tha_coeff = ser_autocorr_cum_mean\
                                        .transform(lambda iter_mean: sum(map(lambda iter_num: (flo_tha_ratio * iter_mean) ** iter_num, range(int_tha_length))) / 2)
            else:
                ser_tha_coeff = ser_autocorr_vector.droplevel('Date')
            ### Adding empty Markets to further fillna and fillna (first date of new region appearance):
            ser_tha_coeff = ser_tha_coeff.unstack(['Pillar', 'Factor']).reindex(['DM', 'EM', 'FM']).stack(['Pillar', 'Factor'], dropna = False)\
                                         .reorder_levels(['Pillar', 'Factor', 'Market']).fillna(2.0).sort_index()
            ser_tha_coeff.name = 'THA_Coeff'
            ### THA-adjusted z-score calculating:
            ser_iter_factor_std.name = 'Factor'
            df_iter_factor_tha = ser_iter_factor_std.to_frame().join(ser_tha_coeff).sort_index()
            ser_iter_factor_tha = df_iter_factor_tha['Factor'] * df_iter_factor_tha['THA_Coeff']
            ### Consolidated factor for each pillar of the source:
            ser_iter_factor_agg = ser_iter_factor_tha.unstack('Factor').groupby('Pillar', group_keys = False).apply(columns_average)
            ### Source level consolidating and saving:
            if (len(ser_iter_factor_agg.index.get_level_values('Market').unique()) > 1):
                ser_iter_source = ser_iter_factor_agg.unstack('Pillar').groupby('Market', group_keys = False)\
                                                     .apply(lambda df_region: columns_average(df_region, ser_weights.loc[iter_source, All, df_region.index[0][0]].values))
            ### If we have single region for particular date:
            else:
                df_region = ser_iter_factor_agg.unstack('Pillar') 
                ser_iter_source = columns_average(df_region, ser_weights.loc[iter_source, All, df_region.index[0][0]].values)
            ### Source level saving:    
            ser_iter_source_dated = pd.concat({iter_source: pd.concat({iter_date: ser_iter_source.sort_index(level = 'Country')}, names = ['Date'])}, names = ['Source'])\
                                      .reorder_levels(['Source', 'Date', 'Country', 'Market'])
            ser_iter_source_dated.name = 'Source_Factor'
            ser_iter_source_dated.to_csv(str_test_factor_source_csv, sep = ';', mode = 'a', header = not os.path.exists(str_test_factor_source_csv))
    ### Source factors aggregating, z-scoring and saving:
    df_iter_full = pd.read_csv(str_test_factor_source_csv, sep = ';', index_col = [0, 1, 2, 3], header = 0, squeeze = True, parse_dates = [1],
                           na_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', 
                                        '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null'], keep_default_na = False)\
                                       .loc[All, [iter_date], All, All].unstack('Source')
    ser_iter_full = columns_average(df_iter_full).groupby('Market').apply(lambda df_market: td_two_stage_standardize_extended(df_market)[0])
    ser_iter_full.name = 'Consolidated_Factor'
    ser_iter_full.to_csv(str_test_factor_agg_csv, sep = ';', mode = 'a', header = not os.path.exists(str_test_factor_agg_csv))
    
### Run timing summary (date_start and idx_test_date_range are set earlier in the notebook):
date_finish = datetime.utcnow()
td_full_interval = date_finish - date_start
print('Finish time:', date_finish)
print('Full interval:', td_full_interval)
### Elapsed time divided by the number of dates in the test range:
print('Average interval for single date:', td_full_interval / len(idx_test_date_range))

Start time: 2021-05-05 10:34:58.981387
Finish time: 2021-05-05 10:35:01.736368
Full interval: 0:00:02.754981
Average interval for single date: 0:00:02.754981


In [30]:
### TEMP
### Scratch check: relies on ser_iter_factor_tha left over from the last iteration of the loop cell above.
### Pillars are moved to columns, then columns_average is applied separately within each Factor group.

ser_iter_factor_agg_test = (
    ser_iter_factor_tha
    .unstack('Pillar')
    .groupby('Factor', group_keys = False)
    .apply(columns_average)
)
ser_iter_factor_agg_test

Factor  Market  Country
Level   DM      AT         1.479758
                AU        -1.114699
                BE        -0.390645
                CA         0.227383
                CH         3.106888
                             ...   
Short   DM      NO         0.070196
                NZ         0.354459
                SE         0.791418
                SG         0.849237
                US         0.218713
Length: 63, dtype: float64

In [33]:
### TEMP
### Scratch check: displaying the weights series (indexed by Source / Pillar / Region in the output below).
### The source-level consolidation step slices it as ser_weights.loc[iter_source, All, <market>].

ser_weights

Source     Pillar                        Region
Continuum  External Adjustment Capacity  DM        3.0
                                         EM        3.0
                                         FM        3.0
           Institutional Robustness      DM        1.0
                                         EM        1.0
                                         FM        1.0
           Medium-Term Growth Potential  DM        1.0
                                         EM        1.0
                                         FM        1.0
           Social Inclusion              DM        1.0
                                         EM        1.0
                                         FM        1.0
PRS        Economic Risk Rating          DM        1.0
                                         EM        1.0
                                         FM        1.0
           Financial Risk Rating         DM        1.0
                                         EM        1.0
                 